def applyAggregationModel(testReviews, featureAvgSent, model, busImportantFeatures, userImportantFeatures):
    logger = logging.getLogger('signature.aAM.applyAggregationModel')
    logger.info('starting applyAggregationModel from %d reviews'%len(testReviews))
    fsw = featureStructureWorker()
    
    for r, review in enumerate(testReviews):
        reviewFeatures = review['predSentiments']
        #features = encodeAspects2features(fsw, reviewFeatures)
        features = encodeAspects1features(fsw, reviewFeatures, featureAvgSent)  # note: overwritten below by the business-profile encoding
        #aggregation = model.predict(features)
        #print aggregation, review['stars']
        
        predictedFeatures = review['exPredFeatures']  # per aspect: [real existence, predicted existence, ...]
        
        
        #Predicted Features, Predicted Sentiments by BUSINESS
        busID = review['business_id']
        if busID in busImportantFeatures:
            busSents = busImportantFeatures[busID]['sentiment']
        else:
            busSents = {}
        testData = {a:busSents[a][0] for a in busSents 
                    if a in predictedFeatures and predictedFeatures[a][1] == 1 and busSents[a][1] > 1}
        features = encodeAspects1features(fsw, testData, featureAvgSent)
        aggregationBUS = model.predict(features)
        
        review['rating_prediction'] = review.get('rating_prediction', {})
        review['rating_prediction']['aggregBUSavg'] = aggregationBUS
        
        
        if not r%10000:
            logger.debug('%d reviews processed'%r)
    
    return testReviews
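The encoder encodeAspects1features is defined elsewhere in this project and is not shown on this page. A minimal sketch of what such an encoder plausibly does, assuming it maps the sparse {aspect: sentiment} dict onto a fixed-order vector and backs off to the corpus-average sentiment (featureAvgSent) for unmentioned aspects:

# Hypothetical sketch only -- the real encodeAspects1features may differ.
def encode_aspects_sketch(fsw, review_sentiments, feature_avg_sent):
    vector = []
    for aspect in sorted(fsw.featureIdicator):
        if not fsw.featureIdicator[aspect]:
            continue  # aspect disabled in the indicator
        # the review's own sentiment if present, else the corpus average
        vector.append(review_sentiments.get(aspect, feature_avg_sent[aspect]))
    return vector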
Example #2
def __init__(self):
    self.logger = logging.getLogger('signature.aspectDependence')
    self.logger.info('aspectDependence created')
    
    self.fsw = featureStructureWorker()
    
    self.aspectList = [x for x in self.fsw.featureIdicator if self.fsw.featureIdicator[x]]
    self.aspectList.sort()
    
    self.aspectStat = dict()
    for i,x in enumerate(self.aspectList):
        self.aspectStat[x] = pd.DataFrame(np.zeros((4,5)),
                                          index=['n','-1','0','1'],
                                          columns=[1,2,3,4,5])
    
    self.aspectPairStat = dict()
    for i,x in enumerate(self.aspectList):
        for y in self.aspectList[i+1:]:
            self.aspectPairStat[(x,y)] = pd.DataFrame(np.zeros((4,4)),
                                                      index=['n','-1','0','1'],
                                                      columns=['n','-1','0','1'])
    
    #resulting aspect-stars stat
    self.aspectStars = dict()
    #resulting dependence Stat
    self.resultingStat = dict()
Example #3
def predictAll(path, modelfile):
    logger = logging.getLogger('signature.pairCompare')
    logger.info('starting pairCompare')
    #get data
    b_file = path+'/businessProfile.json'
    u_file = path+'/userProfile.json'
    r_file = path+'/specific_reviews_test.json'
    
    fsw = featureStructureWorker()
    
    #load model
    modelDict = pickle.load(open(modelfile,'rb'))
    logger.info('Model loaded from %s'%modelfile)
    
    
    busImportantFeatures = json.loads(open(b_file,'r').readline())
    logger.info('Important BUSINESS Features loaded')
    userImportantFeatures = json.loads(open(u_file,'r').readline())
    logger.info('Important USER Features loaded')
    testReviewsByUser = dict()
    for counter, line in enumerate(open(r_file,'r')):
        if not counter%1000:
            logger.debug('%d reviews loaded'%counter)
        
        review = json.loads(line.strip())
        userID = review['user_id']
        
        for aspect in modelDict:
            if not fsw.featureIdicator[aspect]:
                continue
            
            featureSet = calculateFeatures(logger, review, aspect, busImportantFeatures, userImportantFeatures)
            if not featureSet:
                continue
            
            review['pairComp'] = review.get('pairComp', {})
            predProb = modelDict[aspect][1].predict_proba(np.array([featureSet]))[0][1]
            
            if predProb > 0.5:
                predSent = modelDict[aspect][3].predict_proba(np.array([featureSet]))[0][1]
                
                review['pairComp'][aspect] = predSent
            
            #print(review['pairComp'])
            
        testReviewsByUser[userID] = testReviewsByUser.get(userID, [])
        testReviewsByUser[userID].append(review)
    
    logger.info('Reviews loaded')
    
    
    
    
    #save result
    outfile = open(path+'/test_predictions.json','w')
    for user in testReviewsByUser:
        outfile.write(json.dumps(testReviewsByUser[user])+'\n')
    outfile.close()
Example #4
def learnTopicModel(infileName, dictFile, modelFile, descriptionFile, topic_num = 10):
    logger = logging.getLogger('signature.learnTopicModel')
    logger.info('starting learnTopicModel from %s'%infileName)
    fsw = featureStructureWorker()
    
    texts = list()
    #build corpus
    review_file = open(infileName,"r")
    for counter, line in enumerate(review_file):
        if not counter%10000:
            logger.debug('%d reviews loaded'%counter)
            
        # load review information
        review = json.loads(line.strip())
        reviewFeatures = fsw.getReviewFeaturesExistence(review['features'])
        
        text_plus = list()
        text_minus = list()
        
        for aspect in reviewFeatures:
            sent = np.average(reviewFeatures[aspect])
            if sent > 0:
                text_plus.append(aspect)
            elif sent < 0:
                text_minus.append(aspect)
        
        if len(text_plus):
            texts.append(text_plus)
        if len(text_minus):
            texts.append(text_minus)
        
    
    
    #build Dictionary
    dictionary = corpora.Dictionary(texts)
    dictionary.filter_extremes(no_below=100, no_above=0.8)
    logger.info(dictionary)

    corpus_int = [dictionary.doc2bow(text) for text in texts]
    
    logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',datefmt='%m-%d,%H:%M:%S', level=logging.INFO)
    lda_model = models.ldamodel.LdaModel(corpus=corpus_int, id2word=dictionary, num_topics=topic_num, update_every=1, chunksize=10000, passes=30)
    lda_model.print_topics(20)
    
    output = open(descriptionFile,"w")
    for i, topic in enumerate(lda_model.show_topics(num_topics=100, num_words=15, log=False, formatted=True)):
        #print str(i)+"\t"+topic.encode("utf8")
        try:
            output.write(str(i)+"\t"+topic.decode('utf8', 'ignore')+"\n\n")
        except:
            try:
                output.write(str(i)+"\t"+topic[:30].decode('utf8', 'ignore')+"\n\n")
            except:
                output.write(str(i)+"\t"+"\n\n")
    output.close()
    
    dictionary.save(dictFile)
    lda_model.save(modelFile)
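A hedged usage sketch (the file names below are illustrative, not taken from this project): training the topic model, then reloading its artifacts with the same gensim calls that applyTopicModel below relies on.

# Illustrative paths; substitute the project's real files.
learnTopicModel('yelp_reviews_features_train.json',
                'modelLDA/dictionary_10.lda',
                'modelLDA/model_10.lda',
                'modelLDA/topics_10.txt',
                topic_num=10)

# Reloading the saved artifacts uses the standard gensim API:
from gensim import corpora, models
dictionary = corpora.Dictionary.load('modelLDA/dictionary_10.lda')
lda_model = models.ldamodel.LdaModel.load('modelLDA/model_10.lda')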
Example #5
def learnAggregationModelsCV(trainReviews, featureAvgSent, busImportantFeatures, path):
    logger = logging.getLogger("signature.lAMCV.learnAggregationModelsCV")
    logger.info("starting learnAggregationModel from %d reviews" % len(trainReviews))
    fsw = featureStructureWorker()

    learnData = list()
    learnLabels = list()
    for j, review in enumerate(trainReviews):
        reviewFeatures = fsw.getReviewFeaturesSentiment(review["features"])
        rating = review["stars"]
        for aspect in reviewFeatures:
            if (
                review["business_id"] in busImportantFeatures
                and aspect in busImportantFeatures[review["business_id"]]["sentiment"]
                and busImportantFeatures[review["business_id"]]["sentiment"][aspect][1] > 5
            ):
                sentiment = busImportantFeatures[review["business_id"]]["sentiment"][aspect][0]
                reviewFeatures[aspect] = sentiment
            else:
                reviewFeatures[aspect] = featureAvgSent[aspect]

        # features = encodeAspects2features(fsw, reviewFeatures)
        features = encodeAspects1features(fsw, reviewFeatures, featureAvgSent)
        learnData.append(features)
        learnLabels.append(rating)

    learnData = np.array(learnData)
    learnLabels = np.array(learnLabels)

    bestRes = 0.0
    bestReg = 0.0
    for reg in [0.01, 0.05, 0.1, 0.2, 0.5, 1.0, 5.0, 10, 15, 50, 100, 200, 500]:
        kf = cross_validation.KFold(len(learnLabels), n_folds=10)
        results = list()
        for train_index, test_index in kf:
            X_train, X_test = learnData[train_index], learnData[test_index]
            y_train, y_test = learnLabels[train_index], learnLabels[test_index]
            clf = linear_model.Ridge(alpha=reg)
            clf.fit(X_train, y_train)
            results.append(clf.score(X_test, y_test))
        if np.average(results) > bestRes:
            bestRes = np.average(results)
            bestReg = reg
        # print reg, np.average(results)
    logger.info("Best score %f with regularization = %.2f" % (bestRes, bestReg))

    clf = linear_model.Ridge(alpha=bestReg)
    clf.fit(learnData, learnLabels)

    return clf
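Note that cross_validation is scikit-learn's legacy module, deprecated in 0.18 and removed in 0.20. On a current scikit-learn the same 10-fold split would be written as in this sketch:

# Modern equivalent of cross_validation.KFold.
from sklearn.model_selection import KFold

kf = KFold(n_splits=10)
for train_index, test_index in kf.split(learnData):
    X_train, X_test = learnData[train_index], learnData[test_index]
    y_train, y_test = learnLabels[train_index], learnLabels[test_index]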
Example #6
def matchProfiles(path, limit = np.Inf):
    logger = logging.getLogger('signature.matchProfiles')
    logger.info('starting matchProfiles')
    #get data
    b_file = path+'businessFeaturesAggregation_train.json'
    u_file = path+'userFeaturesAggregation_train.json'
    r_file = path+'yelp_reviews_test_predictions.json'
    
    busImportantFeatures = json.loads(open(b_file,'r').readline())
    logger.info('Important BUSINESS Features loaded')
    userImportantFeatures = json.loads(open(u_file,'r').readline())
    logger.info('Important USER Features loaded')
    fsw = featureStructureWorker()
    
    #load featureWeights
    infile = open(path+'featureWeights.json','r')
    featureWeights = json.loads(infile.readline().strip())
    infile.close()
    
    nums = list()
    reviews = list()
    for counter, line in enumerate(open(r_file,'r')):
        if not counter%1000:
            logger.debug('%d reviews loaded'%counter)
        if counter > limit:
            break
        review = json.loads(line.strip())
        userID = review['user_id']
        busID = review['business_id']
        if busID in busImportantFeatures and userID in userImportantFeatures:
            num, score = matchBUprofiles(fsw, featureWeights,
                                              busImportantFeatures[busID], 
                                              userImportantFeatures[userID])
            nums.append(num)
        else:
            score = -1000000
        
        review['rating_prediction'] = review.get('rating_prediction', {})
        review['rating_prediction']['match'] = score
        
        reviews.append(review)
        
    outfile = open(path+'yelp_reviews_test_predictions.json','w')    
    for review in reviews:
        outfile.write(json.dumps(review).encode('utf8', 'ignore')+'\n')
    outfile.close()
    #print nums
    #print len(nums)
    print 'AVERAGE NUMBER OF FEATURES = %f'%np.average(nums)
Example #7
def featureAggregation(review_list, ignore_neutral = True):
    fsw = featureStructureWorker()
    aggregation_dict = dict()
    for review in review_list:
        reviewFeatures = fsw.getReviewFeatures(review)
        for feature in reviewFeatures:
            aggregation_dict[feature] = aggregation_dict.get(feature,[])
            if ignore_neutral:
                aggregation_dict[feature].append(np.average([x for x in reviewFeatures[feature] if x]))
            else:
                aggregation_dict[feature].append(np.average(reviewFeatures[feature]))
    
    for feature in aggregation_dict:
        aggregation_dict[feature] = [round(np.average(aggregation_dict[feature]),3),
                                     len(aggregation_dict[feature])]
    
    return copy.deepcopy(aggregation_dict)
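Each value in the returned dict is an [average sentiment, mention count] pair. A worked example with made-up numbers:

# Two reviews mention FOOD with per-review average sentiments 1.0 and 0.5:
# aggregation_dict['FOOD'] == [0.75, 2]   # round(np.average([1.0, 0.5]), 3), seen in 2 reviews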
Example #8
def featureStat(infileName, outfileName, limit = 1000):
    #load reviews
    review_file = open(infileName,"r")
    stat = [[],[],[]]
    feature_stat = dict()
    fsw = featureStructureWorker()
    for counter, line in enumerate(review_file):
        if counter > limit:
            break
        review = json.loads(line)
        #features = fsw.getFeatureAverage(review['features'])
        features = fsw.getReviewFeaturesExistence(review['features'])
        stat[0].append(review['ID'])
        stat[1].append(review['business_id'])
        stat[2].append(review['user_id'])
        for feature in features:
            feature_stat[feature] = feature_stat.get(feature,[[],[],[]])
            feature_stat[feature][0].append(review['ID'])
            feature_stat[feature][1].append(review['business_id'])
            feature_stat[feature][2].append(review['user_id'])
        if not counter %1000:
            print '%d reviews processed'%counter
        
        
    out_file = open(outfileName,"w")
    out_file.write('%30s\t%s\t%s\t%s\t%s\t%s\t%s\n'%('FeatureName','rewFreq','busFreq','usFreq',
                                         'rewPer','busPer','userPer'))
    
    result = []
    for i in range(len(stat)):
        stat[i] = float(len(set(stat[i])))/100
     
    for feature in feature_stat:
        for i in range(len(feature_stat[feature])):
            feature_stat[feature][i] = len(set(feature_stat[feature][i]))
        f = feature_stat[feature]
        s = '%30s\t%5d\t%5d\t%5d\t%5.1f\t%5.1f\t%5.1f\n'%(feature,f[0],f[1],f[2],
                                                    f[0]/stat[0],f[1]/stat[1],f[2]/stat[2])
        result.append([f[0]/stat[0],s])
        
    result.sort(reverse=True)
    for r in result:
        out_file.write(r[1])
            
    review_file.close()
    out_file.close()
Example #9
def matchBUprofiles(fsw, featureWeights, busProfile, userProfile):
    
    matchScore = 0.0
    num = 0
    for feature in fsw.featureIdicator:
        if not fsw.featureIdicator[feature]:
            continue
        
        if feature not in userProfile['featureFreq'] and feature not in busProfile['featureFreq']:
            continue
        
        if not (userProfile['featureFreq'].get(feature,0) > 10 and userProfile['sentiment'].get(feature,[0,0])[1] > 1):
            continue
        
        if not (busProfile['featureFreq'].get(feature,0) > 10 and busProfile['sentiment'].get(feature,[0,0])[1] > 5):
            continue
            
        
        
        
        sentiment = busProfile['sentiment'][feature][0]
#        userImp = userProfile['tfidfDict'].get(feature,0.0)
#        busImp = busProfile['tfidfDict'].get(feature,0.0)
        userImp = userProfile['featureFreq'].get(feature,0.0)
        busImp = busProfile['featureFreq'].get(feature,0.0)
        weight = featureWeights[feature]
        coeff = 1.0
        
        if sentiment < weight:
            coeff = 2.0
        
#        if busImp > 80:
#            userImp = max(userImp,busImp)
        
        
        matchScore += userImp*busImp*sentiment*weight*coeff
        num += 1.0
    return num, matchScore
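A worked example of one aspect's contribution to matchScore, with made-up profile numbers:

# Illustrative numbers only.
userImp, busImp = 20.0, 40.0   # featureFreq values from the user/business profiles
sentiment, weight = 0.5, 0.3   # business sentiment for the feature, global feature weight
coeff = 2.0 if sentiment < weight else 1.0   # here 0.5 >= 0.3, so coeff stays 1.0
contribution = userImp * busImp * sentiment * weight * coeff   # 20*40*0.5*0.3*1.0 = 120.0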
Example #10
def learnFeatureExistance(busImportantFeatures, userImportantFeatures, trainReviews, path):
    logger = logging.getLogger('signature.lFE.learnFE')
    logger.info('starting learnFeatureExistance from %d reviews'%len(trainReviews))
    fsw = featureStructureWorker()
    modelDict = dict()
    trainAveragesDict = dict()
    
    
    for f, feature in enumerate(fsw.featureIdicator):
        if not fsw.featureIdicator[feature]:
            continue
        logger.debug('Start working with %s'%feature)
        
        #get data
        X, Y, trainAveragesDict[feature] = getFeatures(logger, feature, trainReviews, busImportantFeatures, userImportantFeatures,
                                          trainAverages = {}, is_train = True)
        logger.debug('Got features for %d reviews'%len(X))
#        #cross validation
#        indicator = range(len(X))
#        random.shuffle(indicator)
#        thres = int(len(indicator)*0.8)
#        trainX = np.array([X[i] for i in indicator[:thres]])
#        trainY = np.array([Y[i] for i in indicator[:thres]])
#        testX = np.array([X[i] for i in indicator[thres:]])
#        testY = np.array([Y[i] for i in indicator[thres:]])
        
        #Logistic Regression
        #bestThres,bestF1,logmodel = getBestLogModel(logger, feature, trainX, trainY, testX, testY, X, Y, path)
        bestThres,bestF1,logmodel = getBestLogModel(logger, feature, X, Y, path)
        #bestThresSVM,bestF1SVM,svmmodel = getBestSVMModel(logger, feature, X, Y, path)
        
#       crossValidation(logger, np.array(X), np.array(Y))
        
        
        modelDict[feature] = [bestThres,bestF1,logmodel]
        
#        print f
#        if f > 6:
#            break
        
    return trainAveragesDict, modelDict
Example #11
def learnTopicExistence(busImportantFeatures, userImportantFeatures, trainReviews, path):
    logger = logging.getLogger('signature.lTE.learnTopicExistence')
    logger.info('starting learnTopicExistence from %d reviews'%len(trainReviews))
    fsw = featureStructureWorker()
    modelDict = dict()
    
    for f, topic in enumerate(fsw.featureIdicator):
        if not fsw.featureIdicator[topic]:
            continue
        logger.debug('Start working with %s'%topic)
        
        #get data
        X, Y = getFeatures(logger, topic, trainReviews, busImportantFeatures, userImportantFeatures,
                                          trainAverages = {}, is_train = True)
        logger.debug('Got features for %d reviews'%len(X))
        
        
        modelDict[topic] = getBestModel(logger, topic, X, Y, path)
    
    #print modelDict
    return modelDict
Example #12
def learnSentimentMatrixFactorization(trainReviews, path):
    logger = logging.getLogger('signature.lSMF.learnSentimentMF')
    logger.info('starting learnSentimentMatrixFactorization from %d reviews'%len(trainReviews))
    fsw = featureStructureWorker()
    modelDict = dict()
    featureThres = dict()
    for i, feature in enumerate(fsw.featureIdicator):
#        if feature != 'STAFF':
#            continue
        if not fsw.featureIdicator[feature]:
            continue
        
        logger.debug('Start working with %s'%feature)
        
        learnData = {'user':[],'item':[],'rating':[]}
        
        for j, review in enumerate(trainReviews):
            reviewFeatures = fsw.getReviewFeaturesSentiment(review['features'])
            if feature not in reviewFeatures:
                continue
            
            busID = review['business_id']
            userID = review['user_id']
            rating = np.average(reviewFeatures[feature])
#            if rating == 0.0:
#                continue
#            rating = 1.0 if rating > 0 else -1.0
            learnData['user'].append(userID)
            learnData['item'].append(busID)
            learnData['rating'].append(rating)
            
            
            
        #CROSS VALIDATION
        data = graphlab.SFrame(learnData)
        featureThres[feature], modelDict[feature] = getBestMFThres(logger, feature, data, path)

    
    return modelDict, featureThres
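graphlab here is the discontinued GraphLab Create package. Its open-source successor, Turi Create, keeps a near-identical API; a sketch, assuming Turi Create is installed and the model path is illustrative:

import turicreate as tc

data = tc.SFrame(learnData)   # same SFrame construction from a dict of columns
# models written with model.save(path) are reloaded with:
model = tc.load_model('sentimentModels/STAFF_sentiment.model')  # illustrative path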
Example #13
def learnAggregationModelsCV(trainReviews, featureAvgSent, path):
    logger = logging.getLogger('signature.lAMCV.learnAggregationModelsCV')
    logger.info('starting learnAggregationModel from %d reviews'%len(trainReviews))
    fsw = featureStructureWorker()
    
    learnData = list()
    learnLabels = list()
    for j, review in enumerate(trainReviews):
        reviewFeatures = fsw.getReviewFeaturesSentiment(review['features'])
        rating = review['stars']
        #features = encodeAspects2features(fsw, reviewFeatures)
        features = encodeAspects1features(fsw, reviewFeatures, featureAvgSent)
        learnData.append(features)
        learnLabels.append(rating)
    
    learnData = np.array(learnData)
    learnLabels = np.array(learnLabels)
    
    bestRes = 0.0
    bestReg = 0.0
    for reg in [0.01,0.05,0.1,0.2,0.5,1.0,5.0,10,15,50,100,200,500]:
        kf = cross_validation.KFold(len(learnLabels), n_folds=10)
        results = list()
        for train_index, test_index in kf:
            X_train, X_test = learnData[train_index], learnData[test_index]
            y_train, y_test = learnLabels[train_index], learnLabels[test_index]
            clf = linear_model.Ridge(alpha = reg)
            clf.fit (X_train, y_train)
            results.append(clf.score(X_test, y_test))
        if np.average(results) > bestRes:
            bestRes = np.average(results)
            bestReg = reg
        #print reg, np.average(results)
    logger.info('Best score %f with regularization = %.2f'%(bestRes, bestReg))
    
    clf = linear_model.Ridge(alpha = bestReg)
    clf.fit(learnData, learnLabels)
    
    return clf
Example #14
def applySMF(path, limit = np.Inf):
    logger = logging.getLogger('signature.applySentimentMF')
    logger.info('starting applySentimentMF')
    #get data
    r_file = path+'specific_reviews_test.json'
    
    testReviews = list()
    for counter, line in enumerate(open(r_file,'r')):
        if not counter%5000:
            logger.debug('%d reviews loaded'%counter)
        if counter > limit:
            break
        testReviews.append(json.loads(line.strip()))
    logger.info('Test Reviews loaded from %s'%r_file)
    
    
    #load model existence
    modelDict_ex = dict()
    featureThres_ex = dict()
    fsw = featureStructureWorker()
    for feature in fsw.featureIdicator:
        if not fsw.featureIdicator[feature]:
            continue
        try:
            modelPath = path + 'sentimentModels/%s_sentiment_ex.model'%feature
            modelDict_ex[feature] = graphlab.load_model(modelPath)
            
            #load average
            thres_path = path+'sentimentModels/%s_sentiment_ex.threshold'%feature
            infile = open(thres_path,'r')
            featureThres_ex[feature] = float(infile.readline().strip())
            infile.close()
        except:
            logger.error('There is no model for feature: %s'%feature)
            continue
        
    logger.info('Existence Models loaded')
    
    
    #load model
    modelDict = dict()
    featureThres = dict()
    fsw = featureStructureWorker()
    for feature in fsw.featureIdicator:
        if not fsw.featureIdicator[feature]:
            continue
        try:
            modelPath = path + 'sentimentModels/%s_sentiment.model'%feature
            print modelPath
            modelDict[feature] = graphlab.load_model(modelPath)
            
            #load average
            thres_path = path+'sentimentModels/%s_sentiment.threshold'%feature
            infile = open(thres_path,'r')
            featureThres[feature] = float(infile.readline().strip())
            infile.close()
        except:
            logger.error('There is no model for feature: %s'%feature)
            continue
        
    logger.info('Sentiment Models loaded')
        
    #run function
    results_ex, results = applySentimentMF(testReviews, modelDict_ex, featureThres_ex, modelDict, featureThres)
    
    #save result
    json.dump(results_ex,open(path+'reviews_test_exMFpred.json','w'))
    json.dump(results,open(path+'reviews_test_MFpred.json','w'))
Example #15
def getFeatures(logger, feature, reviewsSet, busImportantFeatures, userImportantFeatures):
    
    #business_dict, user_dict = loadData(logger)
    
    fsw = featureStructureWorker()
    X1 = list()
    Y1 = list()
    X2 = list()
    Y2 = list()
    
    missed = 0
    
    for review in reviewsSet:
        feature_set = calculateFeatures(logger, review, feature, busImportantFeatures, userImportantFeatures)
        
        
        reviewFeatures = fsw.getReviewFeaturesExistence(review['features'])
#            
#        busID = review['business_id']
#        userID = review['user_id']
#        if busID not in busImportantFeatures or userID not in userImportantFeatures:
#            missed += 1
#            continue
#        
#        bus_tfidf = busImportantFeatures[busID]['tfidfDict'].get(feature,0.0)
#        bus_freq = busImportantFeatures[busID]['featureFreq'].get(feature,0.0)/100.0
#        bus_reviews = busImportantFeatures[busID]['reviewsNumber']
#        bus_sentiment = (busImportantFeatures[review['business_id']]['sentiment'].get(feature,[0.0,0])[0]+1)/2.0
#        
#        user_tfidf = userImportantFeatures[userID]['tfidfDict'].get(feature,0.0)
#        user_freq = userImportantFeatures[userID]['featureFreq'].get(feature,0.0)/100.0
#        user_reviews = userImportantFeatures[userID]['reviewsNumber']
#        user_sentiment = (userImportantFeatures[review['user_id']]['sentiment'].get(feature,[0.0,0])[0]+1)/2.0
#        user_text = userImportantFeatures[userID]['textFeatures']
#        
#        '''CHECK IF WE HAVE ENOUGH INFORMATION'''
#        if bus_reviews > 5 and bus_freq > 0.1 and user_reviews > 5: # 5 1 5
#            feature_set = [bus_tfidf,  bus_freq, bus_sentiment,
#                           user_tfidf, user_freq, user_sentiment]
##            feature_set = [bus_freq]
#            
#            feature_set += getCriticalFeatures(feature, busID, busImportantFeatures)
#            feature_set += getCriticalFeatures(feature, userID, userImportantFeatures)
#            
##            feature_set += user_text
        if feature_set:
            
            if feature in reviewFeatures:
                Y1.append(1)
                X1.append(feature_set)
                
                sent = np.average(reviewFeatures[feature])
                if sent > 0:
                    Y2.append(1)
                    X2.append(feature_set)
                elif sent < 0:
                    Y2.append(0)
                    X2.append(feature_set)
            else:
                Y1.append(0)
                X1.append(feature_set)
        else:
            missed += 1
            
    return X1, Y1, X2, Y2, missed
Example #16
def applyTopicModel(logger, path, topic_num):
    stat_file = path+'yelp_reviews_features_stat.json'
    train_file = path+'yelp_reviews_features_train.json'
    extrain_file = path+'yelp_reviews_features_extrain.json'
    test_file = path+'yelp_reviews_features_test.json'
    
    #load model
    model_path = path+'modelLDA/'
    dictionary = corpora.Dictionary.load(model_path+'dictionary_%d.lda'%topic_num)
    logger.info("Dictionary loaded from: "+ model_path+'dictionary_%d.lda'%topic_num)

    lda_model = models.ldamodel.LdaModel.load(model_path+'model_%d.lda'%topic_num)
    logger.info("Model loaded from:" + model_path+'model_%d.lda'%topic_num)
    
    
    files = [stat_file,train_file,extrain_file,test_file]
    fsw = featureStructureWorker()
    
    for infile in files:
        reviews = list()
        for counter, line in enumerate(open(infile,'r')):
            if not counter%10000:
                logger.debug('%d reviews loaded'%counter)
            #print infile, line
            # load review information
            review = json.loads(line.strip())
            reviews.append(review)
            
#        outfile = open(infile.replace('.json','_old.json'),'w')
#        for review in reviews:
#            outfile.write(json.dumps(review).encode('utf8', 'ignore')+'\n')
#            outfile.close()
        
        
#        outfile = open(infile,'w')
#        outname = infile.replace('.json','_old.json')
        outname = infile
        print outname
        outfile = open(outname,'w')
        for counter, review in enumerate(reviews):
            if not counter%1000:
                logger.debug('%d reviews loaded'%counter)
            
            if 'features_sent' in review:
                reviewFeatures = fsw.getReviewFeaturesExistence(review['features_sent'])
            else:
                reviewFeatures = fsw.getReviewFeaturesExistence(review['features'])
            
            text_plus = list()
            text_minus = list()
            
            for aspect in reviewFeatures:
                sent = np.average(reviewFeatures[aspect])
                if sent > 0:
                    text_plus.append(aspect)
                elif sent < 0:
                    text_minus.append(aspect)
            
            
            topics_plus = lda_model[dictionary.doc2bow(text_plus)]
            topics_minus = lda_model[dictionary.doc2bow(text_minus)]
            
            res = dict()
            if len(topics_plus):
                res['1'] = topTopics(topics_plus)
            #print topics_plus, res['1']
            
            if len(topics_minus):
                res['0'] = topTopics(topics_minus,sign = -1)
            
            if 'features_sent' not in review:
                review['features_sent'] = review['features'].copy()
            review['features'] = res.copy()
            
            
            outfile.write(json.dumps(review).encode('utf8', 'ignore')+'\n')
            
#            if counter > 10:
#                break
        outfile.close()
Example #17
def learnAndApplyMatching(path, limit = np.Inf):
    logger = logging.getLogger('signature.learnAndApplyMatching')
    logger.info('starting learnAndApplyMatching')
    #get data
    b_file = path+'businessFeaturesAggregation_stat.json'
    u_file = path+'userFeaturesAggregation_stat.json'
    train_file = path+'yelp_reviews_features_extrain.json'
    test_file = path+'yelp_reviews_test_predictions.json'
    
    busImportantFeatures = json.loads(open(b_file,'r').readline())
    logger.info('Important BUSINESS Features loaded')
    userImportantFeatures = json.loads(open(u_file,'r').readline())
    logger.info('Important USER Features loaded')
    fsw = featureStructureWorker()
    
#    #load featureWeights
#    infile = open(path+'/featureWeights.json','r')
#    featureWeights = json.loads(infile.readline().strip())
#    infile.close()
    
    #learn model
    learnData = list()
    learnLabels = list()
    for counter, line in enumerate(open(train_file,'r')):
        if not counter%1000:
            logger.debug('%d reviews loaded'%counter)
        if counter > limit:
            break
        review = json.loads(line.strip())
        userID = review['user_id']
        busID = review['business_id']
        
        features = getFeatures(busID, userID, busImportantFeatures, userImportantFeatures, fsw)
        if features:
            learnData.append(features)
            learnLabels.append(review['stars'])
    
    model = learnMatchModel(logger, learnData, learnLabels)
    print model.coef_
    for aspect in fsw.featureIdicator:
        if not fsw.featureIdicator[aspect]:
            continue
        print aspect
#    print model
#    exit()
    
    #apply model
    testReviews = []
    for counter, line in enumerate(open(test_file,'r')):
        if not counter%1000:
            logger.debug('%d reviews loaded'%counter)
        if counter > limit:
            break
        review = json.loads(line.strip())
        testReviews.append(review)
        
    
    outfile = open(path+'yelp_reviews_test_predictions.json','w')
    for counter, review in enumerate(testReviews):
        if not counter%1000:
            logger.debug('%d reviews loaded'%counter)
        userID = review['user_id']
        busID = review['business_id']
        
        test_features = getFeatures(busID, userID, busImportantFeatures, userImportantFeatures, fsw)
        if test_features:
            prediction = model.predict(test_features)
        else:
            prediction = None
        
        review['rating_prediction'] = review.get('rating_prediction', {})
        review['rating_prediction']['match_prediction'] = prediction
        
        outfile.write(json.dumps(review).encode('utf8', 'ignore')+'\n')
    outfile.close()
    
    
Example #18
def computeStatWorker(testReviews, predType, path, modelDict, classes = [0, 1]):
    logger = logging.getLogger('signature.computeStat.cSW')
    logger.info('start computing Statistic from %d reviews for %s'%(len(testReviews), predType))
    fsw = featureStructureWorker()
    
    try:
        os.stat(path+'results/')
    except:
        os.mkdir(path+'results/')
    output = open(path+'results/example_%s_%d.txt'%(predType,classes[1]), 'w')
    
    
    Jaccard = {1:[], 2:[], 3:[], 4:[], 5:[], 6:[]}
    Jaccard_int = {1:[], 2:[], 3:[], 4:[], 5:[], 6:[]}
    
    Jaccard_vector = dict()# thres -> values
    Accuracy_vector = dict()
    
    Precision = {1:[], 2:[], 3:[], 4:[], 5:[], 6:[]}
    Recall    = {1:[], 2:[], 3:[], 4:[], 5:[], 6:[]}
    F1 = {1:[], 2:[], 3:[], 4:[], 5:[], 6:[]}
    
    
    Precision_o = {1:0.0, 2:0.0, 3:0.0, 4:0.0, 5:0.0, 6:0.0}
    Recall_o = {1:0.0, 2:0.0, 3:0.0, 4:0.0, 5:0.0, 6:0.0}
    F1_o = {1:0.0, 2:0.0, 3:0.0, 4:0.0, 5:0.0, 6:0.0}
    
    TP_o = {1:0.0, 2:0.0, 3:0.0, 4:0.0, 5:0.0, 6:0.0}
    FP_o = {1:0.0, 2:0.0, 3:0.0, 4:0.0, 5:0.0, 6:0.0}
    FN_o = {1:0.0, 2:0.0, 3:0.0, 4:0.0, 5:0.0, 6:0.0}
    
    
    
    
#    RMSE = {1:[], 2:[], 3:[], 4:[], 5:[], 6:[]}
#    RMSE_o = {1:[], 2:[], 3:[], 4:[], 5:[], 6:[]}
    
    
    aspectNumAvg = {'good':[], 0:[], 1:[], 2:[], 3:[], 4:[], 5:[], 6:[]}
    y_true = list()
    y_pred_list = [[],[],[],[],[],[]]
    
    for r, review in enumerate(testReviews):
        Jaccard_intersection = {1:0.0, 2:0.0, 3:0.0, 4:0.0, 5:0.0, 6:0.0}
        Jaccard_union        = {1:0.0, 2:0.0, 3:0.0, 4:0.0, 5:0.0, 6:0.0}
        
        Jaccard_vector_review = dict()
        Accuracy_vector_review = dict()
        for thres in np.arange(-0.05,1.05,0.05):
            Jaccard_vector_review[thres] = Jaccard_vector_review.get(thres, {1:[0,0], 2:[0,0], 3:[0,0], 4:[0,0],5:[0,0], 6:[0,0]})
            Accuracy_vector_review[thres] = Accuracy_vector_review.get(thres, {1:[0,0], 2:[0,0], 3:[0,0], 4:[0,0],5:[0,0], 6:[0,0]})
        
        
        TP = {1:0.0, 2:0.0, 3:0.0, 4:0.0, 5:0.0, 6:0.0}
        FP = {1:0.0, 2:0.0, 3:0.0, 4:0.0, 5:0.0, 6:0.0}
        FN = {1:0.0, 2:0.0, 3:0.0, 4:0.0, 5:0.0, 6:0.0}
        
#        RMSE_review = {1:[], 2:[], 3:[], 4:[], 5:[], 6:[]}
        
        aspectNum = {'good':0.0, 0:0.0, 1:0.0, 2:0.0, 3:0.0, 4:0.0, 5:0.0, 6:0.0}
        if predType not in review:
            continue
        aspectNum['good'] = len([aspect for aspect in review[predType] if fsw.featureIdicator[aspect]])
#        print(review['exPredFeatures'])
        for feature in review[predType]:
            if not fsw.featureIdicator[feature]:
                continue
            
            for i in range(0,7):
                if abs(classes[1] - review[predType][feature][i]) < 0.5:
                    aspectNum[i] += 1
                
            
            #for plots
            y_true.append(int(review[predType][feature][0] == classes[1]))
            for i in range(1,7):
                y_pred_list[i-1].append(abs(classes[0] - review[predType][feature][i]))
                
            
            #for computing quality
            for i in range(1,7):
                realClass = review[predType][feature][0]
                if review[predType][feature][i] > 0.5:
                    predictedClass = 1
                else:
                    predictedClass = 0
                
                if realClass == classes[1]:
                    if predictedClass == classes[1]:
                        TP[i] += 1
                        TP_o[i] += 1
                    elif predictedClass == classes[0]:
                        FN[i] += 1
                        FN_o[i] += 1
                elif realClass == classes[0]:
                    if predictedClass == classes[1]:
                        FP[i] += 1
                        FP_o[i] += 1
                
                
                if realClass == classes[1] and predictedClass == classes[1]:
                    Jaccard_intersection[i] += 1
                
                if realClass == classes[1] or  predictedClass == classes[1]:
                    Jaccard_union[i] += 1
                
#                dif = pow(realClass - review[predType][feature][i], 2)
#                RMSE[i].append(dif)
#                RMSE_review[i].append(dif)
            
            '''
            Jaccard_vector
            '''
            for thres in np.arange(-0.05,1.05,0.05):
                for i in range(1,7):
                    if review[predType][feature][i] > thres:
                        predictedClass = 1
                    else:
                        predictedClass = 0
                    
                    if realClass == classes[1] and predictedClass == classes[1]:
                        Jaccard_vector_review[thres][i][0] += 1.0
                    
                    if realClass == classes[1] or  predictedClass == classes[1]:
                        Jaccard_vector_review[thres][i][1] += 1.0
                    
                    Accuracy_vector_review[thres][i][1] += 1.0    
                    if realClass == predictedClass:
                        Accuracy_vector_review[thres][i][0] += 1.0
            
            
        for i in range(1,7):
            if Jaccard_union[i]:
                Jaccard[i].append(Jaccard_intersection[i]/Jaccard_union[i])
                Jaccard_int[i].append(Jaccard_intersection[i])
                if i == 1:
                    if Jaccard[1][-1] > 0.8:
                        if 'sentPredFeatures' in review: 
                            output.write(str(review['sentences'])+'\n--\n'+str(review[predType])+'\n--\n'+str(review['sentPredFeatures'])+'\n====================\n\n')
            
            pre = 0.0
            rec = 0.0
            f1 = 0.0
            
            if (TP[i] + FN[i]):
                if (TP[i] + FP[i]):
                    pre = float(TP[i]) / (TP[i] + FP[i])
                else:
                    pre = 0.0
                rec = float(TP[i]) / (TP[i] + FN[i])
                if pre + rec:
                    f1 = 2 * pre * rec / (pre + rec)
                else:
                    f1 = 0.0
                
                Precision[i].append(pre)
                Recall[i].append(rec)
                F1[i].append(f1)
        
        
        
        '''
        Jaccard_vector
        '''
        for thres in np.arange(-0.05,1.05,0.05):
            Jaccard_vector[thres] = Jaccard_vector.get(thres, {1:[], 2:[], 3:[], 4:[], 5:[], 6:[]})
            Accuracy_vector[thres] = Accuracy_vector.get(thres, {1:[], 2:[], 3:[], 4:[], 5:[], 6:[]})
            
            for i in range(1,7):
                if Jaccard_vector_review[thres][i][1]:
                    Jaccard_vector[thres][i].append(Jaccard_vector_review[thres][i][0]/Jaccard_vector_review[thres][i][1])
                if Accuracy_vector_review[thres][i][1]:
                    Accuracy_vector[thres][i].append(Accuracy_vector_review[thres][i][0]/Accuracy_vector_review[thres][i][1])
        
        #print(aspectNum) 
        for key in aspectNum:
            aspectNumAvg[key].append(aspectNum[key])
        
#        for i in range(1,4):
#            print(i, Jaccard_vector_review[0.5][i][0],Jaccard_vector_review[0.5][i][1],Jaccard_vector_review[0.5][i][0]/Jaccard_vector_review[0.5][i][1],Jaccard_intersection[i],Jaccard_union[i],Jaccard_intersection[i]/Jaccard_union[i])
#            print(len(Jaccard_vector[0.5][i]), len(Jaccard[i]), np.average(Jaccard_vector[0.5][i]), np.average(Jaccard[i]))
        
#        for r in RMSE_review:
#            if len(RMSE_review[r]):
#                RMSE_o[i].append(np.average(RMSE_review[r]))
        
        
        
#    print(TP_o)
    for i in range(1,7):
        Precision[i] = np.average(Precision[i])
        Recall[i] = np.average(Recall[i])
        F1[i] = np.average(F1[i])
        
        if (TP_o[i] + FP_o[i]):
            Precision_o[i] = float(TP_o[i]) / (TP_o[i] + FP_o[i])
        if (TP_o[i] + FN_o[i]):
            Recall_o[i] = float(TP_o[i]) / (TP_o[i] + FN_o[i])
        if Precision_o[i] + Recall_o[i]:
            F1_o[i] = 2 * Precision_o[i] * Recall_o[i] / (Precision_o[i] + Recall_o[i])
        
#        RMSE_o[i] = np.average(RMSE_o[i])
    
    PreRec = json.dumps([Precision, Recall, F1, Precision_o, Recall_o, F1_o])
    drawPR(y_true,y_pred_list, predType+' %d'%classes[1], path, classes)
    
    drawROC(y_true,y_pred_list, predType+' %d'%classes[1], path)
    
    
    
    J_v = {'th':[], 1:[], 2:[], 3:[], 4:[], 5:[], 6:[]}
    A_v = {'th':[], 1:[], 2:[], 3:[], 4:[], 5:[], 6:[]}
    
    for thres in np.arange(-0.05,1.05,0.05):
        J_v['th'].append(thres)
        A_v['th'].append(thres)
        for i in range(1,7):
            J_v[i].append(np.average(Jaccard_vector[thres][i]))
            A_v[i].append(np.average(Accuracy_vector[thres][i]))
#            if thres == 0.5:
#                print(i,thres,np.average(Jaccard_vector[thres][i]))
#                print(i,np.average(Jaccard[i]))
    
    drawJacAcc(J_v, A_v, predType+' %d'%classes[1], path)
    
    drawJaccDist(Jaccard, Jaccard_int, predType+' %d'%classes[1], path)
    
    for r in aspectNumAvg:
        aspectNumAvg[r] = np.average(aspectNumAvg[r])
    
    for i1 in range(1,7):
        for i2 in range(i1+1,7):
            print(i1,i2,stats.ttest_ind(Jaccard[i1],Jaccard[i2]))
    J = [np.average(Jaccard[i]) for i in range(1,7)]
    J_int = [np.average(Jaccard_int[i]) for i in range(1,7)]
    
#    RMSE_final = [np.average(RMSE[i]) for i in range(1,7)]
#    RMSE_o_final = [np.average(RMSE_o[i]) for i in range(1,7)]
#    RMSE = [RMSE_final,RMSE_o_final]
#    
#    print(RMSE)
    
    return J, J_int, PreRec, aspectNumAvg#, RMSE
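Per review and per predictor column i, the loop above accumulates a standard Jaccard index over aspects at threshold 0.5. In isolation, with illustrative aspect sets:

# Illustrative sets for one review.
real = {'FOOD', 'STAFF'}    # aspects truly in the review (class == classes[1])
pred = {'FOOD', 'DRINKS'}   # aspects predicted present (probability > 0.5)
jaccard = float(len(real & pred)) / len(real | pred)   # 1/3 ~= 0.333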
Example #19
def featureImportance(review_dict, ignore_neutral = True):
    logger = logging.getLogger('signature.importantFeatureIdentification.featureImportance')
    logger.info('starting featureImportance')
    fsw = featureStructureWorker()
    N = len(review_dict)
    featuresDF = dict() # dictionary for counting Document Frequency
    itemFeatures = dict() # main dictionary with statistics (item:stat)
    for it, item in enumerate(review_dict):
        itemFeatures[item] = {'tfidfDict':{},'featureFreq':{},'sentiment':{},
                              'reviewsNumber':0, 'maxFreq':0, 'textFeatures':[],
                              'critical':[],'texts':{}}
        itemFeatures[item]['reviewsNumber'] = len(review_dict[item])
        
        critical = {}
        
        for r, review in enumerate(review_dict[item]):
            reviewFeatures = fsw.getReviewFeaturesExistence(review['features'])
            #fill in texts
            for sentId in review['features']:
                for feat in review['features'][sentId]:
                    itemFeatures[item]['texts'][feat] = itemFeatures[item]['texts'].get(feat,[])
                    itemFeatures[item]['texts'][feat].append([review['features'][sentId][feat],
                                                              review['sentences'][int(sentId)]])
            #print reviewFeatures
            
            for feature in fsw.featureIdicator:
                if not fsw.featureIdicator[feature]:
                    continue
                critical[feature] = critical.get(feature,{'+':[],'-':[],'0':[],'n':[],'1':[]})
                if feature not in reviewFeatures:
                    critical[feature]['n'].append(review['stars'])
                else:
                    critical[feature]['1'].append(review['stars'])
                    sent = np.average(reviewFeatures[feature])
                    if sent > 0:
                        critical[feature]['+'].append(review['stars'])
                    elif sent < 0:
                        critical[feature]['-'].append(review['stars'])
                    else:
                        critical[feature]['0'].append(review['stars'])
            
            for feature in reviewFeatures:
                #work with frequency
                itemFeatures[item]['featureFreq'][feature] = itemFeatures[item]['featureFreq'].get(feature,0)
                itemFeatures[item]['featureFreq'][feature] += 1
                #work with sentiment
                itemFeatures[item]['sentiment'][feature] = itemFeatures[item]['sentiment'].get(feature,[])
                
                
                
                if len(reviewFeatures[feature]):
                    if ignore_neutral:
                        arr = [x for x in reviewFeatures[feature] if x]
                        if len(arr):
                            itemFeatures[item]['sentiment'][feature].append(np.average(arr))
                        else:
                            itemFeatures[item]['sentiment'][feature].append(0.0)
                    else:
                        itemFeatures[item]['sentiment'][feature].append(np.average(reviewFeatures[feature]))
                else:
                    logger.error('unexpected empty sentiment list for feature %s'%feature)
                    itemFeatures[item]['sentiment'][feature].append(0.0)
            
            
            #print review.keys()
            if not len(itemFeatures[item]['textFeatures']):
                for tf in review['textFeatures']:
                    itemFeatures[item]['textFeatures'].append(tf)
            else:
                for i,tf in enumerate(review['textFeatures']):
                    itemFeatures[item]['textFeatures'][i] += tf
            
#            if not r%10:
#                logger.debug('%d reviews'%r)
            
                    
        for feature in itemFeatures[item]['featureFreq']:
            #work with frequency
            if itemFeatures[item]['featureFreq'][feature] > itemFeatures[item]['maxFreq']:
                itemFeatures[item]['maxFreq'] = itemFeatures[item]['featureFreq'][feature]
            #work with sentiment
            itemFeatures[item]['sentiment'][feature] = [round(np.average(itemFeatures[item]['sentiment'][feature]),3),
                                     len(itemFeatures[item]['sentiment'][feature])]
            #work with 'Document' Frequency (DF)
            featuresDF[feature] = featuresDF.get(feature, 0)
            featuresDF[feature] += 1
        
        for tf in range(len(itemFeatures[item]['textFeatures'])):
            itemFeatures[item]['textFeatures'][tf] /= itemFeatures[item]['reviewsNumber']
        
        
#        #critical
#        for feature in critical:
#            crit = False
#            for i in range(4):
#                for j in range(i,4):
#                    if len(critical[feature][i]) > 2:
#                        if len(critical[feature][j]) > 2:
#                            if sig_dif(critical[feature][i],critical[feature][j]) < 0.0501:
#                                crit = True
#                                print(feature,critical[feature][i],critical[feature][j])
#                                print(i,j,sig_dif(critical[feature][i],critical[feature][j]))
#                                print(np.average(critical[feature][i]),np.average(critical[feature][j]))
#            if crit:
#                itemFeatures[item]['critical'].append(feature)
        
        itemFeatures[item]['critical'] = critical.copy()
        
        if not it%1000:
            logger.debug('%d items complete'%it)
            
    #prepare IDF
    for feature in featuresDF:
        featuresDF[feature] = math.log(float(N)/featuresDF[feature])
    
    logger.debug('IDF prepared for %d features'%len(featuresDF))
    
    for it, item in enumerate(itemFeatures):
        for feature in itemFeatures[item]['featureFreq']:
            tf = float(itemFeatures[item]['featureFreq'][feature])/itemFeatures[item]['maxFreq']
            #print feature, tf
            idf = featuresDF[feature]
            itemFeatures[item]['tfidfDict'][feature] = round(tf*idf, 3)
            Ni = len(review_dict[item])
            t = round(100.*itemFeatures[item]['featureFreq'][feature]/Ni,2)
            itemFeatures[item]['featureFreq'][feature] = t
            
        itemFeatures[item]['tfidfList'] = [[itemFeatures[item]['tfidfDict'][feature],feature] 
                                           for feature in itemFeatures[item]['tfidfDict']]
        
        itemFeatures[item]['tfidfList'].sort(reverse = True)
        
        
        itemFeatures[item]['featureFreqList'] = [[itemFeatures[item]['featureFreq'][feature],feature] 
                                           for feature in itemFeatures[item]['featureFreq']]
        
        itemFeatures[item]['featureFreqList'].sort(reverse = True)
        
        if not it%1000:
            logger.debug('%d items completed'%it)
    
    return copy.deepcopy(itemFeatures)
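A worked example of the tf-idf values computed above, where tf normalizes a feature's frequency by the item's most frequent feature and idf is log(N/df):

import math

N, df = 1000, 100             # 1000 items; the feature occurs in 100 of them
idf = math.log(float(N)/df)   # ln(10) ~= 2.303
freq, maxFreq = 5, 10         # mentioned in 5 of the item's reviews; top feature in 10
tf = float(freq)/maxFreq      # 0.5
tfidf = round(tf*idf, 3)      # ~= 1.151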
Example #20
def applyFeatureExistance(busImportantFeatures, userImportantFeatures, testReviews, modelDict, path):
    logger = logging.getLogger('signature.applyFE.aFE')
    logger.info('starting applyFeatureExistance from %d reviews'%len(testReviews))
    fsw = featureStructureWorker()
    featureWeights = dict()
    featureSWeights = dict()
    
    featureQuality = dict()
    
    for k, feature in enumerate(fsw.featureIdicator):
#        print(k,feature)
#        if k > 15:
#            break
        
        
        if not fsw.featureIdicator[feature]:
            continue
        if feature not in modelDict:
            continue
        
        logger.debug('Start working with (%d) %s'%(k,feature))
        #get data
        X1, Y1, X2, Y2, missed = getFeatures(logger, feature, testReviews, busImportantFeatures, userImportantFeatures)
        
        
        #weight = frequency
        featureWeights[feature] = float(sum(Y1))/len(Y1)
        #weight = sentiment
        featureSWeights[feature] = float(sum(Y2))/len(Y2)
        
        '''
        Existence
        '''
        #Ypred = [int(x[1] > modelDict[feature][0])  for x in modelDict[feature][1].predict_proba(np.array(X1))]
        Ypred = modelDict[feature][1].predict(np.array(X1))
        Yreal = np.array(Y1)
        
        quality = list(f1_score(Yreal, Ypred, average=None)) 
        quality += list(precision_score(Yreal, Ypred, average=None)) 
        quality += list(recall_score(Yreal, Ypred, average=None))
        
        '''
        Sentiment
        '''
        #YSpred = [int(x[1] > modelDict[feature][2]) for x in modelDict[feature][3].predict_proba(np.array(X2))]
        YSpred = modelDict[feature][3].predict(np.array(X2))
        YSreal = np.array(Y2)
        
        qualityS = list(f1_score(YSreal, YSpred, average=None)) 
        qualityS += list(precision_score(YSreal, YSpred, average=None)) 
        qualityS += list(recall_score(YSreal, YSpred, average=None))
        
        featureQuality[feature]  = [round(featureWeights[feature],2), len(Y1)]
        featureQuality[feature] += [round(x,2) for x in quality]
        featureQuality[feature] += [round(featureSWeights[feature],2), len(Y2)]
        featureQuality[feature] += [round(x,2) for x in qualityS]
        
#        print(feature,featureQuality[feature])
        
        for r, review in enumerate(testReviews):
            existence = 0
            predictedExistence = 0
            
            X1, Y1, X2, Y2, missed = getFeatures(logger, feature, [review], busImportantFeatures, userImportantFeatures)
            if len(Y1): #check if the review has enough history
                review['exPredFeatures'] = review.get('exPredFeatures', {})
            
                existence = Y1[0]
                #print Yreal[r], Ypred[r], modelDict[feature][0]
                
                
                prediction = modelDict[feature][1].predict_proba(np.array(X1))[0][1] # probability of second class!!!
                #prediction = float(modelDict[feature][1].predict(np.array(X1))[0])
                #prediction = busImportantFeatures[review['business_id']]['featureFreq'][feature]/100.0
                if prediction >= modelDict[feature][0]:
                    predictedExistence = 1
                else:
                    predictedExistence = 0
                predictedExistence = prediction  # keep the raw probability; the thresholded value above is overridden
#                print(X1[0], prediction, busImportantFeatures[review['business_id']]['featureFreq'][feature]/100.0)
                randomPrediction = random.random()#int(random.random() > 0.5)
                simplePrediction = busImportantFeatures[review['business_id']]['featureFreq'][feature]/100.0#int(busImportantFeatures[review['business_id']]['featureFreq'][feature] > 40)
                basePredictionPos = 1
                basePredictionNeg = 0
                
                #print(existence, predictedExistence, randomPrediction, simplePrediction, basePredictionPos, basePredictionNeg)
                
                
                review['exPredFeatures'][feature] = [existence, predictedExistence,
                                                     randomPrediction, simplePrediction, 
                                                     basePredictionPos, basePredictionNeg]
                    
                #print(feature, review['exPredFeatures'][feature])
            
            '''
            Sentiment
            '''
            if len(Y2):
                review['sentPredFeatures'] = review.get('sentPredFeatures', {})
            
                sentiment = Y2[0]
                #print Yreal[r], Ypred[r], modelDict[feature][0]
                
                prediction = modelDict[feature][3].predict_proba(np.array(X2))[0][1]
                #prediction = float(modelDict[feature][3].predict(np.array(X2))[0])
                if prediction >= modelDict[feature][2]:
                    predictedSentiment = 1
                else:
                    predictedSentiment = 0
                predictedSentiment = prediction  # keep the raw probability; the thresholded value above is overridden
                
                randomSPrediction = random.random()#int(random.random() > 0.5)
                simpleSPrediction = (busImportantFeatures[review['business_id']]['sentiment'].get(feature,[0.0,0])[0]+1)/2.0#int(busImportantFeatures[review['business_id']]['sentiment'].get(feature,[0.0,0])[0] >= -0.5)
                baseSPredictionPos = 1
                baseSPredictionNeg = 0
                
                review['sentPredFeatures'][feature] = [sentiment, predictedSentiment,
                                                       randomSPrediction, simpleSPrediction,
                                                       baseSPredictionPos, baseSPredictionNeg]
            
            if not r%5000:
                logger.debug('%d reviews processed'%r)
    
    return testReviews, featureWeights, featureQuality
Example #21
def applyFeatureExistance(busImportantFeatures, userImportantFeatures, testReviews, modelDict, trainAveragesDict, path):
    logger = logging.getLogger('signature.aFE.applyFE')
    logger.info('starting applyFeatureExistance from %d reviews'%len(testReviews))
    fsw = featureStructureWorker()
    featureWeights = dict()
    featureF1 = dict()
    
    for i, feature in enumerate(fsw.featureIdicator):
        if not fsw.featureIdicator[feature]:
            continue
        logger.debug('Start working with %s'%feature)
        #get data
        X, Y = getFeatures(logger, feature, testReviews, busImportantFeatures, userImportantFeatures,
                                          trainAverages = trainAveragesDict[feature], is_train = False)
        
        #weight = frequency
        featureWeights[feature] = float(list(Y).count(1))/len(Y)
        
        Ypred = [x[1] for x in modelDict[feature][2].predict_proba(np.array(X))]
        Yreal = np.array(Y)
        
        Ybus = []
        for review in testReviews:
            busID = review['business_id']
            if busID in busImportantFeatures:
                pfreq = busImportantFeatures[busID]['featureFreq'].get(feature,0.0)
            else:
                pfreq = featureWeights[feature]
            Ybus.append(pfreq)
        
        featureF1[feature] = drawPR(feature,Yreal,Ypred,Ybus, modelDict[feature][0], path)
        
        for r, review in enumerate(testReviews):
            #reviewFeatures = fsw.getReviewFeaturesExistence(review['features'])
            review['exPredFeatures'] = review.get('exPredFeatures', {})
        
            existence = Yreal[r]
            #print Yreal[r], Ypred[r], modelDict[feature][0]
            if Ypred[r] >= modelDict[feature][0]:
                predictedExistence = 1
            else:
                predictedExistence = 0
                
            #check if feature important
            if existence + predictedExistence > 0.5:
                review['exPredFeatures'][feature] = [existence, predictedExistence]
                
            #print review['exPredFeatures']
            if not r%10000:
                logger.debug('%d reviews processed'%r)
        
    Jaccard = list()
    Jaccard_weighted = list()
    Jaccard_baseline = list()
    Jaccard_baseline_weighted = list()
    TP = 0
    FP = 0
    FN = 0
    
    TP_all = 0
    FP_all = 0
    FN_all = 0
    
    TP_bus = 0
    FP_bus = 0
    FN_bus = 0
    
    TP_int = 0
    FP_int = 0
    FN_int = 0
    
    
    for r, review in enumerate(testReviews):
        Jaccard_intersection = 0.0
        Jaccard_union = 0.0
        
        Jaccard_intersection_weighted = 0.0
        Jaccard_union_weighted = 0.0
        
        Jaccard_intersection_baseline = 0.0
        Jaccard_union_baseline = 0.0
        
        Jaccard_intersection_baseline_weighted = 0.0
        Jaccard_union_baseline_weighted = 0.0
        
        busID = review['business_id']
        if busID in busImportantFeatures:
            busAspects = set([f for f in busImportantFeatures[busID]['featureFreq'] if busImportantFeatures[busID]['featureFreq'][f] > 10 and
                                       busImportantFeatures[busID]['sentiment'][f][1] > 1])
        else:
            busAspects = set([f for f in fsw.featureIdicator if fsw.featureIdicator[f]])
            
            
#        userID = review['user_id']
#        if userID in userImportantFeatures:
#            userAspects = set([f for f in userImportantFeatures[userID]['featureFreq'] if userImportantFeatures[userID]['featureFreq'][f] > 10 and
#                                       userImportantFeatures[userID]['sentiment'][f][1] > 1])
#        else:
#            userAspects = set([f for f in fsw.featureIdicator if fsw.featureIdicator[feature]])
        
            
        #interBU = userAspects.intersection(busAspects)
        #building the INTERSECTION baseline
        if busID in busImportantFeatures:
            busImpAspects = set([f for f in busImportantFeatures[busID]['featureFreq'] if busImportantFeatures[busID]['featureFreq'][f] > 50 and
                                       busImportantFeatures[busID]['sentiment'][f][1] > 1])
            busIntAspects = set([f for f in busImportantFeatures[busID]['featureFreq'] if busImportantFeatures[busID]['featureFreq'][f] > 10 and
                                       busImportantFeatures[busID]['sentiment'][f][1] > 1])
        else:
            busImpAspects = set([f for f in fsw.featureIdicator if fsw.featureIdicator[f]])
            busIntAspects = set([f for f in fsw.featureIdicator if fsw.featureIdicator[f]])
            
            
        userID = review['user_id']
        if userID in userImportantFeatures:
            userAspects = set([f for f in userImportantFeatures[userID]['featureFreq'] if userImportantFeatures[userID]['featureFreq'][f] > 10 and
                                       userImportantFeatures[userID]['sentiment'][f][1] > 1])
        else:
            userAspects = set([f for f in fsw.featureIdicator if fsw.featureIdicator[f]])
        
        interBU = busImpAspects.union(userAspects.intersection(busIntAspects))
        
        
        for feature in review['exPredFeatures']:
            if review['exPredFeatures'][feature] == [1,1]:
                TP += 1
            elif review['exPredFeatures'][feature] == [0,1]:
                FP += 1
            elif review['exPredFeatures'][feature] == [1,0]:
                FN += 1
            
            #baseline all
            if review['exPredFeatures'][feature][0] == 1:
                TP_all += 1
           
            #baseline business
            if feature in busAspects and review['exPredFeatures'][feature][0] == 1:
                TP_bus += 1
            elif feature in busAspects and review['exPredFeatures'][feature][0] == 0:
                FP_bus += 1
            elif feature not in busAspects and review['exPredFeatures'][feature][0] == 1:
                FN_bus += 1
            
            
            #baseline intersection
            if feature in interBU and review['exPredFeatures'][feature][0] == 1:
                TP_int += 1
            elif feature in interBU and review['exPredFeatures'][feature][0] == 0:
                FP_int += 1
            elif feature not in interBU and review['exPredFeatures'][feature][0] == 1:
                FN_int += 1
            #print TP_int, FP_int, FN_int
            
            
            if review['exPredFeatures'][feature] == [1,1]:
                Jaccard_intersection += 1.0
                Jaccard_intersection_weighted += featureWeights[feature]
            Jaccard_union += 1.0
            Jaccard_union_weighted += featureWeights[feature]
            
            if review['exPredFeatures'][feature][0] == 1:
                Jaccard_intersection_baseline  += 1.0
                Jaccard_intersection_baseline_weighted += featureWeights[feature]
        
        for feature in fsw.featureIdicator:
            if fsw.featureIdicator[feature]:
                #the predict-everything baseline marks every active aspect;
                #only aspects that are not actually present count as false positives
                if review['exPredFeatures'].get(feature, [0, 0])[0] == 0:
                    FP_all += 1
                
                Jaccard_union_baseline += 1
                Jaccard_union_baseline_weighted += featureWeights[feature]
                
        
        if Jaccard_union:
            Jaccard.append(Jaccard_intersection/Jaccard_union)       
        if Jaccard_union_weighted:
            Jaccard_weighted.append(Jaccard_intersection_weighted/Jaccard_union_weighted)
        if Jaccard_union_baseline:
            Jaccard_baseline.append(Jaccard_intersection_baseline/Jaccard_union_baseline)
        if Jaccard_union_baseline_weighted:
            Jaccard_baseline_weighted.append(Jaccard_intersection_baseline_weighted/Jaccard_union_baseline_weighted)
    
    #SIGNATURE METHOD
    Precision = float(TP)/(TP+FP)
    Recall = float(TP)/(TP+FN)
    F1 = 2*Precision*Recall/(Precision+Recall)
    PreRec = [Precision,Recall,F1]
    
    #baseline ALL
    Precision_all = float(TP_all)/(TP_all+FP_all)
    Recall_all = float(TP_all)/(TP_all+FN_all)
    F1_all = 2*Precision_all*Recall_all/(Precision_all+Recall_all)
    PreRec_all = [Precision_all,Recall_all,F1_all]
    
    #baseline BUSINESS
    Precision_bus = float(TP_bus)/(TP_bus+FP_bus)
    Recall_bus = float(TP_bus)/(TP_bus+FN_bus)
    F1_bus = 2*Precision_bus*Recall_bus/(Precision_bus+Recall_bus)
    PreRec_bus = [Precision_bus,Recall_bus,F1_bus]
    
    #baseline INTERSECTION
    Precision_int = float(TP_int)/(TP_int+FP_int)
    Recall_int = float(TP_int)/(TP_int+FN_int)
    F1_int = 2*Precision_int*Recall_int/(Precision_int+Recall_int)
    PreRec_int = [Precision_int,Recall_int,F1_int]
    
    
    return testReviews, featureWeights, [[np.average(Jaccard), np.average(Jaccard_weighted)],
                         [np.average(Jaccard_baseline), 
                          np.average(Jaccard_baseline_weighted)]], featureF1, [PreRec,PreRec_all,
                                                                               PreRec_bus, PreRec_int]
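
For reference, the precision/recall/F1 computation used above as a small standalone helper, with the zero-division guards the inline version omits (the counts are invented):

# Sketch: precision, recall and F1 from raw TP/FP/FN counts.
def precision_recall_f1(tp, fp, fn):
    precision = float(tp) / (tp + fp) if (tp + fp) else 0.0
    recall = float(tp) / (tp + fn) if (tp + fn) else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return [precision, recall, f1]

print(precision_recall_f1(80, 20, 40))   # [0.8, 0.666..., 0.727...]
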
Exemplo n.º 22
def applySentimentMF(testReviews, modelDict, featureThres, featureWeights):
    logger = logging.getLogger('signature.aSMF.applySentimentMF')
    logger.info('starting applySentimentMatrixFactorization from %d reviews'%len(testReviews))
    fsw = featureStructureWorker()
    
    
    feature_data = dict()
    reviewDict = dict()
    
    for r, review in enumerate(testReviews):
        review['predSentiments'] = dict()
        busID = review['business_id']
        userID = review['user_id']
        sentiments = fsw.getReviewFeaturesSentiment(review['features'])
        #print sentiments
        ID = busID+'###'+userID
        reviewDict[ID] = review
        for feature in review['exPredFeatures']:
            if not fsw.featureIdicator.get(feature, None):
                continue
            sentiment = np.average(sentiments.get(feature,[0.0]))
            if feature in feature_data:
                feature_data[feature]['id'].append(ID)
                feature_data[feature]['user'].append(userID)
                feature_data[feature]['item'].append(busID)
                feature_data[feature]['rating'].append(sentiment)
            else:
                feature_data[feature] = {'id':[ID],'user':[userID],'item':[busID],'rating':[sentiment]}
        if not r%1000:
            logger.debug('%d reviews processed'%r)   
    
    rmse = list()
    rmse_weighted = list()
    rmse_baseline = list()
    rmse_baseline_weighted = list()
    
    accuracy = list()
    accuracy_weighted = list()
    accuracy_baseline = list()
    accuracy_baseline_weighted = list()
    
    
    weighted_sum = list()
    for f, feature in enumerate(feature_data):
#        if f > 0:
#            break
        #print feature, feature_data[feature]
        testData = graphlab.SFrame(feature_data[feature])
        prediction = modelDict[feature].predict(testData)
        testData['prediction'] = prediction
        
        for i,ID in enumerate(testData['id']):
#            if testData['prediction'][i] == featureThres[feature]:
#                sent_pred = 0.0
#            sent_pred = 1.0 if testData['prediction'][i] > featureThres[feature] else -1.0
            sent_pred = testData['prediction'][i]# - featureThres[feature]
            reviewDict[ID]['predSentiments'][feature] = sent_pred
            
            #print reviewDict[ID]['features']
            real_sent = feature_data[feature]['rating'][i]
            #print real_sent,sent_pred, accuracy
            if real_sent*sent_pred > 0.0:
                accuracy.append(1.0)
                accuracy_weighted.append(featureWeights[feature])
            elif real_sent*sent_pred < 0.0:
                accuracy.append(0.0)
                accuracy_weighted.append(0.0)
            
            #print real_sent,sent_pred, accuracy
            
            
            if real_sent*featureThres[feature] > 0:
                accuracy_baseline.append(1.0)
                accuracy_baseline_weighted.append(featureWeights[feature])
            elif real_sent*featureThres[feature] < 0:
                accuracy_baseline.append(0.0)
                accuracy_baseline_weighted.append(0.0)
            
            
            rmse.append(pow((real_sent-sent_pred),2))
            rmse_weighted.append(pow((real_sent-sent_pred),2)*featureWeights[feature])
            rmse_baseline.append(pow((real_sent-featureThres[feature]),2))
            rmse_baseline_weighted.append(pow((real_sent-featureThres[feature]),2)*featureWeights[feature])
            weighted_sum.append(featureWeights[feature])
            
        if not f%1:
            logger.debug('%d feature sentiments predicted'%f)
    
    
    #RMSE (square root of the mean squared error accumulated above)
    rmse = np.sqrt(np.average(rmse))
    #weighted rmse
    rmse_weighted = np.sqrt(np.sum(rmse_weighted)/np.sum(weighted_sum))
    #rmse baseline
    rmse_baseline = np.sqrt(np.average(rmse_baseline))
    #rmse baseline weighted
    rmse_baseline_weighted = np.sqrt(np.sum(rmse_baseline_weighted)/np.sum(weighted_sum))
    
    
    #ACCURACY
    accuracy = np.average(accuracy)
    #weighted accuracy
    accuracy_weighted = np.sum(accuracy_weighted)/np.sum(weighted_sum)
    #accuracy baseline
    accuracy_baseline = np.average(accuracy_baseline)
    #accuracy baseline weighted
    accuracy_baseline_weighted = np.sum(accuracy_baseline_weighted)/np.sum(weighted_sum)
    
    
    
    return [reviewDict[i] for i in reviewDict], [rmse,rmse_weighted,rmse_baseline,rmse_baseline_weighted,
                                                 accuracy,accuracy_weighted,accuracy_baseline,accuracy_baseline_weighted]
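
A minimal sketch, on invented numbers, of the weighted RMSE aggregation above: squared errors are weighted by per-feature frequency before averaging and taking the square root:

# Sketch: plain vs. frequency-weighted RMSE over per-observation errors.
import numpy as np

errors = np.array([0.2, -0.5, 0.1])    # (real - predicted) per observation
weights = np.array([0.9, 0.3, 0.6])    # per-feature frequency weights

rmse = np.sqrt(np.average(errors ** 2))
rmse_weighted = np.sqrt(np.sum((errors ** 2) * weights) / np.sum(weights))
print(rmse, rmse_weighted)
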
Exemplo n.º 23
def applySentimentMF(testReviews,  modelDict_ex, featureThres_ex, modelDict, featureThres):
    logger = logging.getLogger('signature.aSMF.applySentimentMF')
    logger.info('starting applySentimentMatrixFactorization from %d reviews'%len(testReviews))
    fsw = featureStructureWorker()
    
    
    aspect_data = dict()
    reviewDict_ex = dict()
    reviewDict = dict()
    
    for r, review in enumerate(testReviews):
        busID = review['business_id']
        userID = review['user_id']
        reviewID = review['review_id']
        for aspect in fsw.featureIdicator:
            if not fsw.featureIdicator.get(aspect, None):
                continue
            
            if aspect in aspect_data:
                aspect_data[aspect]['id'].append(reviewID)
                aspect_data[aspect]['user'].append(userID)
                aspect_data[aspect]['item'].append(busID)
            else:
                aspect_data[aspect] = {'id':[reviewID],'user':[userID],'item':[busID]}
        if not r%5000:
            logger.debug('%d reviews processed'%r)   
   
   
    for f, aspect in enumerate(aspect_data):
        logger.info('Processing (%d) %s'%(f,aspect))
        if aspect not in modelDict_ex or aspect not in modelDict:
            continue
        testData = graphlab.SFrame(aspect_data[aspect])
#        print('test prepared')
        prediction_ex = modelDict_ex[aspect].predict(testData)
        prediction = modelDict[aspect].predict(testData)
#        print('sentiment predicted')
        testData['prediction_ex'] = prediction_ex
        testData['prediction'] = prediction
        
        #existence
        testData_prediction_ex = list(testData['prediction_ex'])
        for i,prediction_ex in enumerate(testData_prediction_ex):
            reviewID = aspect_data[aspect]['id'][i]
            reviewDict_ex[reviewID] = reviewDict_ex.get(reviewID,{})
            
            ex_pred_adjust = (prediction_ex*0.5/featureThres_ex[aspect])
            if ex_pred_adjust < 0:
                ex_pred_adjust = 0
            if ex_pred_adjust > 1:
                ex_pred_adjust = 1
            reviewDict_ex[reviewID][aspect] = ex_pred_adjust
        
        #sentiment
        testData_prediction = list(testData['prediction'])
        for i,sent_prediction in enumerate(testData_prediction):
            reviewID = aspect_data[aspect]['id'][i]
            reviewDict[reviewID] = reviewDict.get(reviewID,{})
            
            sent_pred_adjust = (sent_prediction*0.5/featureThres[aspect])
            if sent_pred_adjust < 0:
                sent_pred_adjust = 0
            if sent_pred_adjust > 1:
                sent_pred_adjust = 1
            reviewDict[reviewID][aspect] = sent_pred_adjust
            
        if not f%1:
            logger.debug('%d aspects processed'%f)
    
    return reviewDict_ex, reviewDict
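
A standalone sketch of the rescaling applied above: a raw model score is scaled so that the tuned threshold maps to 0.5, then clipped to [0, 1]:

# Sketch: threshold-anchored rescaling with clipping.
def rescale(score, threshold):
    adjusted = score * 0.5 / threshold   # the threshold itself maps to 0.5
    return min(max(adjusted, 0.0), 1.0)

print(rescale(0.9, 0.4))   # 1.0 (clipped)
print(rescale(0.2, 0.4))   # 0.25
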
Exemplo n.º 24
def applySMF(path, limit = np.Inf):
    logger = logging.getLogger('signature.aSMF')
    logger.info('starting applySentimentMF')
    #get data
    r_file = path+'yelp_reviews_test_predictions.json'
    
    testReviews = list()
    for counter, line in enumerate(open(r_file,'r')):
        if not counter%1000:
            logger.debug('%d reviews loaded'%counter)
        if counter > limit:
            break
        testReviews.append(json.loads(line.strip()))
    logger.info('Test Reviews loaded from %s'%r_file)
    
    #load model
    modelDict = dict()
    featureThres = dict()
    fsw = featureStructureWorker()
    for feature in fsw.featureIdicator:
        if not fsw.featureIdicator[feature]:
            continue
        try:
            modelPath = path + '/sentimentModels/%s_sentiment.model'%feature
            logger.debug('loading model from %s'%modelPath)
            modelDict[feature] = graphlab.load_model(modelPath)
            
            #load average
            thres_path = path+'/sentimentModels/%s_sentiment.threshold'%feature
            infile = open(thres_path,'r')
            featureThres[feature] = float(infile.readline().strip())
            infile.close()
        except Exception:
            logger.error('There is no model for feature: %s'%feature)
            continue
        
    logger.info('Models loaded')
    
    #load featureWeights
    infile = open(path+'/featureWeights.json','r')
    featureWeights = json.loads(infile.readline().strip())
    infile.close()
    
    #run function
    reviewsPrediction, results = applySentimentMF(testReviews, modelDict, featureThres, featureWeights)
    
    #save result
    outfile = open(path+'yelp_reviews_test_predictions.json','w')
    for review in reviewsPrediction:
        outfile.write(json.dumps(review).encode('utf8', 'ignore')+'\n')
    outfile.close()
    
    if not os.path.isdir(path+'/results/'):
        os.mkdir(path+'/results/')
    outfile = open(path+'/results/Sentiment_prediction.txt','w')
    outfile.write('RMSE = %f\nRMSE_weighted = %f'%(results[0], results[1]))
    outfile.write('\n\nRMSE_baseline = %f\nRMSE_baseline_weighted = %f'%(results[2], results[3]))
    outfile.write('\n===============\n\nAccuracy = %f\nAccuracy_weighted = %f'%(results[4], results[5]))
    outfile.write('\n\nAccuracy_baseline = %f\nAccuracy_baseline_weighted = %f'%(results[6], results[7]))
    outfile.close()
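
The loading loop above implies a simple persistence format: one '<feature>_sentiment.threshold' file per feature, holding a single float on its first line. A sketch of that round trip (the directory and feature name are hypothetical):

# Sketch: writing and reading a per-feature threshold file.
import os, tempfile

feature = 'SERVICE'                      # hypothetical feature name
model_dir = tempfile.mkdtemp()
thres_path = os.path.join(model_dir, '%s_sentiment.threshold' % feature)

with open(thres_path, 'w') as out:       # written at training time
    out.write('0.42\n')

with open(thres_path) as infile:         # read back before prediction
    featureThres = float(infile.readline().strip())
print(featureThres)                      # 0.42
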
Exemplo n.º 25
def featureImportance(review_dict, ignore_neutral = True):
    logger = logging.getLogger('signature.IFI.fI')
    logger.info('starting featureImportance')
    fsw = featureStructureWorker()
    N = len(review_dict)
    featuresDF = dict() # dictionary for counting Document Frequency
    itemFeatures = dict() # main dictionary with statistics (item:stat)
    for it, item in enumerate(review_dict):
        itemFeatures[item] = {'tfidfDict':{},'featureFreq':{},'sentiment':{},'reviewsNumber':0, 'maxFreq':0, 'textFeatures':[]}
        itemFeatures[item]['reviewsNumber'] = len(review_dict[item])
        for r, review in enumerate(review_dict[item]):
            reviewFeatures = fsw.getReviewFeaturesExistence(review['features'])
            #print reviewFeatures
            for feature in reviewFeatures:
                #work with frequency
                itemFeatures[item]['featureFreq'][feature] = itemFeatures[item]['featureFreq'].get(feature,0)
                itemFeatures[item]['featureFreq'][feature] += 1
                #work with sentiment
                itemFeatures[item]['sentiment'][feature] = itemFeatures[item]['sentiment'].get(feature,[])
                if len(reviewFeatures[feature]):
                    if ignore_neutral:
                        arr = [x for x in reviewFeatures[feature] if x]
                        if len(arr):
                            itemFeatures[item]['sentiment'][feature].append(np.average(arr))
                        else:
                            itemFeatures[item]['sentiment'][feature].append(0.0)
                    else:
                        itemFeatures[item]['sentiment'][feature].append(np.average(reviewFeatures[feature]))
                else:
                    itemFeatures[item]['sentiment'][feature].append(0.0)
            
            
            #print review.keys()
            if not len(itemFeatures[item]['textFeatures']):
                for tf in review['textFeatures']:
                    itemFeatures[item]['textFeatures'].append(tf)
            else:
                for i,tf in enumerate(review['textFeatures']):
                    itemFeatures[item]['textFeatures'][i] += tf
            
#            if not r%10:
#                logger.debug('%d reviews'%r)
            
                    
        for feature in itemFeatures[item]['featureFreq']:
            #work with frequency
            if itemFeatures[item]['featureFreq'][feature] > itemFeatures[item]['maxFreq']:
                itemFeatures[item]['maxFreq'] = itemFeatures[item]['featureFreq'][feature]
            #work with sentiment
            itemFeatures[item]['sentiment'][feature] = [round(np.average(itemFeatures[item]['sentiment'][feature]),3),
                                     len(itemFeatures[item]['sentiment'][feature])]
            #work with 'Document' Frequency (DF)
            featuresDF[feature] = featuresDF.get(feature, 0)
            featuresDF[feature] += 1
        
        for tf in range(len(itemFeatures[item]['textFeatures'])):
            itemFeatures[item]['textFeatures'][tf] /= itemFeatures[item]['reviewsNumber']
        
        if not it%1000:
            logger.debug('%d items'%it)
            
    #prepare IDF
    for feature in featuresDF:
        featuresDF[feature] = math.log(float(N)/featuresDF[feature])
    
    logger.debug('IDF prepared for %d features'%len(featuresDF))
    
    for it, item in enumerate(itemFeatures):
        for feature in itemFeatures[item]['featureFreq']:
            tf = float(itemFeatures[item]['featureFreq'][feature])/itemFeatures[item]['maxFreq']
            #print feature, tf
            idf = featuresDF[feature]
            itemFeatures[item]['tfidfDict'][feature] = round(tf*idf, 3)
            Ni = len(review_dict[item])
            t = round(100.*itemFeatures[item]['featureFreq'][feature]/Ni,2)
            itemFeatures[item]['featureFreq'][feature] = t
            
        itemFeatures[item]['tfidfList'] = [[itemFeatures[item]['tfidfDict'][feature],feature] 
                                           for feature in itemFeatures[item]['tfidfDict']]
        
        itemFeatures[item]['tfidfList'].sort(reverse = True)
        
        
        itemFeatures[item]['featureFreqList'] = [[itemFeatures[item]['featureFreq'][feature],feature] 
                                           for feature in itemFeatures[item]['featureFreq']]
        
        itemFeatures[item]['featureFreqList'].sort(reverse = True)
        
        if not it%1000:
            logger.debug('%d items completed'%it)
    return copy.deepcopy(itemFeatures)
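
A minimal sketch of the TF-IDF scheme implemented above, with invented counts: term frequency is normalized by the item's most frequent aspect, and IDF is log(N / document frequency):

# Sketch: per-item TF-IDF over aspect counts.
import math

N = 1000                                      # total number of items
feature_freq = {'FOOD': 40, 'SERVICE': 25}    # this item's aspect counts
features_df = {'FOOD': 900, 'SERVICE': 300}   # items mentioning each aspect

max_freq = max(feature_freq.values())
tfidf = {}
for feature, freq in feature_freq.items():
    tf = float(freq) / max_freq
    idf = math.log(float(N) / features_df[feature])
    tfidf[feature] = round(tf * idf, 3)
print(tfidf)   # rare-but-frequent aspects score highest
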
Exemplo n.º 26
def learnSentimentMatrixFactorization(trainReviews, path):
    logger = logging.getLogger('signature.learnSentimentMF.Worker')
    logger.info('starting learnSentimentMatrixFactorization from %d reviews'%len(trainReviews))
    fsw = featureStructureWorker()
    modelDict_ex = dict()
    featureThres_ex = dict()
    
    modelDict = dict()
    featureThres = dict()
    for i, feature in enumerate(fsw.featureIdicator):
#        if feature != 'SERVICE':
#            continue
        if not fsw.featureIdicator[feature]:
            continue
        
        logger.debug('Start working with (%d) %s'%(i,feature))
        
        learnData_ex = {'user':[],'item':[],'rating':[]}
        learnData = {'user':[],'item':[],'rating':[]}
        
        
        
        for j, review in enumerate(trainReviews):
            reviewFeatures = fsw.getReviewFeaturesSentiment(review['features'])
            
            
            busID = review['business_id']
            userID = review['user_id']
            
            learnData_ex['user'].append(userID)
            learnData_ex['item'].append(busID)
            if feature in reviewFeatures:
                learnData_ex['rating'].append(1)
            else:
                learnData_ex['rating'].append(0)
                
                
            if feature not in reviewFeatures:
                continue   
            
            sent = np.average(reviewFeatures[feature])
            if sent:
                learnData['user'].append(userID)
                learnData['item'].append(busID)
                
                if sent > 0:
                    learnData['rating'].append(1)
                elif sent < 0:
                    learnData['rating'].append(0)
            
            
        if len(learnData_ex['rating']):
            data_ex = graphlab.SFrame(learnData_ex)
            featureThres_ex[feature], modelDict_ex[feature] = getBestMFThres(logger, feature, data_ex, path)

           
        #CROSS VALIDATION
        if len(learnData['rating']):
            data = graphlab.SFrame(learnData)
            featureThres[feature], modelDict[feature] = getBestMFThres(logger, feature, data, path)

    return modelDict_ex, featureThres_ex, modelDict, featureThres
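
A minimal sketch, assuming the (now discontinued) GraphLab Create package this code targets, of fitting one matrix-factorization model on user/item/rating triples like the learnData dictionaries built above. getBestMFThres is not shown in this file and presumably wraps a step like this plus threshold selection; the tiny dataset is invented:

# Sketch: matrix factorization on binary sentiment ratings with GraphLab Create.
import graphlab

data = graphlab.SFrame({'user': ['u1', 'u2', 'u1', 'u3'],
                        'item': ['b1', 'b1', 'b2', 'b2'],
                        'rating': [1, 0, 1, 0]})
model = graphlab.factorization_recommender.create(
    data, user_id='user', item_id='item', target='rating')
print(model.predict(data))   # one predicted rating per row
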
Exemplo n.º 27
def getFeatures(
    logger, feature, reviewsSet, busImportantFeatures, userImportantFeatures, trainAverages={}, is_train=True
):

    business_dict, user_dict = loadData(logger)
    gP = genderPredictor()
    gP.load()
    cW = categoryWorker()
    cW.load()

    if is_train:
        trainAverages = {"mean": [], "std": []}
    else:
        pass
        # load trainAverages

    fsw = featureStructureWorker()
    X = list()
    Y = list()

    for review in reviewsSet:
        reviewFeatures = fsw.getReviewFeaturesExistence(review["features"])
        if feature in reviewFeatures:
            existence = 1
        else:
            existence = 0

        busID = review["business_id"]
        userID = review["user_id"]

        bus_basic_features = getBasicFeatures(feature, busID, busImportantFeatures, is_train)
        user_basic_features = getBasicFeatures(feature, userID, userImportantFeatures, is_train)
        bus_additional_features = getBusinessFeatures(busID, business_dict, cW)
        user_additional_features = getUserFeatures(userID, user_dict, gP)

        #        if not bus_basic_features or not user_basic_features:
        #            continue
        # sex = [review['usersSex']]

        Y.append(existence)
        X.append(bus_basic_features + user_basic_features + bus_additional_features + user_additional_features)  # +sex)

        if is_train:
            if not len(trainAverages["mean"]):
                for i in range(len(X[0])):
                    trainAverages["mean"].append([])
                    trainAverages["std"].append([])

            for i, value in enumerate(X[-1]):
                if value is not None:
                    trainAverages["mean"][i].append(value)
    # count means
    if is_train:
        for i in range(len(trainAverages["mean"])):
            trainAverages["std"][i] = np.std(trainAverages["mean"][i])
            trainAverages["mean"][i] = np.average(trainAverages["mean"][i])

    # normalization
    for vector in X:
        for i in range(len(vector)):
            if vector[i] is None:
                vector[i] = 0.0
            else:
                if trainAverages["std"][i]:
                    vector[i] = (vector[i] - trainAverages["mean"][i]) / trainAverages["std"][i]
                else:
                    vector[i] = vector[i] - trainAverages["mean"][i]

    if is_train:
        return X, Y, trainAverages
    else:
        return X, Y
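
A small sketch of the normalization scheme above on invented values: None entries become 0.0 after centering, other entries are z-scored with the training mean/std, falling back to plain centering when the std is zero:

# Sketch: z-score normalization with None handling, as in the loop above.
train_means = [2.0, 10.0]
train_stds = [1.0, 0.0]            # a zero std means centering only

vector = [3.0, None]
for i, value in enumerate(vector):
    if value is None:
        vector[i] = 0.0
    elif train_stds[i]:
        vector[i] = (value - train_means[i]) / train_stds[i]
    else:
        vector[i] = value - train_means[i]
print(vector)   # [1.0, 0.0]
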
Exemplo n.º 28
def learnFeatureExistance(busImportantFeatures, userImportantFeatures, trainReviews, path):
    logger = logging.getLogger('signature.learnFE')
    logger.info('starting learnFeatureExistance from %d reviews'%len(trainReviews))
    fsw = featureStructureWorker()
    modelDict = dict()
    
    missed_prediction = dict()
    for f, feature in enumerate(fsw.featureIdicator):
        if not fsw.featureIdicator[feature]:
            continue
        logger.info('Start working with (%d) %s'%(f,feature))
        #get data
        X1, Y1, X2, Y2, missed = getFeatures(logger, feature, trainReviews, busImportantFeatures, userImportantFeatures)
        missed_prediction[feature] = [missed, len(Y1)]
        
        
#        stat_line = '%d (%d/%d)reviews (%d of them pos(%d)/neg(%d))'%(len(Y1),sum(Y1),len(Y1) - sum(Y1),
#                                                                      len(Y2),sum(Y2),len(Y2) - sum(Y2))

        logger.debug('Got features for %d (%d/%d)reviews (%d of them pos(%d)/neg(%d))'%(len(Y1),sum(Y1),len(Y1) - sum(Y1),
                                                                                     len(Y2),sum(Y2),len(Y2) - sum(Y2)))

        print(len(Y1),len(Y2))
        if len(Y1) < 100 or sum(Y1) < 50 or len(Y1) - sum(Y1) < 50:
            continue
        if len(Y2) < 100 or sum(Y2) < 50 or len(Y2) - sum(Y2) < 50:
            continue

#        if len(Y1) < 10 or sum(Y1) < 10 or len(Y1) - sum(Y1) < 10:
#            continue
#        if len(Y2) < 10 or sum(Y2) < 10 or len(Y2) - sum(Y2) < 10:
#            continue


#        #cross validation
#        indicator = range(len(X))
#        random.shuffle(indicator)
#        thres = int(len(indicator)*0.8)
#        trainX = np.array([X[i] for i in indicator[:thres]])
#        trainY = np.array([Y[i] for i in indicator[:thres]])
#        testX = np.array([X[i] for i in indicator[thres:]])
#        testY = np.array([Y[i] for i in indicator[thres:]])
        
        #Logistic Regression
        bestThres, bestQ,logmodel = getLogModel(logger, feature, X1, Y1, path)
        
        logger.info('Sentiment prediction for (%d) %s'%(f,feature))
        #Logistic Regression
        bestThres_2, bestQ_2, logmodel_2 = getLogModel(logger, feature, X2, Y2, path)
        
        
        feat_info = [len(Y1), sum(Y1), len(Y1) - sum(Y1)] + bestQ + [len(Y2), sum(Y2),len(Y2) - sum(Y2)] + bestQ_2
        
        #bestThresSVM,bestF1SVM,svmmodel = getBestSVMModel(logger, feature, X, Y, path)
        
#       crossValidation(logger, np.array(X), np.array(Y))
        
        
        modelDict[feature] = [bestThres, logmodel, bestThres_2, logmodel_2, feat_info]
        
#        print(f)
#        if f > 6:
#            break
        
    return modelDict
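
getLogModel is not shown in this file; a plausible sketch of the threshold tuning it is assumed to perform, using scikit-learn on an invented toy dataset:

# Sketch: pick the probability cut-off that maximizes F1 on held-out data.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

X = np.array([[0.0], [0.2], [0.4], [0.6], [0.8], [1.0]])
y = np.array([0, 0, 0, 1, 1, 1])
model = LogisticRegression().fit(X, y)

probs = model.predict_proba(X)[:, 1]
best_thres, best_f1 = 0.5, -1.0
for thres in np.arange(0.1, 0.9, 0.05):
    f1 = f1_score(y, (probs >= thres).astype(int))
    if f1 > best_f1:
        best_thres, best_f1 = thres, f1
print(best_thres, best_f1)
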
Exemplo n.º 29
def aspectStat(path):
    logger = logging.getLogger('signature.aspectStat')
    logger.info('start computing aspect Stat')
    #get data
    b_file = path+'/businessProfile.json'
    u_file = path+'/userProfile.json'
    
    busImportantFeatures = json.loads(open(b_file,'r').readline())
    logger.info('Important BUSINESS Features loaded')
    userImportantFeatures = json.loads(open(u_file,'r').readline())
    logger.info('Important USER Features loaded')

    
    aspectStat = dict()
    
    fsw = featureStructureWorker()
    for f, aspect in enumerate(fsw.featureIdicator):
        aspectStat[aspect] = {'total':0, 'bus10':0, 'user10':0, 'posNum':0, 'negNum':0,
                              'busDiff+-':0, 'userDiff+-':0, 'busDiff01':0, 'userDiff01':0}
        for busID in busImportantFeatures:
            bus_reviews = busImportantFeatures[busID]['reviewsNumber']
            bus_freq = busImportantFeatures[busID]['featureFreq'].get(aspect,0.0)
            aspectStat[aspect]['total'] += bus_freq/100.0 * bus_reviews
            if aspect in busImportantFeatures[busID]['critical']:
                aspectStat[aspect]['posNum'] += len(busImportantFeatures[busID]['critical'][aspect]['+'])
                aspectStat[aspect]['negNum'] += len(busImportantFeatures[busID]['critical'][aspect]['-'])
            
            if bus_freq > 10:
                aspectStat[aspect]['bus10'] += 1
                
                if aspect in busImportantFeatures[busID]['critical']:
                    exist = busImportantFeatures[busID]['critical'][aspect]['1']
                    pos = busImportantFeatures[busID]['critical'][aspect]['+']
                    neg = busImportantFeatures[busID]['critical'][aspect]['-']
#                    neutr = busImportantFeatures[busID]['critical'][aspect]['0']
                    none = busImportantFeatures[busID]['critical'][aspect]['n']
                    
                    if sig_dif(pos,neg) < 0.10501:
                        aspectStat[aspect]['busDiff+-'] += 1
                    
                    if sig_dif(exist,none) < 0.10501:
                        aspectStat[aspect]['busDiff01'] += 1
                        
        
        for userID in userImportantFeatures:
#            user_reviews = userImportantFeatures[userID]['reviewsNumber']
            user_freq = userImportantFeatures[userID]['featureFreq'].get(aspect,0.0)
            
            if user_freq > 1:
                aspectStat[aspect]['user10'] += 1
                
                if aspect in userImportantFeatures[userID]['critical']:
                    exist = userImportantFeatures[userID]['critical'][aspect]['1']
                    pos = userImportantFeatures[userID]['critical'][aspect]['+']
                    neg = userImportantFeatures[userID]['critical'][aspect]['-']
#                    neutr = userImportantFeatures[userID]['critical'][aspect]['0']
                    none = userImportantFeatures[userID]['critical'][aspect]['n']
                    
                    if sig_dif(pos,neg) < 0.10501:
                        aspectStat[aspect]['userDiff+-'] += 1
                    
                    if sig_dif(exist,none) < 0.10501:
                        aspectStat[aspect]['userDiff01'] += 1
                    
        logger.debug('done with (%d) %s'%(f,aspect))
    
    if not os.path.isdir(path+'results/'):
        os.mkdir(path+'results/')
    
    outfile = open(path+'/results/aspectStatistics.txt','w')
    outfile.write('aspect\ttotal\tbus10\tuser10\tposNum\tnegNum\tbusDiff+-\tuserDiff+-\tbusDiff01\tuserDiff01\n')
    
    aspects = list(aspectStat.keys())
    aspects.sort()
    for aspect in aspects:
        r = aspectStat[aspect]
        outfile.write('%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n'%(aspect, r['total'], r['bus10'], r['user10'],
                                                                  r['posNum'],r['negNum'],
                                                                  r['busDiff+-'], r['userDiff+-'],
                                                                  r['busDiff01'], r['userDiff01']))
    outfile.close()
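
The sig_dif helper is not defined in this file; the 0.10501 cut-off suggests it returns something like a p-value. A purely hypothetical sketch of one test it could perform (Welch's t-test via scipy; the samples are invented):

# Hypothetical sketch: significance of the difference between two samples,
# e.g. star ratings of reviews with positive vs. negative aspect mentions.
from scipy import stats

pos = [5, 4, 5, 4, 5]
neg = [2, 3, 1, 2]
_, p_value = stats.ttest_ind(pos, neg, equal_var=False)
print(p_value < 0.10501)   # True when the difference is significant
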
Exemplo n.º 30
def applyTopicModel(logger, path, topic_num):
    stat_file = path + "yelp_reviews_features_stat.json"
    train_file = path + "yelp_reviews_features_train.json"
    extrain_file = path + "yelp_reviews_features_extrain.json"
    test_file = path + "yelp_reviews_features_test.json"

    # load model
    model_path = path + "modelLDA/"
    dictionary = corpora.Dictionary.load(model_path + "dictionary_%d.lda" % topic_num)
    logger.info("Dictionary loaded from: " + model_path + "dictionary_%d.lda" % topic_num)

    lda_model = models.ldamodel.LdaModel.load(model_path + "model_%d.lda" % topic_num)
    logger.info("Model loaded from:" + model_path + "model_%d.lda" % topic_num)

    files = [stat_file, train_file, extrain_file, test_file]
    fsw = featureStructureWorker()

    for infile in files:
        reviews = list()
        for counter, line in enumerate(open(infile, "r")):
            if not counter % 10000:
                logger.debug("%d reviews loaded" % counter)
            # print infile, line
            # load review information
            review = json.loads(line.strip())
            reviews.append(review)

        #        outfile = open(infile.replace('.json','_old.json'),'w')
        #        for review in reviews:
        #            outfile.write(json.dumps(review).encode('utf8', 'ignore')+'\n')
        #            outfile.close()

        #        outfile = open(infile,'w')
        #        outname = infile.replace('.json','_old.json')
        outname = infile
        logger.info("writing topic-model features to %s" % outname)
        outfile = open(outname, "w")
        for counter, review in enumerate(reviews):
            if not counter % 1000:
                logger.debug("%d reviews loaded" % counter)

            text = list()
            for sentence in review["features"]:
                for aspect in review["features"][sentence]:
                    text.append(aspect + "_%s" % review["features"][sentence][aspect].strip())

            topics = lda_model[dictionary.doc2bow(text)]

            res = dict()
            if len(topics):
                res["1"] = topTopics(topics)
            # print topics, res

            if "features_sent" not in review:
                review["features_sent"] = review["features"].copy()
            review["features"] = res.copy()

            outfile.write(json.dumps(review).encode("utf8", "ignore") + "\n")

        #            if counter > 10:
        #                break
        outfile.close()
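
A minimal sketch, assuming gensim (the source of the corpora/models imports above), of the inference step: aspect tokens are mapped through the saved dictionary and scored by the LDA model. topTopics is not defined in this file; the toy corpus is invented:

# Sketch: LDA topic inference over aspect_sentiment tokens.
from gensim import corpora, models

texts = [['FOOD_positive', 'SERVICE_negative'], ['FOOD_negative']]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(t) for t in texts]
lda_model = models.ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=2)

topics = lda_model[dictionary.doc2bow(['FOOD_positive'])]
print(topics)   # list of (topic_id, probability) pairs
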