def initialize(inputFileDir, rdr): (parentUserIdToUserDict, parentBusinessIdToBusinessDict, parent_reviews) =\ rdr.readData(inputFileDir) print len(parent_reviews) bnssEdited = 0 for bnssId in parentBusinessIdToBusinessDict.iterkeys(): if bnssId in parentBusinessIdToBusinessDict: parentBusinessIdToBusinessDict[bnssId].setRating(parentBusinessIdToBusinessDict[bnssId].getRating()) bnssEdited+=1 print bnssEdited parent_graph = OldGraphUtil.createGraph(parentUserIdToUserDict,parentBusinessIdToBusinessDict,parent_reviews) # unnecessary_reviews = set() # cc = sorted(networkx.connected_component_subgraphs(parent_graph,False), key=len, reverse=True) # for g in cc: # g.initializeDicts() # # for g in cc: # usr_count = g.getUserCount() # bnss_count = g.getBusinessCount() # if(usr_count==1): # usr = g.getUser(g.getUserIds()[0]) # for bnss in g.neighbors(usr): # review = g.getReview(usr.getId(),bnss.getId()) # unnecessary_reviews.add(review.getId()) # if(bnss_count==1): # bnss = g.getBusiness(g.getBusinessIds()[0]) # for usr in g.neighbors(bnss): # review = g.getReview(usr.getId(),bnss.getId()) # unnecessary_reviews.add(review.getId()) # print len(unnecessary_reviews) # cross_9_months_graphs = SIAUtil.createTimeBasedGraph(parentUserIdToUserDict, parentBusinessIdToBusinessDict, parent_reviews, '9-M') cross_time_graphs = OldGraphUtil.createTimeBasedGraph(parentUserIdToUserDict,\ parentBusinessIdToBusinessDict,\ parent_reviews, '1-Y') beforeThreadTime = datetime.now() cross_time_lbp_runner_threads = [] for time_key in cross_time_graphs.iterkeys(): print '----------------------------------GRAPH-', time_key, '---------------------------------------------\n' lbp_runner = LBPRunnerThread(cross_time_graphs[time_key], 50, 'Initial LBP Runner for Time'+str(time_key)) cross_time_lbp_runner_threads.append(lbp_runner) lbp_runner.start() for lbp_runner in cross_time_lbp_runner_threads: lbp_runner.join() afterThreadTime = datetime.now() print 'Time to be 
reduced',afterThreadTime-beforeThreadTime return (cross_time_graphs, parent_graph)
def calculateMergeAbleAndNotMergeableBusinessesAcrossTime(cross_time_graphs, parent_graph, bnss_score_all_time_map): # calculate interesting businesses across time mergeable_businessids = dict() not_mergeable_businessids = dict() for bnss_key in bnss_score_all_time_map.iterkeys(): time_score_map = bnss_score_all_time_map[bnss_key] scores = [time_score_map[time_key][1] for time_key in time_score_map.iterkeys()] good_scores = OldGraphUtil.rm_outlier(scores) # print 'IN: ', scores # REMOVE # print 'OP: ', good_scores # print '*'*10 for time_key in time_score_map.iterkeys(): score = time_score_map[time_key][1] if(score in good_scores): if time_key not in mergeable_businessids: mergeable_businessids[time_key] = set() mergeable_businessids[time_key].add(bnss_key) else: if time_key not in not_mergeable_businessids: not_mergeable_businessids[time_key] = set() not_mergeable_businessids[time_key].add(bnss_key) for time_key in not_mergeable_businessids.iterkeys(): print 'Interesting businesses in time:', time_key,len(not_mergeable_businessids[time_key]) for time_key in mergeable_businessids.iterkeys(): print 'Not Interesting businesses in time:', time_key,len(mergeable_businessids[time_key]) return (mergeable_businessids,not_mergeable_businessids)
def run(inputFileDir, rdr):
    """End-to-end single-graph pipeline: read the data, build the whole
    user/business/review graph, print size and component statistics, run
    one LBP pass, and report classification counts plus an accuracy
    breakdown against each review's isRecommended() flag.

    inputFileDir -- directory handed to rdr.readData
    rdr          -- reader exposing readData(dir) ->
                    (userIdToUserDict, businessIdToBusinessDict, reviews)

    Returns nothing; all results are printed.  Wall-clock timings for the
    three phases (graph build, statistics, LBP) are printed at the end.
    """
    beforeGraphPopulationTime = datetime.now()
    (parentUserIdToUserDict, parentBusinessIdToBusinessDict, parent_reviews) = rdr.readData(inputFileDir)
    wholeGraph = OldGraphUtil.createGraph(parentUserIdToUserDict, parentBusinessIdToBusinessDict, parent_reviews)
    afterGraphPopulationTime = datetime.now()
    beforeStatisticsGenerationTime = datetime.now()
    print'----------------------Number of Users, Businesses, Reviews----------------------------------------------------------------------'
    print 'Number of Users- ', len(parentUserIdToUserDict.keys()),\
     'Number of Businesses- ', len(parentBusinessIdToBusinessDict.keys()),\
     'Number of Reviews- ', len(parent_reviews)
    print'----------------------Component Sizes----------------------------------------------------------------------'
    # Connected components sorted largest-first; only components with more
    # than one node are reported.
    cc = sorted(nx.connected_component_subgraphs(wholeGraph, False), key=len, reverse=True)
    lenListComponents = [len(c.nodes()) for c in cc if len(c.nodes()) > 1]
    print lenListComponents
    # LBP runs on the whole graph, not a single component (commented
    # alternative below selected the second-largest component).
    G = wholeGraph
    #G = cc[1]
    #print'----------------------User to Neighbors Degree--------------------------------------------------------------'
    #for node in G.nodes():
    #    if node.getNodeType() == USER:
    #        userToDegreeDict[node] = len(G.neighbors(node))
    #    else:
    #        businessToDegreeDict[node] = len(G.neighbors(node))
    #for user in userToDegreeDict.keys():
    #    print user.getId(),' ',userToDegreeDict[i]
    # userDegreeDistribution = [len(G.neighbors(node)) for node in G.nodes() if node.getNodeType() == SIAUtil.USER]
    #print userDegreeDistribution
    #print'----------------------Business to Neighbors Degree----------------------------------------------------------'
    #for business in businessToDegreeDict.keys():
    #    print business.getName(),' ',businessToDegreeDict[i]
    # businessDegreeDistribution = [len(G.neighbors(node)) for node in G.nodes() if node.getNodeType() == SIAUtil.PRODUCT]
    #print businessDegreeDistribution
    # print'----------------------Review Sentiment Distribution----------------------------------------------------------'
    # reviewSentimentDistribution = [ G.get_edge_data(*edge)[SIAUtil.REVIEW_EDGE_DICT_CONST].getRating()\
    #                                G.get_edge_data(*edge)[SIAUtil.REVIEW_EDGE_DICT_CONST].getReviewSentiment(),\
    #                                G.get_edge_data(*edge)[SIAUtil.REVIEW_EDGE_DICT_CONST].isRecommended())\
    #                                for edge in G.edges()]
    # print reviewSentimentDistribution
    # print '---------------------- Mean And Variance of the Distributions ----------------------------------------------------------'
    # print 'Average Size Of a Component - ', numpy.mean(numpy.array(lenListComponents)),'Variance Of Component Size - ', numpy.var(numpy.array(lenListComponents))
    # print 'Average Degree Of a User - ',numpy.mean(numpy.array(userDegreeDistribution)),'Variance Of User Degree - ', numpy.var(numpy.array(userDegreeDistribution))
    # print 'Average Degree Of a Business - ',numpy.mean(numpy.array(businessDegreeDistribution)),'Variance Of Business Degree - ', numpy.var(numpy.array(businessDegreeDistribution))
    print'------------------------------------------------------------------------------------------------------------'
    afterStatisticsGenerationTime = datetime.now()
    ##########################################################
    beforeLBPRunTime = datetime.now()
    lbp = LBP(G)
    # Sentiment split of all review edges before the algorithm runs.
    print 'positive parent_reviews', len([lbp.getEdgeDataForNodes(*edge)\
                                          for edge in G.edges()\
                                          if lbp.getEdgeDataForNodes(*edge).getReviewSentiment()\
                                          == SIAUtil.REVIEW_TYPE_POSITIVE])
    print 'Negative parent_reviews', len([lbp.getEdgeDataForNodes(*edge)\
                                          for edge in G.edges()\
                                          if lbp.getEdgeDataForNodes(*edge).getReviewSentiment()\
                                          == SIAUtil.REVIEW_TYPE_NEGATIVE])
    ##################ALGO_START################
    # One LBP iteration, then read out the tri-state classification of
    # users, products, and review edges.
    lbp.doBeliefPropagationIterative(1)
    (fakeUsers, honestUsers, unclassifiedUsers,\
     badProducts, goodProducts, unclassifiedProducts,\
     fakeReviewEdges, realReviewEdges, unclassifiedReviewEdges) = lbp.calculateBeliefVals()
    fakeReviews = [lbp.getEdgeDataForNodes(*edge) for edge in fakeReviewEdges]
    realReviews = [lbp.getEdgeDataForNodes(*edge) for edge in realReviewEdges]
    unclassifiedReviews = [lbp.getEdgeDataForNodes(*edge) for edge in unclassifiedReviewEdges]
    ##################ALGO_END################
    print 'fakeUsers=', len(fakeUsers)
    print 'honestUsers=', len(honestUsers)
    print 'unclassfiedUsers=', len(unclassifiedUsers)
    print 'goodProducts=', len(goodProducts)
    print 'badProducts=', len(badProducts)
    print 'unclassfiedProducts=', len(unclassifiedProducts)
    print 'fakeReviews=', len(fakeReviews)
    print 'realReviews=', len(realReviews)
    print 'unclassfiedReviews=', len(unclassifiedReviews)
    ##################Accuracy calculation#################
    # NOTE(review): isRecommended() is treated as the ground-truth label
    # here (recommended == real) -- confirm against the Review class.
    positiveReviewsInFakeReviews = [review for review in fakeReviews\
                                    if lbp.getEdgeDataForNodes(lbp.getUser(review.getUserId()),\
                                                               lbp.getBusiness(review.getBusinessID())).getReviewSentiment() \
                                    == SIAUtil.REVIEW_TYPE_POSITIVE]
    negativeReviewsInFakeReviews = [review for review in fakeReviews\
                                    if lbp.getEdgeDataForNodes(lbp.getUser(review.getUserId()),\
                                                               lbp.getBusiness(review.getBusinessID())).getReviewSentiment() \
                                    == SIAUtil.REVIEW_TYPE_NEGATIVE]
    realReviewsInFakeReviews = [review for review in fakeReviews\
                                if lbp.getEdgeDataForNodes(lbp.getUser(review.getUserId()),\
                                                           lbp.getBusiness(review.getBusinessID())).isRecommended()]
    fakeReviewsInRealReviews = [review for review in realReviews\
                                if not lbp.getEdgeDataForNodes(lbp.getUser(review.getUserId()),\
                                                               lbp.getBusiness(review.getBusinessID())).isRecommended()]
    unclassifiedFakeReviews = [review for review in unclassifiedReviews\
                               if not lbp.getEdgeDataForNodes(lbp.getUser(review.getUserId()),\
                                                              lbp.getBusiness(review.getBusinessID())).isRecommended()]
    unclassifiedRealReviews = [review for review in unclassifiedReviews\
                               if lbp.getEdgeDataForNodes(lbp.getUser(review.getUserId()),\
                                                          lbp.getBusiness(review.getBusinessID())).isRecommended()]
    print "Number of Positive Reviews in Fake Reviews", len(positiveReviewsInFakeReviews)
    print "Number of Negative Reviews in Fake Reviews", len(negativeReviewsInFakeReviews)
    print "Number of Real Reviews in Fake Reviews", len(realReviewsInFakeReviews)
    print "Number of Fake Reviews in Real Reviews", len(fakeReviewsInRealReviews)
    print "Number of Fake Reviews in Unclassified Reviews", len(unclassifiedFakeReviews)
    print "Number of Real Reviews in Unclassified Reviews", len(unclassifiedRealReviews)
    afterLBPRunTime = datetime.now()
    ###########################################################
    print'Graph Population time:', afterGraphPopulationTime-beforeGraphPopulationTime,\
     'Statistics Generation Time:', afterStatisticsGenerationTime-beforeStatisticsGenerationTime,\
     'Algo run Time:', afterLBPRunTime-beforeLBPRunTime