def main(argv):
    """Load the full Yelp dataset and dump a filtered subset per target city.

    Args:
        argv: command-line style argument list; every element after the
            first is treated as a target city name.
    """
    target_cities = argv[1:]

    dirname = "yelp_dataset_challenge_academic_dataset/"
    users_filepath = dirname + "yelp_academic_dataset_user.json"
    businesses_filepath = dirname + "yelp_academic_dataset_business.json"
    reviews_filepath = dirname + "yelp_academic_dataset_review.json"

    # Each progress message now names the file that is actually loaded next
    # (the user/business messages used to be swapped).
    print("loading " + businesses_filepath)
    business = loadData(businesses_filepath)
    print("loading " + users_filepath)
    user = loadData(users_filepath)
    print("loading " + reviews_filepath)
    review = loadData(reviews_filepath)

    # Category names handed to filter_and_dump together with each city.
    restaurant_category = set([
        'Food',
        'Restaurants',
        'Pizza',
        'Coffee & Tea',
        'Sandwiches',
        'Breakfast & Brunch',
        'Fast Food',
        'Bakeries',
    ])

    for target_city in target_cities:
        filter_and_dump(business, user, review, target_city, restaurant_category, dirname)
def main(argv):
    """Print every review of one hard-coded business, ordered by date.

    Args:
        argv: accepted for a conventional entry-point signature; not used.
    """
    reviews_filepath = "../data/yelp_academic_dataset_review.json"
    reviews = loadData(reviews_filepath)

    business_id = "vcNAWiLM4dR7D2nwwJ7nCA"
    target_reviews = [r for r in reviews if r["business_id"] == business_id]

    # Sort in place: the previous sorted(...) call threw its result away and
    # used the key "data" instead of the review's "date" field.
    target_reviews.sort(key=lambda r: r["date"])
    print(target_reviews)
def predict_stars_rw_ubcw(input_file_path, input_file_path1, output_file_path, output_file_path1, sign, threshold=None):
    """Predict stars of test businesses via personalized PageRank (user/business/category/word graph).

    A directed graph with USER#, BUSINESS#, CATEGORY# and WORD# nodes is built
    from the Phoenix subset of the Yelp dataset and from "Phoenix.csv".  For
    every test business a copy of the graph is made, review/word information
    of that business is (partly) hidden, PageRank personalized on the business
    is run, and the prediction is the rank-weighted mean of the stars of all
    other businesses.

    Args:
        input_file_path: text file with one test business id per line.
        input_file_path1: CSV with a "business_id" column plus one column per
            word; its rows replace the word weights of a test business when
            ``threshold`` is given.  Values may be blank or float-formatted;
            only the integer part is used.
        output_file_path: tab-separated output, one line per test business:
            id, actual stars, predicted stars.
        output_file_path1: CSV output with rows
            (test business node, other business node, PageRank score),
            ordered by descending score.
        sign: which reviews create user<->business edges: "pos" (stars above
            the reviewer's average), "neg" (below it) or "all".  Any other
            value creates no review edges.
        threshold: if None, all reviewer and word edges of the test business
            are removed.  Otherwise only the first ``threshold`` reviewers
            (in the order given by the external ``compare_date`` comparator)
            are kept and the word edges are rebuilt from ``input_file_path1``.

    Returns:
        None.  Results are written to the two output files and printed.
    """
    ##########################################
    ###  Load data (business, user, word)  ###
    ##########################################
    print("load yelp dataset...")
    dirname = "yelp_dataset_challenge_academic_dataset/"
    business = loadData(dirname + "yelp_academic_dataset_business_Phoenix.json")
    user = loadData(dirname + "yelp_academic_dataset_user_Phoenix.json")
    review = loadData(dirname + "yelp_academic_dataset_review_Phoenix.json")

    print("load business features...")
    word = {}
    with open("Phoenix.csv", "rb") as csvfile:
        for row in csv.DictReader(csvfile):
            raw_business_id = row.pop("business_id")
            word[raw_business_id] = dict((k, int(v)) for k, v in row.iteritems())

    word_topk = {}
    with open(input_file_path1, "rb") as csvfile:
        for row in csv.DictReader(csvfile):
            raw_business_id = row.pop("business_id")
            values = {}
            for k, v in row.iteritems():
                # Keep only the integer part; blank cells count as 0.
                whole = v.strip().split(".")[0]
                values[k] = int(whole) if whole else 0
            word_topk[raw_business_id] = values

    ##########################################
    ###  Build graph                       ###
    ##########################################
    print("build graph...")
    G = nx.DiGraph()

    print(" add nodes...")
    # Node types: user, business (carries its stars), category, word.
    G.add_nodes_from([("USER#" + u["user_id"], {"type": "user"}) for u in user])
    G.add_nodes_from([
        ("BUSINESS#" + b["business_id"], {"type": "business", "stars": b["stars"]})
        for b in business
    ])
    category_set = set()
    for b in business:
        category_set.update(b["categories"])
    G.add_nodes_from([("CATEGORY#" + c, {"type": "category"}) for c in category_set])
    # The word vocabulary is the column set of the first loaded CSV row.
    word_list = list(next(word.itervalues()))
    G.add_nodes_from([("WORD#" + w, {"type": "word"}) for w in word_list])

    print(" add edges...")
    # Edge types (all weighted):
    #   user<->business (review), business<->word, business<->category,
    #   user->user (friendship).
    # All ratios use explicit float division so the weights do not silently
    # truncate to 0 under Python 2 integer division.

    user_average_stars = {}
    for u in user:
        user_average_stars["USER#" + u["user_id"]] = u["average_stars"]

    def review_selected(r, user_id):
        # The average is looked up only when needed, so sign == "all" works
        # even for reviewers missing from the user file.
        if sign == "all":
            return True
        if sign == "pos":
            return r["stars"] > user_average_stars[user_id]
        if sign == "neg":
            return r["stars"] < user_average_stars[user_id]
        return False

    # A user can write more than one review on a business: each review is
    # counted in the normalisation, while the graph keeps a single edge whose
    # date is that of the last such review.
    user_business_edges = {}
    business_user_edges = {}
    reviewers_of = {}
    for r in review:
        business_id = "BUSINESS#" + r["business_id"]
        user_id = "USER#" + r["user_id"]
        reviewers_of.setdefault(business_id, set()).add(user_id)
        if review_selected(r, user_id):
            user_business_edges.setdefault(user_id, []).append((business_id, r['date']))
            business_user_edges.setdefault(business_id, []).append((user_id, r['date']))
    for user_id, targets in user_business_edges.iteritems():
        for business_id, date in targets:
            G.add_edge(user_id, business_id, {"weight": 1.0 / len(targets), "date": date})
    for business_id, targets in business_user_edges.iteritems():
        for user_id, date in targets:
            G.add_edge(business_id, user_id, {"weight": 1.0 / len(targets), "date": date})

    # business->word (share of the business' word total), word->business (1)
    for b in business:
        business_id = "BUSINESS#" + b["business_id"]
        word_values = word[b["business_id"]]
        total = sum(word_values.values())
        for k, v in word_values.iteritems():
            word_id = "WORD#" + k
            # Guard against a business whose word values are all zero.
            w = float(v) / total if total > 0 else 0
            if w > 0:
                G.add_edge(business_id, word_id, {"weight": w})
                G.add_edge(word_id, business_id, {"weight": 1})

    # business->category, category->business
    category_business_edges = {}
    for b in business:
        business_id = "BUSINESS#" + b["business_id"]
        for c in b["categories"]:
            category_id = "CATEGORY#" + c
            G.add_edge(business_id, category_id, {"weight": 1.0 / len(b["categories"])})
            category_business_edges.setdefault(category_id, []).append(business_id)
    for category_id, members in category_business_edges.iteritems():
        for business_id in members:
            G.add_edge(category_id, business_id, {"weight": 1.0 / len(members)})

    # user1->user2 friendship edges
    for u in user:
        user1_id = "USER#" + u["user_id"]
        number_of_friends = len(u["friends"])
        for f in u["friends"]:
            G.add_edge(user1_id, "USER#" + f, {"weight": 1.0 / number_of_friends})

    #################################################
    ###  Test (Set a business to "cold start")    ###
    #################################################
    print("read business stars...")
    business_stars = {}
    for b in business:
        business_stars["BUSINESS#" + b["business_id"]] = b["stars"]

    print("read test business list from " + input_file_path + "...")
    with open(input_file_path) as f_r:
        # Strip only the line terminator (a final line without a newline used
        # to lose its last character) and ignore blank lines.
        stripped_lines = [line.rstrip("\r\n") for line in f_r]
    test_business_list = [line for line in stripped_lines if line]

    # Both outputs are closed even if a PageRank run raises.
    with open(output_file_path1, 'w') as f_pr, open(output_file_path, "w") as f_w:
        csv_writer = csv.writer(f_pr)
        for test_business_original_id in test_business_list:
            test_business_id = "BUSINESS#" + test_business_original_id
            print("\ntest business: " + test_business_original_id)

            print(" create graph and remove review edges on test business...")
            G_new = nx.DiGraph(G)

            # Every possible word edge of the test business, in both
            # directions; removing an edge that does not exist is a no-op.
            word_edge_list = []
            for w in word_list:
                word_id = "WORD#" + w
                word_edge_list.append((test_business_id, word_id))
                word_edge_list.append((word_id, test_business_id))

            if threshold is not None:
                print(" keep only business-user edges for the first k reviews...")
                test_business_edges = G_new.edge[test_business_id]
                review_user_list = [
                    [node_id, test_business_edges[node_id]['date']]
                    for node_id in test_business_edges
                    if node_id.startswith("USER#")
                ]
                review_user_list = sorted(
                    review_user_list, cmp=lambda x, y: compare_date(x[1], y[1]))
                remove_edge_list = []
                for user_id, _ in review_user_list[threshold:]:
                    remove_edge_list.append((user_id, test_business_id))
                    remove_edge_list.append((test_business_id, user_id))
                G_new.remove_edges_from(remove_edge_list)

                print(" remove all business-word edges on test business...")
                G_new.remove_edges_from(word_edge_list)

                print(" add business-word edges for the first k reviews...")
                topk_values = word_topk[test_business_original_id]
                total = sum(topk_values.values())
                for k, v in topk_values.iteritems():
                    word_id = "WORD#" + k
                    w = float(v) / total if total > 0 else 0
                    if w > 0:
                        G_new.add_edge(test_business_id, word_id, {"weight": w})
                        G_new.add_edge(word_id, test_business_id, {"weight": 1})
            else:
                print(" remove all business-user edges on test business...")
                remove_edge_list = []
                for user_id in reviewers_of.get(test_business_id, ()):
                    remove_edge_list.append((user_id, test_business_id))
                    remove_edge_list.append((test_business_id, user_id))
                G_new.remove_edges_from(remove_edge_list)

                print(" remove all business-word edges on test business...")
                G_new.remove_edges_from(word_edge_list)

            # Personalization vector: all restart mass on the test business.
            personalization_dict = dict.fromkeys(G_new.nodes(), 0)
            personalization_dict[test_business_id] = 1

            print(" run PageRank...")
            pr = nx.pagerank(G_new, alpha=0.85, personalization=personalization_dict,
                             max_iter=300, tol=1e-06, nstart=None, weight='weight',
                             dangling=None)

            print(" calculate and output results to " + output_file_path + "...")
            weighted_stars_sum = 0
            weight_sum = 0
            for node_id in pr:
                if node_id.startswith('BUSINESS#') and node_id != test_business_id:
                    weighted_stars_sum += pr[node_id] * business_stars[node_id]
                    weight_sum += pr[node_id]
            # NOTE: raises ZeroDivisionError if no other business got any rank.
            predicted_stars = weighted_stars_sum / weight_sum

            result_line = test_business_original_id + "\t" + \
                str(business_stars[test_business_id]) + "\t" + \
                str(predicted_stars)
            print(result_line)
            f_w.write(result_line + "\n")

            pr_list = sorted([(k, pr[k]) for k in pr], key=lambda x: -x[1])
            for node_id, score in pr_list:
                if node_id.startswith('BUSINESS#') and node_id != test_business_id:
                    csv_writer.writerow([test_business_id, node_id, score])
    return
# NOTE(review): this fragment reads `input_file_path`, which is not defined in
# the visible source; it looks like the body of a function whose `def` line is
# missing -- confirm against the original file before relying on it.
k = 10000
# Weight per star rating, written as 1 / <integer>.
# NOTE(review): under Python 2 integer division every entry evaluates to 0
# unless true division is enabled for this module -- TODO confirm.
w = {
    1.0: 1 / 11,
    1.5: 1 / 66,
    2.0: 1 / 146,
    2.5: 1 / 332,
    3.0: 1 / 564,
    3.5: 1 / 868,
    4.0: 1 / 880,
    4.5: 1 / 425,
    5.0: 1 / 97,
}
dirname = "yelp_dataset_challenge_academic_dataset/"
business_filename = dirname + "yelp_academic_dataset_business_Phoenix.json"
business = loadData(business_filename)
# Map "BUSINESS#<id>" -> stars of that business.
business_stars = {}
for b in business:
    business_id = "BUSINESS#" + b["business_id"]
    business_stars[business_id] = b["stars"]
# Group the CSV rows by their first column: first column -> list of
# (second column, third column) tuples.
# presumably rows are (test business, other business, PageRank score) as
# written by the predict_stars_rw_* functions -- verify against the producer.
test_business = {}
with open(input_file_path, 'r') as csv_file:
    csv_reader = csv.reader(csv_file)
    for row in csv_reader:
        b1 = row[0]
        b2 = row[1]
        pr = row[2]  # still a string here; csv.reader does no numeric conversion
        test_business[b1] = test_business.get(b1, [])
        test_business[b1].append((b2, pr))
def predict_stars_rw_ubc(input_file_path, output_file_path, output_file_path1, sign, threshold, normalize_edge=True):
    """Predict stars of test businesses via personalized PageRank (user/business/category graph).

    A directed graph with USER#, BUSINESS# and CATEGORY# nodes is built from
    the Phoenix subset of the Yelp dataset.  For every test business a copy of
    the graph is made, its review edges are (partly) removed, PageRank
    personalized on the business is run, and the prediction is the
    rank-weighted mean of the stars of all other businesses.

    Args:
        input_file_path: text file with one test business id per line.
        output_file_path: tab-separated output, one line per test business:
            id, actual stars, predicted stars.
        output_file_path1: CSV output with rows
            (test business node, other business node, PageRank score),
            ordered by descending score.
        sign: which reviews create user<->business edges: "pos" (stars above
            the reviewer's average), "neg" (below it) or "all".  Any other
            value creates no review edges.
        threshold: if None, all review edges of the test business are
            removed.  Otherwise only the first ``threshold`` reviewers (in
            the order given by the external ``compare_date`` comparator) are
            kept.
        normalize_edge: when True (the default, and the previously hard-coded
            behaviour) edge weights are normalised per source node as
            sketched below; when False every edge has weight 1.

    Returns:
        None.  Results are written to the two output files and printed.
    """
    ##########################################
    ### Load data (business, user, review) ###
    ##########################################
    print("load yelp dataset...")
    dirname = "yelp_dataset_challenge_academic_dataset/"
    business = loadData(dirname + "yelp_academic_dataset_business_Phoenix.json")
    user = loadData(dirname + "yelp_academic_dataset_user_Phoenix.json")
    review = loadData(dirname + "yelp_academic_dataset_review_Phoenix.json")

    ##########################################
    ###  Build graph                       ###
    ##########################################
    print("build graph...")
    G = nx.DiGraph()

    print(" add nodes...")
    # Node types: user, business (carries its stars), category.
    G.add_nodes_from([("USER#" + u["user_id"], {"type": "user"}) for u in user])
    G.add_nodes_from([
        ("BUSINESS#" + b["business_id"], {"type": "business", "stars": b["stars"]})
        for b in business
    ])
    category_set = set()
    for b in business:
        category_set.update(b["categories"])
    G.add_nodes_from([("CATEGORY#" + c, {"type": "category"}) for c in category_set])

    print(" add edges...")
    # business: business->user, business->category
    # user:     user->business, user->user
    # category: category->business
    #
    # Normalised weights give each edge family of a node a fixed share:
    #
    #   (0.5)       (0.5)
    #      \         /
    #   User - Business - Category
    #      /         \
    #
    #              (1)
    #                \
    #   Business - Category
    #                /
    #
    #   (0.5)   (0.5)
    #      \     /
    #   User - User - Business
    #      /     \
    if normalize_edge:
        print(" (normalize edges: YES)")
    else:
        print(" (normalize edges: NO)")

    def edge_weight(share, fan_out):
        # `share` of the node's weight split evenly over `fan_out` edges,
        # or a flat weight of 1 when normalisation is off.
        return share / fan_out if normalize_edge else 1

    # business->category, category->business
    category_business_edges = {}
    for b in business:
        business_id = "BUSINESS#" + b["business_id"]
        for c in b["categories"]:
            category_id = "CATEGORY#" + c
            G.add_edge(business_id, category_id,
                       {"weight": edge_weight(0.5, len(b["categories"]))})
            category_business_edges.setdefault(category_id, []).append(business_id)
    for category_id, members in category_business_edges.iteritems():
        for business_id in members:
            G.add_edge(category_id, business_id,
                       {"weight": edge_weight(1.0, len(members))})

    user_average_stars = {}
    for u in user:
        user_average_stars["USER#" + u["user_id"]] = u["average_stars"]

    def review_selected(r, user_id):
        # The average is looked up only when needed, so sign == "all" works
        # even for reviewers missing from the user file.
        if sign == 'all':
            return True
        if sign == 'pos':
            return r["stars"] > user_average_stars[user_id]
        if sign == 'neg':
            return r["stars"] < user_average_stars[user_id]
        return False

    # user->business and business->user review edges.
    # A user can write more than one review on a business: each review is
    # counted in the normalisation, while the graph keeps a single edge whose
    # date is that of the last such review.
    user_business_edges = {}
    business_user_edges = {}
    reviewers_of = {}
    for r in review:
        user_id = "USER#" + r["user_id"]
        business_id = "BUSINESS#" + r["business_id"]
        reviewers_of.setdefault(business_id, set()).add(user_id)
        if review_selected(r, user_id):
            user_business_edges.setdefault(user_id, []).append((business_id, r['date']))
            business_user_edges.setdefault(business_id, []).append((user_id, r['date']))
    # The date is stored in both modes because the threshold handling below
    # reads it from the business->user edges.
    for user_id, targets in user_business_edges.iteritems():
        for business_id, date in targets:
            G.add_edge(user_id, business_id,
                       {"weight": edge_weight(0.5, len(targets)), "date": date})
    for business_id, targets in business_user_edges.iteritems():
        for user_id, date in targets:
            G.add_edge(business_id, user_id,
                       {"weight": edge_weight(0.5, len(targets)), "date": date})

    # user->user friendship edges
    for u in user:
        user1_id = "USER#" + u["user_id"]
        for f in u["friends"]:
            G.add_edge(user1_id, "USER#" + f,
                       {"weight": edge_weight(0.5, len(u["friends"]))})

    #################################################
    ###  Test (Set a business to "cold start")    ###
    #################################################
    print("read business stars...")
    business_stars = {}
    for b in business:
        business_stars["BUSINESS#" + b["business_id"]] = b["stars"]

    print("read test business list from " + input_file_path + "...")
    with open(input_file_path) as f_r:
        # Strip only the line terminator (a final line without a newline used
        # to lose its last character) and ignore blank lines.
        stripped_lines = [line.rstrip("\r\n") for line in f_r]
    test_business_list = [line for line in stripped_lines if line]

    # Both outputs are closed even if a PageRank run raises.
    with open(output_file_path1, 'w') as f_pr, open(output_file_path, "w") as f_w:
        csv_writer = csv.writer(f_pr)
        for test_business_original_id in test_business_list:
            test_business_id = "BUSINESS#" + test_business_original_id
            print("\ntest business: " + test_business_original_id)

            print(" create new graph")
            G_new = nx.DiGraph(G)

            remove_edge_list = []
            if threshold is not None:
                print(" keep the first k reviews on test business...")
                test_business_edges = G_new.edge[test_business_id]
                review_user_list = [
                    [node_id, test_business_edges[node_id]['date']]
                    for node_id in test_business_edges
                    if node_id.startswith("USER#")
                ]
                review_user_list = sorted(
                    review_user_list, cmp=lambda x, y: compare_date(x[1], y[1]))
                for user_id, _ in review_user_list[threshold:]:
                    remove_edge_list.append((user_id, test_business_id))
                    remove_edge_list.append((test_business_id, user_id))
            else:
                print(" remove all review edges on test business...")
                for user_id in reviewers_of.get(test_business_id, ()):
                    remove_edge_list.append((user_id, test_business_id))
                    remove_edge_list.append((test_business_id, user_id))
            G_new.remove_edges_from(remove_edge_list)

            # Personalization vector: all restart mass on the test business.
            personalization_dict = dict.fromkeys(G_new.nodes(), 0)
            personalization_dict[test_business_id] = 1

            print(" run PageRank...")
            pr = nx.pagerank(G_new, alpha=0.85, personalization=personalization_dict,
                             max_iter=300, tol=1e-06, nstart=None, weight='weight',
                             dangling=None)

            print(" calculate and output results to " + output_file_path + "...")
            weighted_stars_sum = 0
            weight_sum = 0
            for node_id in pr:
                if node_id.startswith('BUSINESS#') and node_id != test_business_id:
                    weighted_stars_sum += pr[node_id] * business_stars[node_id]
                    weight_sum += pr[node_id]
            # NOTE: raises ZeroDivisionError if no other business got any rank.
            predicted_stars = weighted_stars_sum / weight_sum

            result_line = test_business_original_id + "\t" + \
                str(business_stars[test_business_id]) + "\t" + \
                str(predicted_stars)
            print(result_line)
            f_w.write(result_line + "\n")

            pr_list = sorted([(k, pr[k]) for k in pr], key=lambda x: -x[1])
            for node_id, score in pr_list:
                if node_id.startswith('BUSINESS#') and node_id != test_business_id:
                    csv_writer.writerow([test_business_id, node_id, score])
def predict_stars_rw_ubw(input_file_path, output_file_path, output_file_path1='output.phoenix.dev.ubw.pr.txt'):
    """Predict stars of test businesses via personalized PageRank (user/business/word graph).

    A directed graph with USER#, BUSINESS# and WORD# nodes is built from the
    Phoenix subset of the Yelp dataset and from "Phoenix.csv".  For every test
    business a copy of the graph is made, all its review edges are removed,
    PageRank personalized on the business is run, and the prediction is the
    rank-weighted mean of the stars of all other businesses.

    Args:
        input_file_path: text file with one test business id per line.
        output_file_path: tab-separated output, one line per test business:
            id, actual stars, predicted stars.
        output_file_path1: CSV output with rows
            (test business node, other business node, PageRank score),
            ordered by descending score.  Defaults to the path that used to
            be hard-coded.

    Besides the two outputs above, one file
    "output_word/output.<business id>.txt" is written per test business with
    lines "<other business id>, <score>, <stars>"; the "output_word"
    directory is not created here.

    Returns:
        None.
    """
    ##########################################
    ###  Load data (business, user, word)  ###
    ##########################################
    print("load yelp dataset...")
    dirname = "yelp_dataset_challenge_academic_dataset/"
    business = loadData(dirname + "yelp_academic_dataset_business_Phoenix.json")
    user = loadData(dirname + "yelp_academic_dataset_user_Phoenix.json")
    review = loadData(dirname + "yelp_academic_dataset_review_Phoenix.json")

    word = {}
    with open("Phoenix.csv", "rb") as csvfile:
        for row in csv.DictReader(csvfile):
            raw_business_id = row.pop("business_id")
            word[raw_business_id] = dict((k, int(v)) for k, v in row.iteritems())

    ##########################################
    ###  Build graph                       ###
    ##########################################
    print("build graph...")
    G = nx.DiGraph()

    print(" add nodes...")
    # Node types: user, business (carries its stars), word.
    G.add_nodes_from([("USER#" + u["user_id"], {"type": "user"}) for u in user])
    G.add_nodes_from([
        ("BUSINESS#" + b["business_id"], {"type": "business", "stars": b["stars"]})
        for b in business
    ])
    # The word vocabulary is the column set of the first loaded CSV row.
    word_list = list(next(word.itervalues()))
    G.add_nodes_from([("WORD#" + w, {"type": "word"}) for w in word_list])

    print(" add edges...")
    # Edge types (all weighted):
    #   user<->business (positive reviews), business<->word,
    #   user->user (friendship).
    # All ratios use explicit float division so the weights do not silently
    # truncate to 0 under Python 2 integer division.

    # Review counts per business and per user; every review is counted, not
    # only the positive ones that become edges.
    business_review_number = {}
    user_review_number = {}
    reviewers_of = {}
    for r in review:
        business_id = "BUSINESS#" + r["business_id"]
        user_id = "USER#" + r["user_id"]
        business_review_number[business_id] = business_review_number.get(business_id, 0) + 1
        user_review_number[user_id] = user_review_number.get(user_id, 0) + 1
        reviewers_of.setdefault(business_id, set()).add(user_id)

    user_average_stars = {}
    for u in user:
        user_average_stars["USER#" + u["user_id"]] = u["average_stars"]

    # Positive review edges: stars above the reviewer's own average.
    for r in review:
        user_id = "USER#" + r["user_id"]
        business_id = "BUSINESS#" + r["business_id"]
        if r["stars"] > user_average_stars[user_id]:
            G.add_edge(user_id, business_id,
                       {"weight": 1.0 / user_review_number[user_id]})
            G.add_edge(business_id, user_id,
                       {"weight": 1.0 / business_review_number[business_id]})

    # business->word (share of the business' word total), word->business (1)
    for b in business:
        business_id = "BUSINESS#" + b["business_id"]
        word_values = word[b["business_id"]]
        total = sum(word_values.values())
        for k, v in word_values.iteritems():
            word_id = "WORD#" + k
            # Guard against a business whose word values are all zero.
            w = float(v) / total if total > 0 else 0
            if w > 0:
                G.add_edge(business_id, word_id, {"weight": w})
                G.add_edge(word_id, business_id, {"weight": 1})

    # user1->user2 friendship edges
    for u in user:
        user1_id = "USER#" + u["user_id"]
        number_of_friends = len(u["friends"])
        for f in u["friends"]:
            G.add_edge(user1_id, "USER#" + f, {"weight": 1.0 / number_of_friends})

    #################################################
    ###  Test (Set a business to "cold start")    ###
    #################################################
    print("read business stars...")
    business_stars = {}
    for b in business:
        business_stars["BUSINESS#" + b["business_id"]] = b["stars"]

    print("read test business list from " + input_file_path + "...")
    with open(input_file_path) as f_r:
        # Strip only the line terminator (a final line without a newline used
        # to lose its last character) and ignore blank lines.
        stripped_lines = [line.rstrip("\r\n") for line in f_r]
    test_business_list = [line for line in stripped_lines if line]

    # Both outputs are closed even if a PageRank run raises.
    with open(output_file_path1, 'w') as f_pr, open(output_file_path, "w") as f_w:
        csv_writer = csv.writer(f_pr)
        for test_business_original_id in test_business_list:
            test_business_id = "BUSINESS#" + test_business_original_id
            print("\ntest business: " + test_business_original_id)

            print(" create graph and remove review edges on test business...")
            G_new = nx.DiGraph(G)
            remove_edge_list = []
            for user_id in reviewers_of.get(test_business_id, ()):
                remove_edge_list.append((user_id, test_business_id))
                remove_edge_list.append((test_business_id, user_id))
            G_new.remove_edges_from(remove_edge_list)

            # Personalization vector: all restart mass on the test business.
            personalization_dict = dict.fromkeys(G_new.nodes(), 0)
            personalization_dict[test_business_id] = 1

            print(" run PageRank...")
            pr = nx.pagerank(G_new, alpha=0.85, personalization=personalization_dict,
                             max_iter=300, tol=1e-06, nstart=None, weight='weight',
                             dangling=None)

            print(" calculate and output results to " + output_file_path + "...")
            # Other businesses only, highest rank first; reused for both the
            # per-business detail file and the CSV rows.
            pr_list = sorted([(k, pr[k]) for k in pr], key=lambda x: -x[1])
            other_businesses = [
                (node_id, score) for node_id, score in pr_list
                if node_id.startswith('BUSINESS#') and node_id != test_business_id
            ]

            with open("output_word/output." + test_business_original_id + ".txt", "w") as outfile:
                for node_id, score in other_businesses:
                    outfile.write(node_id.replace('BUSINESS#', '') + ", " + str(score) +
                                  ", " + str(business_stars[node_id]) + '\n')

            weighted_stars_sum = 0
            weight_sum = 0
            for node_id in pr:
                if node_id.startswith('BUSINESS#') and node_id != test_business_id:
                    weighted_stars_sum += pr[node_id] * business_stars[node_id]
                    weight_sum += pr[node_id]
            # NOTE: raises ZeroDivisionError if no other business got any rank.
            predicted_stars = weighted_stars_sum / weight_sum

            result_line = test_business_original_id + "\t" + \
                str(business_stars[test_business_id]) + "\t" + \
                str(predicted_stars)
            print(result_line)
            f_w.write(result_line + "\n")

            for node_id, score in other_businesses:
                csv_writer.writerow([test_business_id, node_id, score])
    return