def user_attributes(user_reviews, business_list=None):
    """Count the attributes set to 'True' on the businesses a user reviewed.

    Args:
        user_reviews: Collection keyed by business_id. Only its keys and its
            length are used here.
        business_list: Optional iterable of business records, each a dict
            with 'business_id' and 'attributes' entries. When omitted,
            business.json is loaded from ``rootdir`` via ``json_load.convert``.

    Returns:
        dict mapping attribute name -> (count, count / len(user_reviews)),
        ordered from the highest share to the lowest. Attributes with an
        equal share keep the order in which they were first encountered.
    """
    if business_list is None:
        business_list = json_load.convert(rootdir, 'business.json')

    # A set gives O(1) membership tests while scanning every business.
    reviewed_ids = set(user_reviews)

    counts = {}
    for business in business_list:
        if business['business_id'] not in reviewed_ids:
            continue
        attributes = business['attributes']
        if attributes is None:
            continue
        for name, value in attributes.items():
            # Attribute values are compared as strings; only 'True' counts.
            if value == 'True':
                counts[name] = counts.get(name, 0) + 1

    # counts is only non-empty when user_reviews is non-empty, so the
    # division below cannot hit a zero denominator.
    total = len(user_reviews)
    scored = {name: (count, count / total) for name, count in counts.items()}
    return dict(
        sorted(scored.items(), key=lambda item: item[1][1], reverse=True))
def attribute_matrix_nw(relevant_companies, user_attributes,
                        business_list=None):
    """Build a company x attribute 0/1 matrix.

    Args:
        relevant_companies: Iterable of business ids; duplicates are
            collapsed into a single row each.
        user_attributes: Iterable of attribute names (e.g. a dict keyed by
            attribute); each becomes one column.
        business_list: Optional iterable of business records, each a dict
            with 'business_id' and 'attributes' entries. When omitted,
            business.json is loaded from ``rootdir`` via ``json_load.convert``.

    Returns:
        pandas.DataFrame indexed by business id. A cell is 1 when the
        business has that attribute set to the string 'True', otherwise 0.
        Rows for ids that never appear in the business records stay NaN.
    """
    if business_list is None:
        business_list = json_load.convert(rootdir, 'business.json')

    matrix = pd.DataFrame(index=list(set(relevant_companies)),
                          columns=list(user_attributes))

    for business in business_list:
        business_id = business['business_id']
        if business_id not in matrix.index:
            continue
        # A business without attributes gets an all-zero row.
        attributes = business['attributes'] or {}
        for attribute in matrix.columns:
            # .at writes straight into `matrix`; chained indexing
            # (matrix[col][row] = ...) may assign to a temporary copy.
            matrix.at[business_id, attribute] = (
                1 if attributes.get(attribute) == 'True' else 0)
    return matrix
def relevant_companies(user_reviews, business_list=None):
    """List the businesses that share a category with the user's reviews.

    Args:
        user_reviews: Mapping whose values are sequences with the list of
            category names at index 1.
        business_list: Optional iterable of business records, each a dict
            with 'business_id' and 'categories' entries. When omitted,
            business.json is loaded from ``rootdir`` via ``json_load.convert``.

    Returns:
        list of business ids. An id is appended once for every distinct
        user category found in the business's 'categories' value, so a
        business matching several categories appears several times.
        Businesses whose 'categories' is None are skipped.
    """
    if business_list is None:
        business_list = json_load.convert(rootdir, 'business.json')

    category_set = set()
    for entry in user_reviews.values():
        category_set.update(entry[1])

    company_list = []
    for business in business_list:
        categories = business["categories"]
        # Checked once per business instead of once per category.
        if categories is None:
            continue
        # NOTE(review): `in` is a substring test when 'categories' is a
        # string, so e.g. "Bars" also matches "Sushi Bars" - confirm intended.
        hits = sum(1 for category in category_set if category in categories)
        company_list.extend([business["business_id"]] * hits)
    return company_list
def user_reviews(user, data, business_list=None):
    """Collect a user's positive reviews together with business categories.

    Args:
        user: The user_id whose reviews are wanted.
        data: Iterable of review records, each a dict with 'user_id',
            'business_id' and 'stars' entries.
        business_list: Optional iterable of business records, each a dict
            with 'business_id' and 'categories' entries. When omitted,
            business.json is loaded from ``rootdir`` via ``json_load.convert``.

    Returns:
        dict mapping business_id -> (stars, [category, ...]) for every review
        by `user` with at least 3 stars. Categories are the comma-separated
        entries of the business's 'categories' value with surrounding
        whitespace removed; a business with no categories gets an empty list.
        If the user reviewed a business more than once, the last review wins.

    Raises:
        KeyError: If a selected review refers to a business_id that is not
            present in the business records.
    """
    if business_list is None:
        business_list = json_load.convert(rootdir, 'business.json')

    categories_by_business = {
        business["business_id"]: business["categories"]
        for business in business_list
    }

    selected = {}
    for review in data:
        if review['user_id'] == user and review['stars'] >= 3:
            raw = categories_by_business[review['business_id']]
            # 'categories' can be None; calling .split on it would raise.
            # Empty tokens are dropped because an empty category name would
            # substring-match every business later on.
            categories = (
                [part.strip() for part in raw.split(',') if part.strip()]
                if raw else [])
            selected[review['business_id']] = (review['stars'], categories)
    return selected
def random_recomendations(company_dict, user_reviews, business_list=None):
    """Pick up to 20 random companies after star-weighting their scores.

    Args:
        company_dict: Mapping business_id -> score. It is updated in place
            when weighting applies (see below).
        user_reviews: Sized collection of the user's reviews; only its
            length is used.
        business_list: Optional iterable of business records, each a dict
            with 'business_id' and 'stars' entries. When omitted and
            weighting applies, business.json is loaded from ``rootdir`` via
            ``json_load.convert``.

    Returns:
        dict with min(20, len(company_dict)) distinct, randomly chosen
        entries of `company_dict`. An empty `company_dict` yields {}.

    Weighting: with more than 9 reviews each score becomes
    (score * 0.7) * (stars * 0.3); with more than 4 reviews it becomes
    (score * 0.5) * (stars * 0.5); otherwise scores are left untouched.
    NOTE(review): the two weighted terms are multiplied, not added -
    confirm that this is the intended formula.
    """
    review_count = len(user_reviews)
    if review_count > 9:
        weights = (0.7, 0.3)
    elif review_count > 4:
        weights = (0.5, 0.5)
    else:
        weights = None

    # The business records are only needed when a weighting applies.
    if weights is not None:
        score_weight, star_weight = weights
        if business_list is None:
            business_list = json_load.convert(rootdir, 'business.json')
        for company in business_list:
            business_id = company['business_id']
            if business_id in company_dict:
                company_dict[business_id] = (
                    company_dict[business_id] * score_weight) * (
                        company['stars'] * star_weight)

    # Sample without replacement: repeated random.choice calls could pick
    # the same company twice (returning fewer than 20) and fail on an
    # empty dict.
    candidates = list(company_dict)
    chosen = random.sample(candidates, min(20, len(candidates)))
    return {company: company_dict[company] for company in chosen}
def user_recomendations(company_dict, user_reviews, business_list=None):
    """Return the 20 best-scoring companies after star-weighting the scores.

    Args:
        company_dict: Mapping business_id -> score. It is updated in place
            when weighting applies (see below).
        user_reviews: Sized collection of the user's reviews; only its
            length is used.
        business_list: Optional iterable of business records, each a dict
            with 'business_id' and 'stars' entries. When omitted and
            weighting applies, business.json is loaded from ``rootdir`` via
            ``json_load.convert``.

    Returns:
        dict with at most 20 entries of `company_dict`, ordered from the
        highest score to the lowest. Equal scores keep their original
        relative order.

    Weighting: with more than 9 reviews each score becomes
    (score * 0.7) * (stars * 0.3); with more than 4 reviews it becomes
    (score * 0.5) * (stars * 0.5); otherwise scores are left untouched.
    NOTE(review): the two weighted terms are multiplied, not added -
    confirm that this is the intended formula.
    """
    review_count = len(user_reviews)
    if review_count > 9:
        weights = (0.7, 0.3)
    elif review_count > 4:
        weights = (0.5, 0.5)
    else:
        weights = None

    # The business records are only needed when a weighting applies.
    if weights is not None:
        score_weight, star_weight = weights
        if business_list is None:
            business_list = json_load.convert(rootdir, 'business.json')
        for company in business_list:
            business_id = company['business_id']
            if business_id in company_dict:
                company_dict[business_id] = (
                    company_dict[business_id] * score_weight) * (
                        company['stars'] * star_weight)

    ranked = sorted(
        company_dict.items(), key=lambda item: item[1], reverse=True)
    return dict(ranked[:20])
def user_list():
    """Return the 'user_id' of every record in user.json under ``rootdir``."""
    user_ids = []
    for record in json_load.convert(rootdir, 'user.json'):
        user_ids.append(record['user_id'])
    return user_ids
import collections
import itertools
import json
import os
import random
import statistics

import numpy as np
import pandas as pd

import helperfunctions as hf
import json_load

# Directory that holds the JSON data files read by this module.
rootdir = './data'

# Load every review and split it into a training and a test DataFrame.
df_training, df_test = hf.split_data(
    pd.DataFrame(json_load.convert(rootdir, 'review.json')))

# Round-trip through JSON text to obtain plain lists of record dicts.
training_json = df_training.to_json(orient='records')
test_json = df_test.to_json(orient='records')
trainingdata = json.loads(training_json)
testdata = json.loads(test_json)


def user_list():
    """Return the 'user_id' of every record in user.json under ``rootdir``."""
    return [
        user['user_id'] for user in json_load.convert(rootdir, 'user.json')
    ]


# Returns a dict with review scores from the user
import json
import os

import matplotlib.pyplot as plt
import numpy as np

import json_load

rootdir = './data'
file = 'user.json'
json_data = json_load.convert(rootdir, file)

# Create feature plot.
# Map each average star rating (rounded to one decimal) to a 'useful' count.
# Records sharing the same rounded rating overwrite each other, so the last
# record read for a rating determines the plotted value.
review = {
    round(record["average_stars"], 1): record['useful']
    for record in json_data
}

# Sort by rating; this yields a list of (rating, useful) tuples.
lists = sorted(review.items())
# Unpack the list of pairs into one tuple of ratings and one of counts.
x, y = zip(*lists)

plt.plot(x, y)
plt.title('Ten cities\nUsefulness based on average stars\ntested on')
plt.xlabel('Average stars')
plt.ylabel('Noted as useful')
plt.savefig('./plots/user_stars_usefull.png')
plt.show()
userId = "t-nB38eHbeFuabXBdJMwvg" skip = False picklesFound = hf.checkForPickle('./pickles') if(picklesFound == True): answer = input('Already found matrices. Do you want to use these? Type yes/no\n') if(answer.lower() == 'yes'): skip = True else: print('No pickle files found that contain matrices. Creating new matrices now..............') skip = False df_training = None df_test = None if (skip == False): data = pd.DataFrame(json_load.convert('./data', 'review.json')) data.drop_duplicates(subset =["business_id","user_id"], keep = False, inplace = True) df_training, df_test = hf.split_data(data) df_training.to_pickle('./pickles/df_training.pkl') df_test.to_pickle('./pickles/df_test.pkl') print('trainingset' + str(len(df_training))) print('testset' + str(len(df_test))) #utility matrix: utility_review = hf.pivot_ratings(df_training) utility_review.to_pickle('./pickles/utility.pkl') #mean center utility matrix centered = hf.mean_center_columns(utility_review)