def get_bipartite_graph(NUM_REVIEWS=3000):
    """Build a directed business -> user graph from review data.

    Businesses, users and reviews are read through
    ``load_data.load_objects``. For every review whose user and business are
    both known, one edge ``(business, user)`` is added. Progress counters and
    elapsed-time messages are printed while loading; elapsed time is measured
    against the name ``start``, which this function expects to find at module
    level.

    Args:
        NUM_REVIEWS: forwarded as the second argument of
            ``load_data.load_objects("review", ...)`` (presumably the maximum
            number of reviews to read -- confirm against ``load_data``).

    Returns:
        The populated ``nx.DiGraph``; its nodes are the business and user
        objects themselves.
    """
    businesses = load_data.load_objects("business")
    print("businesses loaded: " + str(time.clock() - start))
    users = load_data.load_objects("user")
    print("users loaded: " + str(time.clock() - start))
    reviews = load_data.load_objects("review", NUM_REVIEWS)
    print("reviews loaded: " + str(time.clock() - start))

    business_dict = dict((b.business_id, b) for b in businesses)
    user_dict = dict((u.user_id, u) for u in users)
    print("dicts loaded: " + str(time.clock() - start))

    G = nx.DiGraph()
    for index, r in enumerate(reviews):
        if index % 100 == 0:
            print(index)
        # Both endpoints must be looked up through the review's own ids.
        # Testing the dicts directly (not ``.keys()``) keeps each check O(1).
        if r.user_id not in user_dict or r.business_id not in business_dict:
            continue
        # Edges are unweighted; r.stars or r.votes[funny, useful, cool]
        # are candidate weights if one is ever needed.
        G.add_edge(business_dict[r.business_id], user_dict[r.user_id])
    print("graph fully loaded: " + str(time.clock() - start))
    return G
def get_temporal_business_graph():
    """Build a weighted digraph of consecutive restaurant reviews per user.

    Reviews are grouped by ``user_id``, keeping only those whose business
    lists ``"Restaurants"`` among its categories. Each user's reviews are
    ordered by ``review.date``, and every consecutive pair adds an edge from
    the earlier business to the later one. The edge attribute ``"weight"``
    counts how many times that transition was seen across all users.

    Reviews that reference a business id missing from the loaded businesses
    are skipped.

    Returns:
        ``nx.DiGraph`` whose nodes are business objects and whose edges carry
        an integer ``"weight"``.
    """
    businesses = load_data.load_objects("business")
    reviews = load_data.load_objects("review", 10000)

    business_dict = dict((b.business_id, b) for b in businesses)

    # user_id -> [review, ...], restaurant reviews only.
    review_dict = {}
    for review in reviews:
        business = business_dict.get(review.business_id)
        if business is None or "Restaurants" not in business.categories:
            continue
        review_dict.setdefault(review.user_id, []).append(review)

    G = nx.DiGraph()
    for review_list in review_dict.values():
        ordered = sorted(review_list, key=lambda review: review.date)
        # Walk the reviews pairwise: (1st, 2nd), (2nd, 3rd), ...
        for earlier, later in zip(ordered, ordered[1:]):
            b1 = business_dict[earlier.business_id]
            b2 = business_dict[later.business_id]
            if G.has_edge(b1, b2):
                G[b1][b2]["weight"] += 1
            else:
                G.add_edge(b1, b2, weight=1)
            # Disabled alternative kept for reference: link every category
            # of b1 to every category of b2 (same weight-increment logic)
            # instead of linking the businesses themselves.
    print("graph created")
    return G
def get_temporal_business_graph():
    """Return a digraph of "reviewed next" transitions between restaurants.

    Only reviews of businesses whose ``categories`` contain ``'Restaurants'``
    are used; reviews pointing at a business id that was not loaded are
    ignored. For each user the remaining reviews are sorted by
    ``review.date`` and each adjacent pair contributes one unit of
    ``'weight'`` to the edge (earlier business -> later business).

    Returns:
        ``nx.DiGraph`` with business objects as nodes and an integer
        ``'weight'`` on every edge.
    """
    from collections import defaultdict

    businesses = load_data.load_objects("business")
    reviews = load_data.load_objects("review", 10000)

    business_dict = {}
    for b in businesses:
        business_dict[b.business_id] = b

    # user_id -> [review, ...]; defaultdict avoids a membership test per review.
    review_dict = defaultdict(list)
    for review in reviews:
        if review.business_id not in business_dict:
            continue
        if 'Restaurants' not in business_dict[review.business_id].categories:
            continue
        review_dict[review.user_id].append(review)

    G = nx.DiGraph()
    for user_id in review_dict:
        timeline = sorted(review_dict[user_id], key=lambda review: review.date)
        previous = None
        for current in timeline:
            if previous is not None:
                b1 = business_dict[previous.business_id]
                b2 = business_dict[current.business_id]
                if G.has_edge(b1, b2):
                    G[b1][b2]['weight'] += 1
                else:
                    G.add_edge(b1, b2, weight=1)
                # A category-level variant (edges c1 -> c2 for every category
                # pair of b1 and b2, same weighting) existed here as disabled
                # code.
            previous = current
    print("graph created")
    return G
def run():
    """Rank words by how strongly they point at one star rating.

    A directed graph word -> stars is built from the restaurant reviews
    (businesses whose ``categories`` contain ``'Restaurants'``). The edge
    ``'weight'`` is the number of reviews with that rating that contain the
    word; a word is counted at most once per review. Characters other than
    ASCII letters, digits and newlines are replaced by spaces before the
    text is split on single spaces; empty tokens produced by consecutive
    spaces are ignored.

    For every word the rating with the largest weight is taken as
    ``max_stars``. The indicator starts at that weight and is reduced by
    ``abs(stars - max_stars) * weight`` for every rating the word occurs
    with, so words spread over distant ratings score lower.

    Returns:
        List of ``(word, indicator, max_stars)`` tuples sorted by
        ``indicator``, highest first.
    """
    businesses = load_data.load_objects("business")
    business_dict = dict((b.business_id, b) for b in businesses)
    reviews = load_data.load_objects("review", 25000)

    G = nx.DiGraph()
    for review in reviews:
        if 'Restaurants' not in business_dict[review.business_id].categories:
            continue
        text = re.sub('[^a-zA-Z0-9\n]', ' ', review.text)
        seen = set()  # words already counted for this review (O(1) lookup)
        for token in text.split(" "):
            word = token.lower()
            # Runs of spaces yield '' tokens; they are not words.
            if not word or word in seen:
                continue
            seen.add(word)
            if G.has_edge(word, review.stars):
                G[word][review.stars]['weight'] += 1
            else:
                G.add_edge(word, review.stars, weight=1)

    indicators = []
    for word in G:
        if G.out_degree(word) == 0:
            # Rating nodes only have incoming edges; skip them.
            continue
        # Find which rating this word is most indicative of.
        max_stars, max_weight = 0, 0
        for stars in G.neighbors(word):
            weight = G[word][stars]['weight']
            if weight > max_weight:
                max_stars, max_weight = stars, weight
        # Penalise occurrences under other ratings by their distance.
        indicator = max_weight
        for stars in G.neighbors(word):
            indicator -= abs(stars - max_stars) * G[word][stars]['weight']
        indicators.append((word, indicator, max_stars))

    # Best indicators first.
    indicators.sort(key=lambda entry: entry[1], reverse=True)
    return indicators
def run():
    """Score each review word by how well it predicts a star rating.

    Restaurant reviews (business ``categories`` containing ``'Restaurants'``)
    are tokenised: every character that is not an ASCII letter, digit or
    newline becomes a space, the text is split on single spaces, tokens are
    lower-cased and empty tokens are dropped. Each distinct word of a review
    adds 1 to the ``'weight'`` of the edge word -> ``review.stars`` in a
    directed graph.

    Per word, ``max_stars`` is the rating with the heaviest edge and the
    indicator is that weight minus ``abs(stars - max_stars) * weight`` summed
    over all ratings the word was seen with.

    Returns:
        ``[(word, indicator, max_stars), ...]`` ordered by descending
        indicator.
    """
    businesses = load_data.load_objects("business")
    business_dict = {}
    for b in businesses:
        business_dict[b.business_id] = b
    reviews = load_data.load_objects("review", 25000)

    G = nx.DiGraph()
    for review in reviews:
        b = business_dict[review.business_id]
        if 'Restaurants' not in b.categories:
            continue
        text = re.sub('[^a-zA-Z0-9\n]', ' ', review.text)
        words_used = set()  # only count one word occurrence per review
        for raw in text.split(" "):
            if not raw:
                # Consecutive spaces produce empty pieces; not a word.
                continue
            word = raw.lower()
            if word in words_used:
                continue
            words_used.add(word)
            if not G.has_edge(word, review.stars):
                G.add_edge(word, review.stars, weight=0)
            G[word][review.stars]['weight'] += 1

    indicators = []
    for word in G:
        if G.out_degree(word) > 0:  # is a word (ratings have no out-edges)
            # Weight per rating, read once in neighbour order.
            rated = [(stars, G[word][stars]['weight'])
                     for stars in G.neighbors(word)]
            max_stars, max_weight = 0, 0
            for stars, weight in rated:
                if weight > max_weight:
                    max_stars, max_weight = stars, weight
            # How indicative the word is of that rating (and ratings nearby).
            spread = sum(abs(stars - max_stars) * weight
                         for stars, weight in rated)
            indicators.append((word, max_weight - spread, max_stars))

    # Sort by best indicators.
    return sorted(indicators, key=lambda item: item[1], reverse=True)
def get_graph(NUM_REVIEWS=30000):
    """Build an undirected graph linking restaurants that share reviewers.

    Two restaurants are connected when more than one user has reviewed both
    of them. Only reviews of businesses whose ``categories`` contain
    ``'Restaurants'`` are considered. A user who reviewed the same
    restaurant several times is counted once, and each unordered pair of
    restaurants is counted once per user.

    Progress (every 1000th review index), the number of users, the number of
    distinct restaurant pairs and every linked pair (``id1_AND_id2``) are
    printed.

    Args:
        NUM_REVIEWS: forwarded as the second argument of
            ``load_data.load_objects("review", ...)`` (presumably the maximum
            number of reviews to read -- confirm against ``load_data``).

    Returns:
        ``nx.Graph`` whose nodes are business objects.
    """
    from itertools import combinations

    businesses = load_data.load_objects("business")
    business_dict = dict((b.business_id, b) for b in businesses)
    reviews = load_data.load_objects("review", NUM_REVIEWS)

    # first: user_id -> set of restaurant ids that user reviewed.
    first = {}
    for index, review in enumerate(reviews):
        if index % 1000 == 0:
            print(index)
        if 'Restaurants' not in business_dict[review.business_id].categories:
            continue
        first.setdefault(review.user_id, set()).add(review.business_id)
    print("number of users: " + str(len(first)))

    # second: (b1_id, b2_id) with b1_id < b2_id -> number of users rating both.
    # Sorting before pairing gives each unordered pair one canonical key.
    second = {}
    for rated in first.values():
        for pair in combinations(sorted(rated), 2):
            second[pair] = second.get(pair, 0) + 1
    print("number of business pairs: " + str(len(second)))

    G = nx.Graph()
    for (b1_id, b2_id), shared_users in second.items():
        if shared_users > 1:
            print('_AND_'.join((b1_id, b2_id)))
            G.add_edge(business_dict[b1_id], business_dict[b2_id])
    return G
def get_graph(NUM_REVIEWS=30000):
    """Return a graph of restaurants joined by common reviewers.

    An edge is added between two restaurants (businesses whose
    ``categories`` contain ``'Restaurants'``) when at least two different
    users reviewed both. Pairs are unordered and a user contributes at most
    one count per pair, however many times they reviewed either restaurant.

    Side effects: prints every 1000th review index, the number of users with
    restaurant reviews, the number of distinct pairs and each linked pair as
    ``id1_AND_id2``.

    Args:
        NUM_REVIEWS: second argument handed to
            ``load_data.load_objects("review", ...)`` (presumably a cap on
            the number of reviews -- confirm against ``load_data``).

    Returns:
        ``nx.Graph`` with business objects as nodes.
    """
    from collections import defaultdict
    from itertools import combinations

    businesses = load_data.load_objects("business")
    business_dict = {}
    for b in businesses:
        business_dict[b.business_id] = b
    reviews = load_data.load_objects("review", NUM_REVIEWS)

    # Mapping of user -> restaurants rated (a set, so repeats collapse).
    restaurants_by_user = defaultdict(set)
    for index, review in enumerate(reviews):
        if index % 1000 == 0:
            print(index)
        if 'Restaurants' in business_dict[review.business_id].categories:
            restaurants_by_user[review.user_id].add(review.business_id)
    print("number of users: " + str(len(restaurants_by_user)))

    # Mapping of (b1, b2) -> number of users rating both. Ids are sorted so
    # (b1, b2) and (b2, b1) land on the same key.
    pair_counts = defaultdict(int)
    for rated in restaurants_by_user.values():
        for pair in combinations(sorted(rated), 2):
            pair_counts[pair] += 1
    print("number of business pairs: " + str(len(pair_counts)))

    G = nx.Graph()
    for pair, n_users in pair_counts.items():
        if n_users < 2:
            continue
        print('_AND_'.join(pair))
        b1_id, b2_id = pair
        G.add_edge(business_dict[b1_id], business_dict[b2_id])
    return G
def run(): businesses = load_data.load_objects("business") users = load_data.load_objects("user") reviews = load_data.load_objects("review", 100000) business_dict = {} for b in businesses: business_dict[b.business_id] = b user_dict = {} for u in users: user_dict[u.user_id] = u # user gave 4.5, usually gives 3 b_dict = {} # business -> (total_reviewer_plus_or_minus, num_reviews_seen) for review in reviews: if review.business_id not in business_dict or review.user_id not in user_dict: continue b = business_dict[review.business_id] u = user_dict[review.user_id] diff = review.stars - u.average_stars if b in b_dict: b_dict[b] = (b_dict[b][0] + diff, b_dict[b][1] + 1) else: b_dict[b] = (diff, 1) normalized_businesses = [] for b in b_dict: diff, count = b_dict[b] if count > 1: new_rating = b.stars + diff normalized_businesses += [(b, new_rating, count)] normalized_businesses = sorted(normalized_businesses, key=lambda t: t[1], reverse=True) print "\n\nunderrated businesses" for b, rating, count in normalized_businesses[:20]: print '{:<80}'.format( str(b) ), "\t\t\t", rating, "\t\t\t", count, "/", b.review_count, "\t\t\t", b.stars print "\n\noverrated businesses" for b, rating, count in normalized_businesses[-20:]: print '{:<80}'.format( str(b) ), "\t\t\t", rating, "\t\t\t", count, "/", b.review_count, "\t\t\t", b.stars
def run(): businesses = load_data.load_objects("business") users = load_data.load_objects("user") reviews = load_data.load_objects("review", 100000) business_dict = {} for b in businesses: business_dict[b.business_id] = b user_dict = {} for u in users: user_dict[u.user_id] = u # user gave 4.5, usually gives 3 b_dict = {} # business -> (total_reviewer_plus_or_minus, num_reviews_seen) for review in reviews: if review.business_id not in business_dict or review.user_id not in user_dict: continue b = business_dict[review.business_id] u = user_dict[review.user_id] diff = review.stars - u.average_stars if b in b_dict: b_dict[b] = (b_dict[b][0] + diff, b_dict[b][1] + 1) else: b_dict[b] = (diff, 1) normalized_businesses = [] for b in b_dict: diff, count = b_dict[b] if count > 1: new_rating = b.stars + diff normalized_businesses += [(b, new_rating, count)] normalized_businesses = sorted(normalized_businesses, key=lambda t: t[1], reverse=True) print "\n\nunderrated businesses" for b, rating, count in normalized_businesses[:20]: print '{:<80}'.format(str(b)), "\t\t\t", rating, "\t\t\t", count, "/", b.review_count, "\t\t\t", b.stars print "\n\noverrated businesses" for b, rating, count in normalized_businesses[-20:]: print '{:<80}'.format(str(b)), "\t\t\t", rating, "\t\t\t", count, "/", b.review_count, "\t\t\t", b.stars
def aggregate_objects(data_flags, skip_missing=False):
    """
    Aggregate CS objects from separate .pklz files to a single .pklz file.

    For each data flag, the iterated variables are read from the specs file
    and one object is loaded per index combination, in the order produced by
    the multi-index iterator. The resulting list is handed to
    ``save_aggregated_object_list``.

    Args:
        data_flags: Identifier, or list of identifiers, for saving and
            loading. A single string is treated as a one-element list.
        skip_missing: If truthy, an object whose load raises ``IOError`` or
            ``OSError`` is stored as ``None`` in the aggregated list instead
            of aborting the aggregation.

    Raises:
        IOError, OSError: from ``load_objects`` when ``skip_missing`` is
            falsy.
    """

    if skip_missing:
        print("Skipping missing files...will populate with `None`")

    if isinstance(data_flags, str):
        data_flags = [data_flags]

    for data_flag in data_flags:
        list_dict = read_specs_file(data_flag)
        iter_vars = list_dict['iter_vars']
        iter_vars_dims = [len(iter_vars[iter_var]) for iter_var in iter_vars]
        # The zeros array is only a vehicle for iterating every multi-index.
        it = sp.nditer(sp.zeros(iter_vars_dims), flags=['multi_index'])

        obj_list = []
        while not it.finished:
            index = list(it.multi_index)
            print(it.multi_index)
            # Flush after printing so progress shows up immediately.
            sys.stdout.flush()
            try:
                CS_obj = load_objects(index, data_flag)
            except (IOError, OSError):
                if not skip_missing:
                    raise
                print('Skipping item %s...' % index)
                CS_obj = None
            obj_list.append(CS_obj)
            it.iternext()

        save_aggregated_object_list(obj_list, data_flag)
''' generate reviews for a given ''' from objects import * from networkx import * from helpers import * import load_data import re, random, unicodedata NUM_STATES = 2 # number of previous words to use as state NUM_REVIEWS = 100000 PUNCTUATION = ['.', '!', '?'] businesses = load_data.load_objects("business") users = load_data.load_objects("user") reviews = load_data.load_objects("review", NUM_REVIEWS) print NUM_REVIEWS,"reviews loaded" business_dict = {} for b in businesses: business_dict[b.business_id] = b user_dict = {} for u in users: user_dict[u.user_id] = u # creates a text file with the appropriate reviews: def make_reviews_file(stars=None, min_stars=None, max_stars=None, category=None): F_NAME = 'reviews_file' f = open(F_NAME, 'w') num_reviews = 0
''' A matrix of full connectivity of businesses where the edge weights are the distance between two businesses ''' import load_data import math, numpy as np from helpers import * #1 degree of latitude is approx 69 miles n = 500 pre_businesses = load_data.load_objects("business", n) businesses=[] for b in pre_businesses: if 'Restaurants' in b.categories: businesses += [b] n = len(businesses) for b in businesses: print "rating for ",str(b), ":", b.stars, ", # reviews: ", b.review_count #users = load_data.load_objects("user", 5000) #reviews = load_data.load_objects("review", 50000) A = [] print 'starting script' for i in range(n): bus = businesses[i] dists = [] for j in range(n): if (i==j):
def aggregate_temporal_entropy_objects(data_flags):
    """
    Aggregate CS objects from separate .pklz files of temporal runs to a
    single .pklz object.

    For each data flag, every index combination of the iterated variables is
    loaded as a sequence of CS objects, one per timestep. The object at
    time 0 of each sequence is kept in ``data['init_objs']``. Each attribute
    named in ``temporal_structs_to_save`` (currently only ``'entropy'``) is
    copied into an array indexed by (timestep, iterated-variable indices,
    attribute shape). The result is handed to
    ``save_aggregated_temporal_objects``.

    An attribute that the first loaded object does not have is reported and
    left out of the aggregation.

    Args:
        data_flags: Identifier, or list of identifiers, for saving and
            loading. A single string is treated as a one-element list.
    """

    temporal_structs_to_save = ['entropy']

    if isinstance(data_flags, str):
        data_flags = [data_flags]

    for data_flag in data_flags:
        list_dict = read_specs_file(data_flag)
        iter_vars = list_dict['iter_vars']
        iter_vars_dims = [len(iter_vars[iter_var]) for iter_var in iter_vars]
        it = sp.nditer(sp.zeros(iter_vars_dims), flags=['multi_index'])

        # The first index is used to size the output arrays.
        CS_init_array = load_objects(list(it.multi_index), data_flag)

        # Dictionary to save all objects at time 0; this will contain all
        # non-temporal info for each iterated variable.
        data = dict()
        data['init_objs'] = []
        nT = len(CS_init_array[0].signal_trace_Tt)

        # Assign data structures of appropriate shape for the temporal
        # variables; remember which ones actually exist on the objects.
        available_structs = []
        for struct_name in temporal_structs_to_save:
            try:
                sample = getattr(CS_init_array[0], struct_name)
            except AttributeError:
                print('%s not an attribute of the CS object' % struct_name)
                continue

            # Shape is (num timesteps, iterated var ranges, variable shape);
            # if a float or integer, shape is just time and iter vars.
            struct_shape = (nT, ) + tuple(iter_vars_dims)
            if hasattr(sample, 'shape'):
                struct_shape += tuple(sample.shape)
            data[struct_name] = sp.zeros(struct_shape)
            available_structs.append(struct_name)

        # Iterate over all objects to be aggregated.
        while not it.finished:
            print('Loading index:', it.multi_index)
            temporal_CS_array = load_objects(list(it.multi_index), data_flag)

            # Save full object at time 0, contains non-temporal data.
            data['init_objs'].append(temporal_CS_array[0])

            # Grab all the temporal structures, timepoint-by-timepoint.
            for iT in range(nT):
                full_idx = (iT, ) + it.multi_index
                for struct_name in available_structs:
                    data[struct_name][full_idx] = getattr(
                        temporal_CS_array[iT], struct_name)

            it.iternext()

        save_aggregated_temporal_objects(data, data_flag)
''' A matrix of full connectivity of businesses where the edge weights are the distance between two businesses ''' import load_data import math, numpy as np from helpers import * #1 degree of latitude is approx 69 miles n = 500 pre_businesses = load_data.load_objects("business", n) businesses = [] for b in pre_businesses: if 'Restaurants' in b.categories: businesses += [b] n = len(businesses) for b in businesses: print "rating for ", str(b), ":", b.stars, ", # reviews: ", b.review_count #users = load_data.load_objects("user", 5000) #reviews = load_data.load_objects("review", 50000) A = [] print 'starting script' for i in range(n): bus = businesses[i] dists = [] for j in range(n): if (i == j):
''' Create a bipartite graphs of words in review text to reviews that contain them. '''
from objects import *
from networkx import *
from helpers import *
import load_data
import re

# Load review objects; the second argument (1000) is presumably a cap on how
# many reviews are read -- TODO confirm against load_data.
reviews = load_data.load_objects("review", 1000)

# NOTE(review): `nx` is not bound by any import visible here; presumably one
# of the star-imports above provides it -- confirm.
G = nx.Graph()
init_weights = {}
for review in reviews:
    # Tokens are split on a single space, on ". " and on "! ".
    for word in re.split(" |\. |\! ", review.text):
        word = word.lower()
        # One edge per (word, review); its weight is the review's star rating.
        G.add_edge(word, review)
        G.edge[word][review]['weight'] = review.stars
        #init_weights[review] = review.stars
        #init_weights[word] = 1

# pick a centrality
#centrality = nx.degree_centrality(G)
#centrality = nx.betweenness_centrality(G)
#centrality = nx.closeness_centrality(G, distance=True)
#centrality = nx.eigenvector_centrality(G, tol=.01)
#centrality = nx.pagerank(G, personalization=init_weights)
just_words = {}