def prepare():
    """Copy the reviews of city businesses into `pre_review` with location tags.

    For every document in `pre_metropolitan_area`, the businesses whose
    `tile15` is listed in the area's `city_tiles15` are loaded, their reviews
    are fetched from `review`, and each review is annotated with the owning
    business's `tile10`, `tile15`, `tile18` and `norm_categories`, plus the
    area `_id` under `city_area`. The annotated reviews are then upserted
    into `pre_review`.
    """
    copied_fields = ('tile10', 'tile15', 'tile18', 'norm_categories')
    for area in mongo_functions.mongo_get(collection='pre_metropolitan_area'):
        city_businesses = mongo_functions.mongo_get(
            collection='pre_business',
            filter={'tile15': {'$in': area['city_tiles15']}},
            fields={
                'tile10': 1,
                'tile15': 1,
                'tile18': 1,
                'norm_categories': 1,
            })
        business_ids = [doc['_id'] for doc in city_businesses]
        reviews = mongo_functions.mongo_get(
            collection='review',
            filter={'business_id': {'$in': business_ids}})

        # Index the businesses by id so each review needs a single lookup.
        business_by_id = {doc['_id']: doc for doc in city_businesses}
        for review in reviews:
            owner = business_by_id[review['business_id']]
            for field in copied_fields:
                review[field] = owner[field]
            review['city_area'] = area['_id']

        mongo_functions.batch_upsert(reviews,
                                     collection='pre_review',
                                     update="{'$set': item}")
def prepare():
    """Compute the city core of every metropolitan area and store it.

    For each area the businesses on the area's zoom-10 tiles are grouped by
    zoom-15 tile. The tile holding the most businesses is passed to `_group`
    together with the set of all zoom-15 tiles; the first element of its
    result is treated as the list of city tiles. The area document receives
    `city_center`, `city_tiles15` and `city_businesses`, and all areas are
    written back to `pre_metropolitan_area` once the loop finishes.

    Raises:
        IndexError: if an area has no businesses (there is no densest tile).
    """
    areas = mongo_functions.mongo_get(collection='pre_metropolitan_area')
    for area in areas:
        businesses = mongo_functions.mongo_get(
            collection='pre_business',
            filter={'tile10': {'$in': area['tiles']}})
        tile15_dict = _group_business_by_tile15(businesses)

        # Rank zoom-15 tiles from most to fewest businesses; the sort is
        # stable, so ties keep the grouping dict's iteration order.
        ranked_tiles = sorted(tile15_dict.items(),
                              key=lambda pair: len(pair[1]),
                              reverse=True)
        densest_tile = ranked_tiles[0][0]

        city_area = _group(set(tile15_dict.keys()), densest_tile)
        city_tiles = city_area[0]

        city_business = []
        for tile in city_tiles:
            city_business.extend(tile15_dict[tile])
        # NOTE: the original had a disabled upsert of `city_business` into
        # the `pre_city_business` collection at this point.

        # Tile ids look like "<x>_<y>"; the centre is taken at zoom 15.
        tile_parts = densest_tile.split('_')
        area['city_center'] = list(
            tile_functions.tile_center(int(tile_parts[0]),
                                       int(tile_parts[1]), 15))
        area['city_tiles15'] = city_tiles
        area['city_businesses'] = len(city_business)

    mongo_functions.batch_update(areas,
                                 collection='pre_metropolitan_area',
                                 update="{'$set': item}")
def export_city_business():
    """Write one `<area>_business_ratio.csv` file per metropolitan area.

    Each file contains the businesses on the area's zoom-10 tiles that have
    a `raw_ratio`. Row construction is delegated to `_business_dict_build`,
    which fills `business_dict`; its values are joined with ';' here.
    Values are not escaped, so a ';' inside a value will break the row.
    """
    categories = _business_categories_get()
    csv_fields = [
        '_id', 'area_id', 'ratio', 'name', 'review_count', 'stars',
        'categories'
    ] + categories

    for area in mongo_functions.mongo_get(collection='pre_metropolitan_area'):
        query = {
            'tile10': {'$in': area['tiles']},
            'raw_ratio': {'$exists': True},
        }
        projection = {
            'categories': 1,
            'name': 1,
            'review_count': 1,
            'stars': 1,
            'norm_categories': 1,
            'raw_ratio': 1,
        }
        _business = mongo_functions.mongo_get(collection='pre_business',
                                              filter=query,
                                              fields=projection)

        # presumably populated in place with one list of strings per
        # business -- verify against `_business_dict_build`.
        business_dict = {}
        _business_dict_build(_business, business_dict, categories, area,
                             csv_fields)

        header = ';'.join(csv_fields)
        rows = [';'.join(row) for row in business_dict.values()]
        content = header + '\n' + '\n'.join(rows)

        file_stem = area['_id'].replace(' ', '_')
        write_csv_file('{}_business_ratio.csv'.format(file_stem), content)
def main():
    """Compute per-tile tourist/resident statistics and upsert into `pre_tile`.

    For every metropolitan area, the reviews in `pre_review` that carry a
    `user_from` value are loaded. A review counts as a resident review when
    its `user_from` equals the area `_id`, and as a tourist review otherwise.
    For zoom levels 15 and 18 the reviews are grouped per tile with
    `tile_reviews_dict_group`, and one document per tile is built holding
    the unique user ids, the review counters and the dissimilarity ratio
    (signed, and as an absolute value under `ratio`).
    """
    for area in mongo_functions.mongo_get(collection='pre_metropolitan_area',
                                          fields={}):
        reviews = mongo_functions.mongo_get(
            collection='pre_review',
            filter={
                'city_area': area['_id'],
                'user_from': {'$exists': True},
            },
            fields={
                'tile_id': 1,
                'user_id': 1,
                'tile15': 1,
                'tile18': 1,
                'user_from': 1,
            })
        total_tourist = len(city_unique_tourist_get(reviews, area))
        total_residents = len(city_unique_residents_get(reviews, area))

        for zoom in [15, 18]:
            tile_reviews_dict = tile_reviews_dict_group(reviews, zoom)
            pre_tile_list = []
            for tile, tile_reviews in tile_reviews_dict.items():
                # Split the tile's reviews in a single pass.
                tourist_reviews = []
                resident_reviews = []
                for review in tile_reviews:
                    if review['user_from'] == area['_id']:
                        resident_reviews.append(review['user_id'])
                    else:
                        tourist_reviews.append(review['user_id'])

                tourist_unique = list(set(tourist_reviews))
                resident_unique = list(set(resident_reviews))
                ratio_signed = tile_dissimilarity_ratio(
                    len(tourist_unique), len(resident_unique),
                    total_tourist, total_residents)

                pre_tile_list.append({
                    '_id': '{}_{}'.format(tile, zoom),
                    'area_id': area['_id'],
                    'tourist_unique': tourist_unique,
                    'resident_unique': resident_unique,
                    'tourist_review_counter': len(tourist_reviews),
                    'resident_review_counter': len(resident_reviews),
                    'tourist_unique_counter': len(tourist_unique),
                    'resident_unique_counter': len(resident_unique),
                    'ratio_signed': ratio_signed,
                    # abs() rather than `-1 * x`, which stored -0.0 for a
                    # zero ratio.
                    'ratio': abs(ratio_signed),
                })

            mongo_functions.batch_upsert(pre_tile_list,
                                         collection='pre_tile',
                                         update='{"$set": item}')
def prepare():
    """Create one GeoJSON file per (area, zoom) pair, then the areas JSON.

    For each metropolitan area the businesses on its zoom-10 tiles are
    loaded once; `features_prepare` builds the features for every zoom in
    `ZOOMS` and `geojson_file_create` writes them. `areas_json_create` is
    called once at the end with all areas.
    """
    areas = mongo_functions.mongo_get(collection='pre_metropolitan_area')
    for area in areas:
        area_businesses = mongo_functions.mongo_get(
            collection='pre_business',
            filter={'tile10': {'$in': area['tiles']}})
        for zoom in ZOOMS:
            geojson_file_create(area['_id'], zoom,
                                features_prepare(area_businesses, zoom))
    areas_json_create(areas)
def prepare():
    """Set `user_from` on each area's reviews from the reviewer's `local` value.

    For every metropolitan area the `pre_review` documents of that area are
    loaded, and the distinct user ids are looked up in `pre_user` in chunks
    of 100000 ids (only users that have a `local` field). Reviews whose user
    was found get `user_from` set to that value and are written back with
    `batch_update`; the other reviews are left untouched.

    If collecting the user ids or querying the users fails, the error is
    printed and the whole function returns, so later areas are not processed.
    """
    chunk_size = 100000
    for area in mongo_functions.mongo_get(collection='pre_metropolitan_area',
                                          fields={}):
        reviews = mongo_functions.mongo_get(
            collection='pre_review',
            filter={'city_area': area['_id']},
            fields={'user_id': 1})

        try:
            user_ids = list({review['user_id'] for review in reviews})
        except Exception as e:
            print(e)
            return

        try:
            user_type_dict = {}
            # Query the users chunk by chunk to keep each `$in` list bounded.
            for start in range(0, len(user_ids), chunk_size):
                current_user_ids = user_ids[start:start + chunk_size]
                located_users = mongo_functions.mongo_get(
                    collection='pre_user',
                    filter={
                        '_id': {'$in': current_user_ids},
                        'local': {'$exists': True},
                    },
                    fields={'local': 1})
                user_type_dict.update(
                    {user['_id']: user['local'] for user in located_users})
        except Exception as e:
            print(e)
            return

        # Drain `reviews` from the end, keeping only reviews with a known user.
        _reviews = []
        while reviews:
            review = reviews.pop()
            try:
                review['user_from'] = user_type_dict[review['user_id']]
            except KeyError:
                # No `local` value for this user (or no `user_id`): skip.
                continue
            except Exception as e:
                print(e)
                continue
            _reviews.append(review)

        mongo_functions.batch_update(
            _reviews,
            collection='pre_review',
            update='{"$set": {"user_from": item["user_from"]}}')
def yearly_dissimilarity_ratio():
    """Store a per-year dissimilarity ratio on each business of every area.

    The area's reviews are split per year by `_reviews_per_year`. For each
    year the unique visitors and residents are counted, the reviews are
    grouped per business, and `business_dissimilarity_ratio` is evaluated
    for every business. Ratios are kept to 9 decimal places under
    `ratio_yearly[<year as string>]` and written to `pre_business`.
    """
    for area in mongo_functions.mongo_get(collection='pre_metropolitan_area',
                                          fields={}):
        business_dict = {}
        for year, reviews in _reviews_per_year(area).items():
            total_visitors = len(city_unique_visitors_get(reviews, area))
            total_residents = len(city_unique_residents_get(reviews, area))
            year_key = str(year)

            grouped = business_reviews_dict_group(reviews)
            for business_id, business_reviews in grouped.items():
                entry = business_dict.setdefault(
                    business_id, {'_id': business_id, 'ratio_yearly': {}})
                ratio = business_dissimilarity_ratio(business_reviews,
                                                     total_visitors,
                                                     total_residents, area)
                # Round-trip through a fixed-point string to cap precision.
                entry['ratio_yearly'][year_key] = float(
                    '{0:.9f}'.format(ratio))

        mongo_functions.batch_update(
            list(business_dict.values()),
            collection='pre_business',
            update='{"$set": {"ratio_yearly": item["ratio_yearly"]}}')
def process():
    """Classify users with `reviewer_classifier` and store their `local` value.

    Users are read from `pre_user`, run through `reviewer_classifier`, and
    those for which it returns a location get it stored under `local` and
    are written back with `batch_update`. A per-code summary of the
    classifier results is printed before the write.
    """
    # Step 1 in `reviewer_classifier` discards users with `review_count`
    # less than 2, so the users are already filtered in the query.
    # NOTE(review): the filter below is `$gt: 2`, which also drops users with
    # exactly 2 reviews -- confirm which threshold is intended.
    users = mongo_functions.mongo_get(
        collection='pre_user',
        filter={'review_count': {'$gt': 2}},
        fields={'reviews': 1, 'review_count': 1, 'grouped_reviews': 1},
        page_size=2000000
    )
    users_len = len(users)
    located_users = []
    classifier_codes = {}

    print('\n')
    for index, user in enumerate(users, start=1):
        sys.stdout.write('\rProcessing user {}/{}...'.format(index, users_len))
        sys.stdout.flush()

        user_location, code = reviewer_classifier(user)
        if code is not None:
            classifier_codes[code] = classifier_codes.get(code, 0) + 1
        if user_location is not None:
            user['local'] = user_location
            located_users.append(user)

    print('\nSummary:')
    for code, count in sorted(classifier_codes.items(),
                              key=lambda pair: pair[0]):
        print('{} users classified as: {}'.format(count,
                                                  classifier_code_map[code]))

    mongo_functions.batch_update(located_users,
                                 collection='pre_user',
                                 update='{"$set": item}')
def prepare():
    """Add `tile10`, `tile15` and `tile18` to every business and upsert it.

    Each document of `business` gets its tile id at zoom 10, 15 and 18,
    built as the `tiles.deg2num` components joined with '_'. Businesses for
    which the computation raises `TypeError` are skipped
    (presumably a missing latitude/longitude -- verify against the data).
    Any other exception propagates. The remaining businesses are upserted
    into `pre_business`.
    """
    tile_fields = (('tile10', 10), ('tile15', 15), ('tile18', 18))
    businesses = mongo_functions.mongo_get(collection='business')
    pre_businesses = []
    for business in businesses:
        try:
            for field, zoom in tile_fields:
                tile = tiles.deg2num(business['latitude'],
                                     business['longitude'], zoom)
                business[field] = '_'.join(str(item) for item in tile)
        except TypeError:
            # Deliberate best-effort: leave out businesses that cannot be
            # placed on a tile.
            continue
        pre_businesses.append(business)

    mongo_functions.batch_upsert(pre_businesses,
                                 collection='pre_business',
                                 update='{"$set": item}')
def metropolitan_area_center(metropolitan_area):
    """Set `metropolitan_area['center']` from the area's businesses.

    The latitude/longitude of every business on the area's zoom-10 tiles is
    loaded and handed to `center`, whose two results are stored as
    `[lat, lon]`. The dict is modified in place; nothing is returned.
    """
    area_businesses = mongo_functions.mongo_get(
        collection='pre_business',
        filter={'tile10': {'$in': metropolitan_area['tiles']}},
        fields={'latitude': 1, 'longitude': 1}
    )
    lat, lon = center(area_businesses)
    metropolitan_area['center'] = [lat, lon]
def prepare():
    """Build the metropolitan areas from business tiles and upsert them.

    The distinct `tile10` values of `pre_business` are sorted, grouped with
    `tiles_group` and filtered with `groups_filter`. Each resulting area is
    then given a centre and a name before all areas are upserted into
    `pre_metropolitan_area`.
    """
    def _x_component(tile):
        # NOTE(review): this is a string, so the order is lexicographic
        # ("100" sorts before "99"), and tiles sharing the same x keep
        # set-iteration order -- confirm `tiles_group` does not rely on
        # numeric ordering.
        return tile.split('_')[0]

    tile_docs = mongo_functions.mongo_get(collection='pre_business',
                                          fields={'tile10': 1})
    tile_ids = {doc['tile10'] for doc in tile_docs}
    tiles_sorted = sorted(tile_ids, key=_x_component)

    metropolitan_areas = groups_filter(tiles_group(tiles_sorted))
    for metropolitan_area in metropolitan_areas:
        metropolitan_area_center(metropolitan_area)
        metropolitan_area_name(metropolitan_area)

    mongo_functions.batch_upsert(metropolitan_areas,
                                 collection='pre_metropolitan_area',
                                 update="{'$set': item}")
def prepare():
    """Write business-count GeoJSON files per (area, zoom), then the areas JSON.

    For each area and each zoom in `ZOOMS`, `business_count_per_tile` yields
    entries with a `tile` and a `business` value; each becomes a polygon via
    `map_functions.geojson_polygon_formatter` and the polygons are written
    with `geojson_file_create`. `areas_json_create` runs once at the end.
    """
    areas = mongo_functions.mongo_get(collection='pre_metropolitan_area')
    for area in areas:
        area_businesses = mongo_functions.mongo_get(
            collection='pre_business',
            filter={'tile10': {'$in': area['tiles']}})
        for zoom in ZOOMS:
            polygons = [
                map_functions.geojson_polygon_formatter(
                    {'business': entry['business']}, entry['tile'], zoom)
                for entry in business_count_per_tile(area_businesses, zoom)
            ]
            geojson_file_create(area['_id'], zoom, polygons)
    areas_json_create(areas)
def city_unique_users_get():
    """Print the number of unique visitors and residents of every area.

    Only reviews of the area that carry a `user_from` value are considered;
    the counting is delegated to `city_unique_visitors_get` and
    `city_unique_residents_get`.
    """
    for area in mongo_functions.mongo_get(collection='pre_metropolitan_area',
                                          fields={}):
        area_id = area['_id']
        reviews = mongo_functions.mongo_get(
            collection='pre_review',
            filter={
                'city_area': area_id,
                'user_from': {'$exists': True},
            },
            fields={'user_from': 1, 'user_id': 1})

        total_visitors = len(city_unique_visitors_get(reviews, area))
        total_residents = len(city_unique_residents_get(reviews, area))

        print('{} unique visitors -> {}'.format(area_id, total_visitors))
        print('{} unique residents -> {}'.format(area_id, total_residents))
def user_reviews_dict_create():
    """Group the `pre_review` documents that have a `city_area` by user.

    Returns:
        dict mapping each `user_id` to the list of that user's review
        documents, in the order returned by the query.
    """
    reviews = mongo_functions.mongo_get(
        collection='pre_review',
        filter={'city_area': {'$exists': True}})

    user_reviews_dict = {}
    for review in reviews:
        user_reviews_dict.setdefault(review['user_id'], []).append(review)
    return user_reviews_dict
def _get_all_years():
    """Collect every year key used in any business's `ratio_yearly`.

    Reads the businesses of `pre_business` that have a `ratio_yearly` field
    and stores the distinct keys of those mappings in the module-level
    `years` list. The list order is the set's iteration order, i.e. not
    sorted. Nothing is returned.
    """
    global years
    business_years_reviews = mongo_functions.mongo_get(
        collection='pre_business',
        filter={'ratio_yearly': {'$exists': True}},
        fields={'ratio_yearly': 1})

    _years = set()
    for business in business_years_reviews:
        # set.update instead of a list comprehension run only for its
        # side effect.
        _years.update(business['ratio_yearly'].keys())
    years = list(_years)
def main():
    """Print the total business counts over all metropolitan areas.

    Sums the `businesses` and `city_businesses` fields of every document in
    `pre_metropolitan_area` and prints both totals.
    """
    metropolitan_area_counter = 0
    city_counter = 0
    for area in mongo_functions.mongo_get(
            collection='pre_metropolitan_area',
            fields={'businesses': 1, 'city_businesses': 1}):
        metropolitan_area_counter += area['businesses']
        city_counter += area['city_businesses']

    # The totals are business counts, so the labels say "businesses"
    # (they previously said "tiles").
    print('businesses in metropolitan_area {}'.format(
        metropolitan_area_counter))
    print('businesses in cities {}'.format(city_counter))
def prepare():
    """Store `norm_categories` on the businesses of every metropolitan area.

    For each area the businesses on its zoom-10 tiles are loaded with their
    `name` and `categories`, passed to `business_prepare` together with the
    category dictionary, and written back with only `norm_categories` set.
    """
    category_dict = category_dict_prepare()
    areas = mongo_functions.mongo_get(collection='pre_metropolitan_area',
                                      fields={'tiles': 1, 'name': 1})
    for area in areas:
        print('processing business for ', area['_id'])
        area_businesses = mongo_functions.mongo_get(
            collection='pre_business',
            filter={"tile10": {'$in': area['tiles']}},
            fields={'name': 1, 'categories': 1})

        # presumably sets `norm_categories` on each business in place --
        # the update expression below reads it from every item.
        business_prepare(area_businesses, category_dict)

        mongo_functions.batch_update(
            area_businesses,
            collection='pre_business',
            update='{"$set": {"norm_categories": item["norm_categories"]}}')
def prepare():
    """Write yearly-ratio GeoJSON files for every (area, zoom) pair.

    Only businesses on the area's zoom-10 tiles that have `ratio_yearly`
    are used. For each zoom in `ZOOMS` they are grouped with `ratios_group`,
    turned into features with `features_prepare` and written with
    `geojson_file_create`.
    """
    areas = mongo_functions.mongo_get(collection='pre_metropolitan_area')
    for area in areas:
        query = {
            'tile10': {'$in': area['tiles']},
            'ratio_yearly': {'$exists': True},
        }
        projection = {
            'ratio_yearly': 1,
            'tile15': 1,
            'tile18': 1,
            'norm_categories': 1,
        }
        businesses = mongo_functions.mongo_get(collection='pre_business',
                                               filter=query,
                                               fields=projection)
        for zoom in ZOOMS:
            features = features_prepare(ratios_group(businesses, zoom), zoom)
            geojson_file_create(area['_id'], zoom, features)
def main():
    """Compute the raw dissimilarity ratio of each business and store it.

    For every metropolitan area, the reviews that carry a `user_from` value
    are loaded, the unique visitors and residents are counted, and the
    reviews are grouped per business. `business_dissimilarity_ratio` is
    evaluated for each business and saved as `raw_ratio` in `pre_business`.
    """
    for area in mongo_functions.mongo_get(collection='pre_metropolitan_area',
                                          fields={}):
        reviews = mongo_functions.mongo_get(
            collection='pre_review',
            filter={
                'city_area': area['_id'],
                'user_from': {'$exists': True},
            },
            fields={
                'business_id': 1,
                'user_id': 1,
                'tile15': 1,
                'tile18': 1,
                'user_from': 1,
            })
        total_visitors = len(city_unique_visitors_get(reviews, area))
        total_residents = len(city_unique_residents_get(reviews, area))

        grouped = business_reviews_dict_group(reviews)
        business_list = [
            {
                '_id': business_id,
                'raw_ratio': business_dissimilarity_ratio(
                    business_reviews, total_visitors, total_residents, area),
            }
            for business_id, business_reviews in grouped.items()
        ]

        mongo_functions.batch_update(
            business_list,
            collection='pre_business',
            update='{"$set": {"raw_ratio": item["raw_ratio"]}}')
def metropolitan_area_name(metropolitan_area):
    """Name the area after the most frequent `city` among its businesses.

    The chosen city is stored as both `name` and `_id` on the given dict,
    which is modified in place. On a tie, the city encountered first wins
    because the sort is stable.

    Raises:
        IndexError: if no business is found on the area's tiles.
    """
    city_docs = mongo_functions.mongo_get(
        collection='pre_business',
        filter={'tile10': {'$in': metropolitan_area['tiles']}},
        fields={'city': 1}
    )
    city_names = [doc['city'] for doc in city_docs]

    occurrences = {}
    for city in city_names:
        occurrences[city] = occurrences.get(city, 0) + 1

    ranking = sorted(occurrences.items(),
                     key=lambda pair: pair[1],
                     reverse=True)
    metropolitan_area['name'] = ranking[0][0]
    metropolitan_area['_id'] = metropolitan_area['name']
def export():
    """Export businesses with a `raw_ratio` to `business_category_ratio.csv`.

    For every area, category and sort direction (ascending, then descending
    `raw_ratio`), the businesses returned by `_business_sorted_get` are
    turned into CSV rows. A business already exported under an earlier
    query is skipped. Each row has the base fields followed by one column
    per category, holding '1' when the business has that normalized
    category and '' otherwise. Values are joined with ';' without escaping.
    """
    categories = _business_categories_get()
    business = {}
    csv_fields = [
        '_id', 'area_id', 'ratio', 'name', 'review_count', 'stars',
        'categories'
    ] + categories

    for area in mongo_functions.mongo_get(collection='pre_metropolitan_area'):
        for category in categories:
            for order in [1, -1]:
                _business = _business_sorted_get(
                    filter={
                        'tile10': {'$in': area['tiles']},
                        'norm_categories': category,
                        'raw_ratio': {'$exists': True},
                    },
                    sort=('raw_ratio', order),
                    fields={
                        'categories': 1,
                        'name': 1,
                        'review_count': 1,
                        'stars': 1,
                        'norm_categories': 1,
                        'raw_ratio': 1,
                    })
                for venue in _business:
                    venue_id = venue['_id']
                    if venue_id in business:
                        continue

                    venue['area_id'] = area['_id']
                    venue['categories'] = ','.join(venue['categories'])
                    venue['ratio'] = venue['raw_ratio']
                    # One flag column per known category.
                    for _cat in categories:
                        has_category = _cat in venue['norm_categories']
                        venue[_cat] = '1' if has_category else ''

                    business[venue_id] = [
                        str(venue[field]) for field in csv_fields
                    ]

    rows = [';'.join(row) for row in business.values()]
    content = ';'.join(csv_fields) + '\n' + '\n'.join(rows)
    write_csv_file('business_category_ratio.csv', content)
    print('yey')
def prepare():
    """Attach each user's reviews and upsert the users into `pre_user`.

    Users are read from `user` and consumed from the end of the list with
    `pop`, so they are processed in reverse order. A user that appears in
    the dict from `user_reviews_dict_create` gets those reviews under
    `reviews`, is passed to `_user_reviews_prepare`, and is queued for the
    upsert. Progress is written to stdout.
    """
    user_reviews_dict = user_reviews_dict_create()
    _users = mongo_functions.mongo_get(collection='user')
    users = []
    users_len = len(_users)
    processed = 0

    print('\n')
    while _users:
        processed += 1
        sys.stdout.write('\rProcessing user {}/{}...'.format(processed,
                                                             users_len))
        sys.stdout.flush()

        _user = _users.pop()
        # NOTE(review): nesting reconstructed from a whitespace-collapsed
        # source -- this assumes users without reviews are neither prepared
        # nor upserted; confirm against the original file.
        if _user['_id'] not in user_reviews_dict:
            continue
        _user['reviews'] = user_reviews_dict[_user['_id']]
        _user_reviews_prepare(_user)
        users.append(_user)

    mongo_functions.batch_upsert(users,
                                 collection='pre_user',
                                 update='{"$set": item}')
def _reviews_per_year(area):
    """Group an area's reviews that have `user_from` by the year of `date`.

    Args:
        area: area document; only its `_id` is read.

    Returns:
        dict mapping each year (int) to the list of review documents whose
        `date`, parsed as '%Y-%m-%d', falls in that year.

    Raises:
        ValueError: if a review's `date` does not match '%Y-%m-%d'.
    """
    reviews = mongo_functions.mongo_get(
        collection='pre_review',
        filter={
            'city_area': area['_id'],
            'user_from': {'$exists': True},
        },
        fields={
            'business_id': 1,
            'user_id': 1,
            'tile15': 1,
            'tile18': 1,
            'user_from': 1,
            'date': 1,
        })

    reviews_per_year_dict = {}
    for review in reviews:
        review_date = datetime.datetime.strptime(review['date'], '%Y-%m-%d')
        reviews_per_year_dict.setdefault(review_date.year, []).append(review)
    return reviews_per_year_dict
def business_count(tiles):
    """Return how many `pre_business` documents have `tile10` in `tiles`.

    Args:
        tiles: tile ids matched against `tile10` with `$in`.

    Returns:
        The length of the query result.
    """
    matching = mongo_functions.mongo_get(
        collection='pre_business',
        filter={'tile10': {'$in': tiles}},
        fields={})
    return len(matching)