def scrape_wiki_list(url):
    """Scrape a sortable wikitable at `url`, then load NYC_neighborhood.csv,
    geocode each neighborhood, and insert the coordinates into the database.

    NOTE(review): this function looks like two merged routines. The selenium
    section scrapes table rows into `rows`, but `rows` is never used below —
    everything after it is an exact copy of custom_csv_to_db(). Presumably the
    wiki-scraping half was abandoned mid-refactor; confirm before relying on
    the `url` parameter for anything beyond the page load side effect.
    """
    webdriver = _get_webdriver()
    webdriver.get(url)
    table_xpath = '//table[@class="wikitable sortable jquery-tablesorter"]'
    # wait up to 10s for the table to be rendered (jQuery adds the
    # "jquery-tablesorter" class only after the page's JS runs)
    WebDriverWait(webdriver, 10).until(
        EC.presence_of_element_located((By.XPATH, table_xpath)))
    # NOTE(review): dead result — `rows` is never read after this line
    rows = webdriver.find_elements_by_xpath(table_xpath + '/tbody/tr')
    # keep_default_na=False so empty CSV cells come back as '' not NaN
    neighborhood_table = pd.read_csv(os.getcwd() + '/NYC_neighborhood.csv',
                                     keep_default_na=False)
    # add borough name to neighborhood name and return as a flat list
    # (each CSV column is a borough; x.name is the column/borough label)
    neighborhood_table = neighborhood_table.apply(lambda x: x + ',' + x.name)
    neighborhood_list = neighborhood_table.values.flatten()
    # entries starting with ',' came from blank cells — drop them
    neighborhood_list = list(
        filter(lambda x: not x.startswith(','), neighborhood_list))
    # URL-encode spaces for the downstream geocoding request
    neighborhood_list = [
        neighborhood.replace(' ', '+') for neighborhood in neighborhood_list
    ]
    # convert neighborhood_list to neighborhood_coordinates and update database
    db = Database()
    update_sql = """INSERT INTO coordinates (neighborhood, lat, lng) VALUES (%s, %s, %s)"""
    for neighborhood in neighborhood_list:
        # print(neighborhood)
        lat, lng = get_corrdinates_from_name(neighborhood)
        db.insert_row(update_sql, *(neighborhood, lat, lng))
def get_neighborhoods_dataframe() -> pd.DataFrame:
    """List of NYC neighborhoods to search in.

    Returns
    -------
    pd.DataFrame
        The `coordinates` table with the `neighborhood` column restored to
        human-readable form ('+' placeholders back to spaces, a space added
        after each borough comma).

    Notes
    -----
    Fixes vs. original: the return annotation was `pd.DataFrame()` — a call
    that built a throwaway empty frame at definition time; and the '+'
    replacement relied on pandas' legacy `regex=True` default, where a bare
    '+' is an invalid pattern. `regex=False` makes both replacements literal.
    """
    db = Database()
    neighborhoods_sql = '''SELECT * FROM coordinates'''
    df = db.select_df(neighborhoods_sql)
    # '+' was used to URL-encode spaces when the names were stored
    df['neighborhood'] = (df['neighborhood']
                          .str.replace('+', ' ', regex=False)
                          .str.replace(',', ', ', regex=False))
    return df
def _get_coordinates_list():
    """Fetch NYC neighborhoods' coordinates for the search.

    Returns
    -------
    list[list[str]]
        One ``[display_name, "lat,lng"]`` pair per row of the
        `coordinates` table, with '+' placeholders restored to spaces.

    Notes
    -----
    Fix vs. original: a first comprehension copied `results` into
    `coordinates_list` and was immediately overwritten — dead code removed.
    """
    db = Database()
    get_lat_lng = '''SELECT neighborhood, lat, lng FROM coordinates'''
    results = db.select_rows(get_lat_lng)
    return [[
        coord[0].replace('+', ' '), ','.join([str(coord[1]), str(coord[2])])
    ] for coord in results]
def fetch_data() -> pd.DataFrame:
    """Fetch all halal reviews joined with their restaurant data.

    Returns
    -------
    pd.DataFrame
        One row per non-null review, with business columns (platform id,
        name, address, image, coordinates, review counts) attached.

    Notes
    -----
    Fix vs. original: the return annotation was `pd.DataFrame()` — a call
    evaluated at definition time — rather than the type itself.
    NOTE(review): `concat(review_date,date)` merges two date columns;
    presumably only one is populated per row — confirm against the schema.
    """
    db = Database()
    data_sql = '''SELECT b.platform_id, b.name as restaurant_name, r.review_text, r.username, r.rating, concat(review_date,date) as review_date, r.helpful_count, b.address, b.image_url, b.lat, b.lng, b.total_review_count, b.total_halal_review_count FROM reviews r JOIN businesses b ON r.restaurant_id = b.platform_id WHERE r.review_text IS NOT NULL '''
    data_df = db.select_df(data_sql)
    return data_df
def _get_unscraped_urls():
    """Return yelp and google business URLs that have no reviews yet.

    Returns
    -------
    tuple[list, list]
        ``(yelp_urls_keep, google_urls_keep)`` — lists of
        ``(url, platform_id)`` rows whose restaurant has no scraped reviews.

    Notes
    -----
    Fix vs. original: the exclusion list was a plain list, making each
    membership test O(n); a set gives O(1) lookups with identical results.
    """
    db = Database()
    # get list of google and yelp urls
    get_urls = '''SELECT url, platform_id FROM businesses WHERE url LIKE %s '''
    yelp_urls = db.select_rows(get_urls, ('%yelp%', ))
    google_urls = db.select_rows(get_urls, ('%google%', ))
    # exclude businesses that have already been scraped
    scraped_ids = db.select_rows(
        '''SELECT DISTINCT ON (restaurant_id) restaurant_id FROM reviews''')
    exclusion_set = {item[0] for item in scraped_ids}
    yelp_urls_keep = [t for t in yelp_urls if t[1] not in exclusion_set]
    google_urls_keep = [t for t in google_urls if t[1] not in exclusion_set]
    return yelp_urls_keep, google_urls_keep
def _update_reviews(reviews_list):
    """Persist scraped reviews to the database.

    `reviews_list` is a nested list: one sub-list per restaurant, each entry
    holding (restaurant id, username, rating, review text, review date,
    helpful count). Restaurants without reviews contribute an empty entry
    carrying only the platform_id. Duplicate review texts are skipped via
    the ON CONFLICT clause. Prints a summary line when done.
    """
    db = Database()
    reviews_sql = """INSERT INTO reviews (restaurant_id, username, rating, review_text, date, helpful_count) VALUES (%s, %s, %s, %s, %s, %s ) ON CONFLICT (review_text) DO NOTHING"""
    # flatten the per-restaurant sub-lists into one sequence of rows
    flattened = [entry for batch in reviews_list for entry in batch]
    for entry in flattened:
        db.insert_row(reviews_sql, *entry)
    # summary of how many inserts were attempted (conflicts are silent)
    timestamped_print('Attempted to insert {} reviews'.format(len(flattened)))
def get_restaurant_dataframe(sort_by='') -> pd.DataFrame:
    """A slice of yelp businesses dataframe for testing purposes.

    Parameters
    ----------
    sort_by : str
        Any value other than 'Halal Score' sorts by total review count.

    Returns
    -------
    pd.DataFrame
        First 20 yelp businesses with cleaned addresses, a random
        placeholder `score`, and a stock `image_url`.

    Notes
    -----
    Fixes vs. original: return annotation was `pd.DataFrame()` (a call,
    not a type), and the address-join comprehension used `str` as its loop
    variable, shadowing the builtin.
    """
    db = Database()
    yelp_sql = '''SELECT * FROM businesses WHERE url LIKE %s '''
    data = db.select_rows(yelp_sql, ('%yelp%', ))
    # for testing: only the first 20 rows
    df = pd.DataFrame(data)[:20]
    df.columns = [
        'name', 'platform_id', 'url', 'total_review_count', 'address', 'id'
    ]
    # strip everything except letters/digits/comma/space, split on commas...
    df.address = df.address.map(
        lambda address: re.sub(r'[^A-Za-z0-9, ]+', '', address).split(','))
    # ...then rejoin with a uniform ', ' separator
    df.address = df.address.map(
        lambda address: ', '.join([part.strip() for part in address]))
    # placeholder score in 1..5 until real scoring is wired in
    df['score'] = np.random.randint(1, 6, df.shape[0])
    df['image_url'] = 'https://s3-media0.fl.yelpcdn.com/bphoto/h92NeXrAhC_SCM-Fa77J5A/258s.jpg'
    if sort_by != 'Halal Score':
        df.sort_values('total_review_count', inplace=True)
    return df
def custom_csv_to_db():
    """Load NYC neighborhoods from a local CSV, geocode them, and store
    the resulting coordinates in the `coordinates` table.

    Each CSV column is a borough; every neighborhood gets its borough name
    appended (comma-separated) and spaces URL-encoded as '+' before lookup.
    """
    # keep_default_na=False keeps blank cells as '' instead of NaN
    borough_table = pd.read_csv(os.getcwd() + '/NYC_neighborhood.csv',
                                keep_default_na=False)
    # add borough name to neighborhood name and return as a flat list
    borough_table = borough_table.apply(lambda col: col + ',' + col.name)
    flat_names = borough_table.values.flatten()
    # names starting with ',' came from blank cells — drop them,
    # then URL-encode spaces for the geocoding request
    cleaned_names = [
        name.replace(' ', '+') for name in flat_names
        if not name.startswith(',')
    ]
    # convert each neighborhood name to coordinates and update the database
    db = Database()
    update_sql = """INSERT INTO coordinates (neighborhood, lat, lng) VALUES (%s, %s, %s)"""
    for name in cleaned_names:
        lat, lng = get_corrdinates_from_name(name)
        db.insert_row(update_sql, name, lat, lng)
def _update_businesses(businesses_list):
    """Bulk-insert API search results into the `businesses` table.

    Parameters
    ----------
    businesses_list : list[list]
        Nested list of business rows: (name, platform_id, url,
        total_review_count, address, image_url, lat, lng). Duplicate URLs
        are skipped via the ON CONFLICT clause.

    Notes
    -----
    Bug fix vs. original: the yelp count query passed ``('%yelp%')`` — a
    plain string, not a one-element tuple — so the parameter binding for
    the LIKE placeholder was wrong; ``('%yelp%', )`` fixes it.
    """
    db = Database()
    # update database with API search results
    business_sql = """INSERT INTO businesses (name, platform_id, url, total_review_count, address, image_url, lat, lng) VALUES (%s, %s, %s, %s, %s, %s, %s, %s) ON CONFLICT (url) DO NOTHING"""
    db_list = [item for sublist in businesses_list for item in sublist]
    db.insert_rows(business_sql, *db_list)
    # final print with total number of businesses in the database at the end
    count_sql = '''SELECT count(*) FROM businesses WHERE url LIKE %s '''
    google_count = db.select_rows(count_sql, ('%google%', ))[0][0]
    yelp_count = db.select_rows(count_sql, ('%yelp%', ))[0][0]
    timestamped_print('{} total businesses found'.format(len(db_list)))
    timestamped_print(
        'Have {0} google businesses and {1} yelp businesses'.format(
            google_count, yelp_count))
def confirm_neighborhoods():
    """Reverse-geocode every stored coordinate pair as a sanity check.

    Returns
    -------
    pd.DataFrame
        The `coordinates` table with an added `reversed_name` column
        produced by reverse-geocoding each (lat, lng).

    Notes
    -----
    Fix vs. original: the function built the frame and then discarded it
    (no return), making the whole computation unobservable. Returning the
    frame is backward-compatible — existing callers that ignored the
    (previously None) result are unaffected.
    """
    db = Database()
    neighborhoods = db.select_df('''SELECT * FROM coordinates''')
    neighborhoods['reversed_name'] = neighborhoods.apply(
        lambda row: get_name_from_coordinates(row.lat, row.lng), axis=1)
    return neighborhoods
##3 Libraries
import random
# add /src/data/data_collection to sys.path to import custom scripts
import sys, os
sys.path.append(os.getcwd() + '/src/data/data_collection/')
from storage_managers.database import Database
import Yelp_business_search
import Google_business_search

### sample location
db = Database()
# fetch the list of NYC neighborhoods' coordinates
get_lat_lng = '''SELECT neighborhood, lat, lng FROM coordinates'''
results = db.select_rows(get_lat_lng)
# each row -> [display name, "lat,lng"]
# (fix vs. original: a throwaway copy of `results` was built into
# `coordinates_list` and immediately overwritten — dead code removed)
coordinates_list = [[
    coord[0].replace('+', ' '), ','.join([str(coord[1]), str(coord[2])])
] for coord in results]
# pick one random neighborhood to exercise the search APIs against
sample_coord = random.sample(coordinates_list, 1)
coordinates = sample_coord[0]
# Yelp_business_search.get_yelp_places_by_location('OzonePark', 'restaurant', 'Halal')
# # test google restaurant search around a location
# biz_list = Google_business_search.get_google_places_by_location(coordinates=coordinates)
# print('Found {} businesses in {}'.format(len(biz_list), coordinates[0]))
sys.path.append(module_path) # now that the folder is in the path, ../data_collection/database.py can be imported from storage_managers.database import Database ### setup logging file ### # log into a text file sys.stdout = open( '/Users/wesamazaizeh/Desktop/Projects/halal_o_meter/src/features/feature_engineering/Feature_Engineering_v5_log.txt', "w") ################################################################################################### ## import review, restaurant, and target data db = Database() # get halal-reviews (reviews that include the word 'halal') reviews_sql = '''SELECT * FROM reviews''' reviews_df = db.select_df(reviews_sql) print('- {} reviews containing the word halal were scraped'.format( reviews_df.shape[0])) # get target restaurants-of-interest list file_path = '/Users/wesamazaizeh/Desktop/Projects/halal_o_meter/src/features/target_feature/label_target.csv' target_df = pd.read_csv(file_path, index_col=0) target_df['halal'] = target_df['halal'].str.replace('FLASE', 'FALSE') target_df['halal'] = target_df['halal'].apply(lambda x: True if x == 'TRUE' else False) halal_frac = target_df['halal'].sum() / target_df.shape[0] print('- {:.0f}% of the {} restaurants-of-interest are halal'.format(