예제 #1
0
def scrape_wiki_list(url):
    """Validate that the wiki neighborhood table at *url* loads, then seed
    the `coordinates` DB table from the local NYC_neighborhood.csv file.

    Fixes vs. original: the webdriver is now always quit (it leaked), and
    the unused `rows` scrape result was dropped — the selenium step only
    confirms the page renders; all data comes from the CSV.
    """
    webdriver = _get_webdriver()
    try:
        webdriver.get(url)
        table_xpath = '//table[@class="wikitable sortable jquery-tablesorter"]'
        # wait up to 10s for the sortable table to render
        WebDriverWait(webdriver, 10).until(
            EC.presence_of_element_located((By.XPATH, table_xpath)))
    finally:
        # release the browser even if the wait times out
        webdriver.quit()

    neighborhood_table = pd.read_csv(os.getcwd() + '/NYC_neighborhood.csv',
                                     keep_default_na=False)

    # append the borough (the CSV column name) to each neighborhood and
    # flatten the table into one list of 'Name,Borough' strings
    neighborhood_table = neighborhood_table.apply(lambda x: x + ',' + x.name)
    neighborhood_list = neighborhood_table.values.flatten()
    # drop empty cells, which reduce to just ',Borough'
    neighborhood_list = list(
        filter(lambda x: not x.startswith(','), neighborhood_list))
    # URL-encode spaces for the downstream geocoding request
    neighborhood_list = [
        neighborhood.replace(' ', '+') for neighborhood in neighborhood_list
    ]

    # geocode each neighborhood and persist to the coordinates table
    db = Database()
    update_sql = """INSERT INTO coordinates (neighborhood, lat, lng)
                    VALUES (%s, %s, %s)"""
    for neighborhood in neighborhood_list:
        # NOTE(review): 'get_corrdinates_from_name' is misspelled at its
        # definition site; keep the call matching until renamed there.
        lat, lng = get_corrdinates_from_name(neighborhood)
        db.insert_row(update_sql, neighborhood, lat, lng)
예제 #2
0
def get_neighborhoods_dataframe() -> pd.DataFrame:
    '''Return all rows of the `coordinates` table with neighborhood names
    decoded back to display form (e.g. "Astoria, Queens").

    Fixes vs. original: the return annotation was `pd.DataFrame()` (a call,
    which built a throwaway DataFrame at import time); `regex=False` is now
    passed so '+' is treated literally rather than as a regex quantifier
    under pandas' default regex matching.
    '''
    db = Database()
    neighborhoods_sql = '''SELECT *
                            FROM coordinates'''
    df = db.select_df(neighborhoods_sql)
    # names are stored URL-encoded: '+' for spaces, no space after commas
    df['neighborhood'] = (df['neighborhood']
                          .str.replace('+', ' ', regex=False)
                          .str.replace(',', ', ', regex=False))
    return df
예제 #3
0
def _get_coordinates_list():
    """Return [[display_name, "lat,lng"], ...] for every row of the
    `coordinates` table.

    Fix vs. original: an initial list comprehension copied `results` into
    `coordinates_list` only to be immediately overwritten — removed.
    """
    db = Database()
    # fetch the list of NYC neighborhoods' coordinates for the search
    get_lat_lng = '''SELECT neighborhood, lat, lng
                        FROM coordinates'''
    results = db.select_rows(get_lat_lng)
    # decode '+'-encoded names and join lat/lng into a "lat,lng" string
    return [[name.replace('+', ' '), '{},{}'.format(lat, lng)]
            for name, lat, lng in results]
예제 #4
0
def fetch_data() -> pd.DataFrame:
    """Return halal reviews joined with their restaurant/business data.

    Fix vs. original: the return annotation was `pd.DataFrame()` — a call
    that constructed a throwaway DataFrame every time the def executed.
    """
    db = Database()
    # get halal-reviews and its restaurant data
    data_sql = '''SELECT b.platform_id, b.name as restaurant_name, r.review_text, r.username, r.rating,
                concat(review_date,date) as review_date, r.helpful_count, b.address, b.image_url,
                b.lat, b.lng, b.total_review_count, b.total_halal_review_count
                FROM reviews r
                JOIN businesses b
                ON r.restaurant_id = b.platform_id
                WHERE r.review_text IS NOT NULL '''
    data_df = db.select_df(data_sql)
    return data_df
예제 #5
0
def _get_unscraped_urls():
    """Return (yelp_urls, google_urls): lists of (url, platform_id) rows for
    businesses that do not yet have any scraped reviews.

    Fix vs. original: the already-scraped exclusion list is now a set, so
    each membership test is O(1) instead of an O(n) list scan.
    """
    db = Database()
    # get list of google and yelp urls
    get_urls = '''SELECT url, platform_id
                    FROM businesses
                    WHERE url LIKE %s '''
    yelp_urls = db.select_rows(get_urls, ('%yelp%', ))
    google_urls = db.select_rows(get_urls, ('%google%', ))

    # exclude businesses that have already been scraped
    scraped_ids = db.select_rows(
        '''SELECT DISTINCT ON (restaurant_id) restaurant_id FROM reviews''')
    exclusion_set = {item[0] for item in scraped_ids}
    yelp_urls_keep = [t for t in yelp_urls if t[1] not in exclusion_set]
    google_urls_keep = [t for t in google_urls if t[1] not in exclusion_set]
    return yelp_urls_keep, google_urls_keep
예제 #6
0
def _update_reviews(reviews_list):
    '''Insert scraped reviews into the `reviews` table.

    *reviews_list* is a nested list: one sub-list per restaurant, each entry
    a 6-tuple of (restaurant_id, username, rating, review_text, date,
    helpful_count). Restaurants without reviews get an empty entry with
    platform_id only. Duplicate review_text rows are skipped by the
    ON CONFLICT clause.

    Fix vs. original: removed commented-out dead code (a batch
    `db.insert_rows` call and a stray debug print).
    '''
    db = Database()
    # update database with reviews scraping results
    reviews_sql = """INSERT INTO reviews (restaurant_id, username, rating, review_text, date, helpful_count)
                    VALUES (%s, %s, %s, %s, %s, %s )
                    ON CONFLICT (review_text) DO NOTHING"""
    # flatten the per-restaurant sub-lists into one list of review tuples
    db_list = [item for sublist in reviews_list for item in sublist]
    # NOTE(review): rows are inserted one at a time; a batch insert was
    # commented out in the original — confirm conflict handling before
    # switching back to a single multi-row insert.
    for review in db_list:
        db.insert_row(reviews_sql, *review)

    # print summary statement
    timestamped_print('Attempted to insert {} reviews'.format(len(db_list)))
예제 #7
0
def get_restaurant_dataframe(sort_by='') -> pd.DataFrame:
    """Return a 20-row slice of yelp businesses for testing purposes.

    sort_by: unless it equals 'Halal Score', rows are sorted by
    total_review_count (matches the original behavior).

    Fixes vs. original: the return annotation was `pd.DataFrame()` (a call);
    an address-cleaning lambda used `str` as its loop variable, shadowing
    the builtin.
    """
    db = Database()
    yelp_sql = '''SELECT *
                    FROM businesses
                    WHERE url LIKE %s '''
    data = db.select_rows(yelp_sql, ('%yelp%', ))
    # for testing: keep only the first 20 businesses
    df = pd.DataFrame(data)[:20]
    df.columns = [
        'name', 'platform_id', 'url', 'total_review_count', 'address', 'id'
    ]
    # keep only alphanumerics, commas, and spaces, then split into parts
    df.address = df.address.map(
        lambda address: re.sub(r'[^A-Za-z0-9, ]+', '', address).split(','))
    # normalize comma spacing ("a ,b" -> "a, b"); don't shadow builtin `str`
    df.address = df.address.map(
        lambda address: ', '.join(part.strip() for part in address))
    # placeholder score in [1, 5] until a real model provides one
    df['score'] = np.random.randint(1, 6, df.shape[0])
    df['image_url'] = 'https://s3-media0.fl.yelpcdn.com/bphoto/h92NeXrAhC_SCM-Fa77J5A/258s.jpg'
    if sort_by != 'Halal Score':
        df.sort_values('total_review_count', inplace=True)
    return df
예제 #8
0
def custom_csv_to_db():
    """Seed the `coordinates` table from the local NYC_neighborhood.csv.

    The CSV has one column per borough; each cell is a neighborhood name.
    NOTE(review): this duplicates the CSV/DB tail of scrape_wiki_list —
    consider extracting a shared helper.
    """
    neighborhood_table = pd.read_csv(
        os.path.join(os.getcwd(), 'NYC_neighborhood.csv'),
        keep_default_na=False)

    # append the borough (column name) to every neighborhood, flatten the
    # table, drop empty cells (which reduce to just ',Borough'), and
    # URL-encode spaces for the downstream geocoding request
    neighborhood_table = neighborhood_table.apply(lambda x: x + ',' + x.name)
    neighborhood_list = [
        cell.replace(' ', '+')
        for cell in neighborhood_table.values.flatten()
        if not cell.startswith(',')
    ]

    # geocode each neighborhood and persist to the coordinates table
    db = Database()
    update_sql = """INSERT INTO coordinates (neighborhood, lat, lng)
                    VALUES (%s, %s, %s)"""
    for neighborhood in neighborhood_list:
        # NOTE(review): 'get_corrdinates_from_name' is misspelled at its
        # definition site; keep the call matching until renamed there.
        lat, lng = get_corrdinates_from_name(neighborhood)
        db.insert_row(update_sql, neighborhood, lat, lng)
예제 #9
0
def _update_businesses(businesses_list):
    """Insert API search results into `businesses` and log summary counts.

    *businesses_list* is a nested list: one sub-list per search, each entry
    an 8-tuple matching the INSERT column order. Duplicate urls are skipped
    by the ON CONFLICT clause.

    Bug fix vs. original: the yelp count query passed `('%yelp%')` — a bare
    string, not a tuple (missing trailing comma) — so the DB driver received
    six single-character parameters instead of one LIKE pattern.
    """
    db = Database()
    # update database with API search results
    business_sql = """INSERT INTO businesses (name, platform_id, url, total_review_count, address, image_url, lat, lng)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
                    ON CONFLICT (url) DO NOTHING"""
    db_list = [item for sublist in businesses_list for item in sublist]
    db.insert_rows(business_sql, *db_list)

    # final print with total number of businesses in the database at the end
    count_sql = '''SELECT count(*)
                    FROM businesses
                    WHERE url LIKE %s '''
    google_count = db.select_rows(count_sql, ('%google%', ))[0][0]
    yelp_count = db.select_rows(count_sql, ('%yelp%', ))[0][0]
    timestamped_print('{} total businesses found'.format(len(db_list)))
    timestamped_print(
        'Have {0} google businesses and {1} yelp businesses'.format(
            google_count, yelp_count))
예제 #10
0
def confirm_neighborhoods():
    """Reverse-geocode every stored coordinate into a `reversed_name` column
    so it can be compared with the stored neighborhood name, and return the
    resulting dataframe.

    Fix vs. original: the computed dataframe was discarded (the function
    had no return); returning it is backward-compatible for any caller
    that ignored the result.
    """
    db = Database()
    neighborhoods = db.select_df('''SELECT * FROM coordinates''')
    neighborhoods['reversed_name'] = neighborhoods.apply(
        lambda row: get_name_from_coordinates(row.lat, row.lng), axis=1)
    return neighborhoods
예제 #11
0
##3 Libraries
import random

# add /src/data/data_collection to sys.path to import custom scripts
import sys, os

sys.path.append(os.getcwd() + '/src/data/data_collection/')

from storage_managers.database import Database

import Yelp_business_search
import Google_business_search

### sample location
db = Database()
# fetch the list of NYC neighborhoods' coordinates
get_lat_lng = '''SELECT neighborhood, lat, lng
                    FROM coordinates'''
results = db.select_rows(get_lat_lng)
# build [display_name, "lat,lng"] pairs; the original first copied
# `results` into an unused list — that dead comprehension was removed
coordinates_list = [[
    coord[0].replace('+', ' '), ','.join([str(coord[1]),
                                          str(coord[2])])
] for coord in results]
# pick one random neighborhood's coordinates for the smoke tests below
sample_coord = random.sample(coordinates_list, 1)
coordinates = sample_coord[0]
# Yelp_business_search.get_yelp_places_by_location('OzonePark', 'restaurant', 'Halal')

# # test google restaurant search around a location
# biz_list = Google_business_search.get_google_places_by_location(coordinates=coordinates)
# print('Found {} businesses in {}'.format(len(biz_list), coordinates[0]))
    sys.path.append(module_path)

# now that the folder is in the path, ../data_collection/database.py can be imported
from storage_managers.database import Database

### setup logging file ###
# log into a text file
sys.stdout = open(
    '/Users/wesamazaizeh/Desktop/Projects/halal_o_meter/src/features/feature_engineering/Feature_Engineering_v5_log.txt',
    "w")

###################################################################################################

## import review, restaurant, and target data

db = Database()
# get halal-reviews (reviews that include the word 'halal')
reviews_sql = '''SELECT * FROM reviews'''
reviews_df = db.select_df(reviews_sql)
print('- {} reviews containing the word halal were scraped'.format(
    reviews_df.shape[0]))

# get target restaurants-of-interest list
file_path = '/Users/wesamazaizeh/Desktop/Projects/halal_o_meter/src/features/target_feature/label_target.csv'
target_df = pd.read_csv(file_path, index_col=0)
target_df['halal'] = target_df['halal'].str.replace('FLASE', 'FALSE')
target_df['halal'] = target_df['halal'].apply(lambda x: True
                                              if x == 'TRUE' else False)
halal_frac = target_df['halal'].sum() / target_df.shape[0]

print('- {:.0f}% of the {} restaurants-of-interest are halal'.format(