# Example #1
# 0
import ast

#Third party modules
from flask import Flask, render_template, request
import numpy as np
from pandas import DataFrame
import pandas
from sklearn import linear_model
from sklearn.externals import joblib
from pygeocoder import Geocoder

import helper
import sql_database
import predict_rest

# Module-level DB handle; password redacted in this snapshot of the file.
db = sql_database.DbAccess('INSIGHT', usr='******')
app = Flask(__name__)
# NOTE(review): debug=True enables the interactive debugger -- must be off in
# production deployments.
app.debug = True

@app.route("/")
def hello():
    """Serve the site's landing page."""
    landing_page = 'home.html'
    return render_template(landing_page)

@app.route("/maps")
def maps():
    """Render the maps page, forwarding the query-string parameters.

    Each of ``restaurant``, ``miles`` and ``zipcode`` defaults to the
    empty string when absent from the request.
    """
    params = {}
    for key in ("restaurant", "miles", "zipcode"):
        params[key] = request.args.get(key, "")
    return render_template('maps.html', **params)

@app.route("/restaurant")
# Example #2
# 0
def main():
    """Mirror cached Yelp restaurant pages from MongoDB into a SQL table.

    Iterates over ``posts`` documents whose ``added_sql`` flag is False,
    parses the stored HTML with BeautifulSoup, extracts name, address,
    rating, review text and business attributes, and INSERTs one row per
    restaurant into the ``Restaurant`` table.  Duplicate restaurants
    (same ``FullName`` primary key) keep the listing with more reviews.

    NOTE(review): relies on names not visible in this excerpt
    (``client``, ``BeautifulSoup``, ``re``, ``mysql``, ``get_restaurant``,
    ``get_address``, ``get_reviews``, ``clean_review``) -- confirm they are
    imported/defined at module level.
    NOTE(review): Python 2 code (print statements, ``ur''`` literals,
    explicit ``.encode`` of str results).
    """
    db_mongo = client.yelp_database
    posts = db_mongo.posts

    # Yelp business attributes mirrored as CHAR columns in the SQL table;
    # also used below to build the INSERT column list in a fixed order.
    attrs = [
        'Alcohol', 'HasTV', 'NoiseLevel', 'RestaurantsAttire',
        'BusinessAcceptsCreditCards', 'Ambience', 'RestaurantsGoodForGroups',
        'Caters', 'WiFi', 'RestaurantsReservations', 'RestaurantsTakeOut',
        'GoodForKids', 'WheelchairAccessible', 'RestaurantsTableService',
        'OutdoorSeating', 'RestaurantsPriceRange2', 'RestaurantsDelivery',
        'GoodForMeal', 'BusinessParking'
    ]

    # Password redacted in this snapshot of the file.
    db_sql = sql_database.DbAccess('INSIGHT', usr='******')
    #db_sql.cursor.execute('DROP TABLE IF EXISTS Restaurant;')

    #Columns = 'ID INTEGER, Name CHAR(80), Street CHAR(80), City CHAR(40), State CHAR(10), Zip CHAR(10), FullName CHAR(200) NOT NULL PRIMARY KEY, '
    #Columns += 'Phone CHAR(50), Site CHAR(100), PictureUrl CHAR(150), Rating FLOAT, Favorites CHAR(200)'
    #Columns += ', RestaurantType CHAR(200), Latitude FLOAT, Longitude FLOAT, SimilarRest1 CHAR(100), SimilarRest2 CHAR(100), SimilarRest3 CHAR(100), NReviews INT, Review LONGTEXT'

    #for attr in attrs:
    #Columns += ', ' + attr + ' CHAR(80)'

    #db_sql.cursor.execute('CREATE TABLE Restaurant (' + Columns + ');')

    # Only process documents that have not been copied to SQL yet.
    # ``count`` doubles as both a progress counter and the SQL row ID.
    count = 0
    rests_info = posts.find({"added_sql": False})

    for rest_info in rests_info:
        # Progress heartbeat every 100 documents.
        if count % 100 == 0:
            print count
        count += 1
        n_reviews = rest_info['reviews']

        # Skip thinly-reviewed restaurants entirely.
        if n_reviews < 40:
            continue

        # Skip documents that never had their Yelp page cached.
        if 'yelp_page' not in rest_info:
            continue

        page = rest_info['yelp_page']
        soup = BeautifulSoup(page)

        # Canonical restaurant identifier (used later for the Mongo update).
        restaurant = get_restaurant(soup)

        divs = soup.find_all('div')

        # Accumulates per-restaurant fields keyed by SQL column name.
        new_info = {}

        bizRating = [div for div in divs if div.get("id") == "bizRating"]

        #Fails if page essentially has no info
        if len(bizRating) == 0:
            continue
        new_info["Rating"] = bizRating[0].meta['content']

        #Get name and address
        h1s = soup.find_all("h1")
        name = h1s[0].contents[0].strip()

        #Just skip restaurants with chinese characters in their name because I don't feel like dealing with the encoding right now
        if re.findall(ur'[\u4e00-\u9fff]+', name):
            continue
        spans = soup.find_all('span')

        # Address pieces come from schema.org-annotated <span> tags.
        telephone = get_address(spans, "telephone")
        street = get_address(spans, "streetAddress")
        city = get_address(spans, "addressLocality")
        state = get_address(spans, "addressRegion")
        zipcode = get_address(spans, "postalCode")
        full_name = name + ' ' + street + ' ' + city + ', ' + state + ' ' + zipcode

        # Photo is optional; 'NULL' is the literal stored when absent.
        picture_url = 'NULL'
        picture_div = [
            div for div in divs
            if div.get("class") and len(div.get("class")) > 1
            and div.get("class")[1] == "biz-photo-box"
        ]
        if len(picture_div) > 0:
            picture_url = picture_div[0].img['src']

        #Get ngrams from snippets
        review_snippets = [
            div for div in divs
            if div.get("class") and len(div.get("class")) > 1
            and div.get("class")[0] == "media-story"
            and div.get("class")[1] == "snippet"
        ]
        ngrams = []
        for snippet in review_snippets:
            # NOTE(review): ``object`` shadows the builtin, and the bare
            # ``except`` silently skips nodes without a ``get`` method.
            for object in snippet.contents:
                try:
                    if object.get('ngram'):
                        ngrams.append(object.get('ngram'))
                except:
                    continue
        new_info["Ngrams"] = ngrams

        #Grab all the header data on the restaurant page
        dds = soup.find_all('dd')
        for dd in dds:
            content = dd.contents[0]
            #Convert price to a scale from 1 to 4
            if dd['class'][0].find("RestaurantsPriceRange2") != -1:
                content = 4 - len(dd.span.span['data-remainder'])
            # Column name is the dd class with its 5-char prefix stripped.
            new_info[str(dd['class'][0])[5:]] = str(content)

        #Get the type of the restaurant
        bizInfo = [div for div in divs if div.get("id") == "bizInfoContent"]
        bizInfo_spans = bizInfo[0].find_all('span')
        category = [
            span for span in bizInfo_spans if span.get('id') == "cat_display"
        ][0]
        restaurant_type = [
            content.contents[0].lstrip() for content in category.contents
            if hasattr(content, 'contents')
        ]
        new_info['restaurant_type'] = restaurant_type

        #Get similar restaurants
        rec_bizs = [div for div in divs if div.get("id") == "bizSimilarBox"]
        if len(rec_bizs) == 0:
            continue
        rec_links = [
            rec_biz['href'] for rec_biz in rec_bizs[0].find_all('a')
            if str(rec_biz.get("id")).find("bizreclink") != -1
        ]
        # Exactly three similar restaurants are stored; skip otherwise.
        if len(rec_links) < 3:
            continue

        #Get latitude and longitude from map
        imgs = soup.find_all('img')
        if len([
                img['src'] for img in imgs
                if img.get("alt") == "Map of Business"
        ]) == 0:
            continue
        # NOTE(review): ``map`` and ``long`` shadow builtins.  Coordinates are
        # sliced out of the static-map image URL query string.
        map = [
            img['src'] for img in imgs if img.get("alt") == "Map of Business"
        ][0]
        lat = map[map.find('center') + 7:map.find("%2C")]
        long = map[map.find('%2C') + 3:map.find("&language")]

        # Review text: the main page plus up to four cached follow-up pages,
        # stripped to ASCII.
        review = clean_review(get_reviews(soup)).encode('ascii',
                                                        errors='ignore')

        for i in range(1, 5):
            if ('yelp_page' + str(i)) in rest_info:
                page = rest_info['yelp_page' + str(i)]
                soup = BeautifulSoup(page)
                review += clean_review(get_reviews(soup)).encode(
                    'ascii', errors='ignore')

        #Now insert all this information into SQL
        # NOTE(review): this INSERT is built by string concatenation of
        # scraped values -- SQL-injection-prone and breaks on embedded
        # quotes; should use parameterized queries (cursor.execute with
        # placeholders).
        Values = 'INSERT INTO Restaurant (ID, Name, Street, City, State, Zip, FullName, Phone, Site, PictureUrl, Rating, Favorites, RestaurantType'
        Values += ', Latitude, Longitude, SimilarRest1, SimilarRest2, SimilarRest3, NReviews, Review'
        for attr in attrs:
            Values += ', ' + attr

        Values += ') VALUES (' + str(
            count
        ) + ', "' + name + '", "' + street + '", "' + city + '", "' + state + '", "' + zipcode + '", "' + full_name + '", "'
        Values += telephone + '", "' + restaurant + '", "' + picture_url + '", '
        Values += bizRating[0].meta["content"] + ', "' + "---".join(
            ngrams) + '", "' + "---".join(restaurant_type) + '", '
        Values += lat.encode('utf-8') + ', ' + long.encode('utf-8') + ', "'
        Values += rec_links[0].encode('utf-8') + '", "' + rec_links[1].encode(
            'utf-8') + '", "' + rec_links[2].encode('utf-8') + '", '
        Values += str(n_reviews) + ', "' + review

        # Missing attributes are stored as the literal string "NULL".
        for attr in attrs:
            if attr not in new_info:
                new_info[attr] = "NULL"
            Values += '", "' + new_info[attr]

        Values += '");'
        print restaurant
        #Add restaurant to db. If restaurant `with name already in db use the one with more reviews
        try:
            db_sql.cursor.execute(Values)
        except mysql.connector.IntegrityError:
            # Duplicate FullName: keep whichever listing has more reviews.
            print "Caught exception"
            sql = ('SELECT NReviews FROM Restaurant WHERE FullName = "' +
                   full_name + '";')
            db_sql.cursor.execute(sql)
            old_restaurant = db_sql.cursor.fetchall()
            if len(old_restaurant) > 0:
                old_nreviews = old_restaurant[0][0]
                if n_reviews > old_nreviews:
                    db_sql.cursor.execute(
                        'DELETE FROM Restaurant WHERE FullName = "' +
                        full_name + '";')
                    db_sql.commit()
                    db_sql.cursor.execute(Values)

        # Mark the Mongo document as mirrored (upsert=True).
        db_mongo.posts.update({'restaurant': restaurant},
                              {"$set": {
                                  "added_sql": True
                              }}, True)
        rest_info['added_sql'] = True

        db_sql.commit()
# Example #3
# 0
#Standard modules
from sets import Set

#Third party modules
from pandas import DataFrame
import numpy as np
import pandas
from sklearn import linear_model
from sklearn.externals import joblib
from sklearn import cross_validation

import helper
import sql_database
db = sql_database.DbAccess('YELP', usr='******')

def main():
    """Build labeled restaurant-pair feature frames for match classification.

    Pulls two pandas DataFrames of pairwise features (type, price diff,
    rating diff, meal/table-service agreement, favorite foods) from the
    ``Restaurant`` table: pairs Yelp itself links as similar (label 1) and
    geographically-close pairs that are not linked (label 0).

    NOTE(review): ``pandas.io.sql.read_frame`` is long-deprecated (removed
    in modern pandas) -- replace with ``pandas.read_sql`` when upgrading.
    """
    # Positive examples: r2 lists r1 as one of its three "similar
    # restaurant" recommendations.
    df_match = pandas.io.sql.read_frame('''
       SELECT r1.RestaurantType as r1Type, r2.RestaurantType as r2Type, ABS(r1.RestaurantsPriceRange2 - r2.RestaurantsPriceRange2) as PriceDiff,
       ABS(r1.Rating - r2.Rating) as RatingDiff, r1.GoodForMeal=r2.GoodForMeal as MealSame,
       r1.RestaurantsTableService=r2.RestaurantsTableService as TableSame, r1.Favorites as r1Food, r2.Favorites as r2Food
       FROM Restaurant r1 JOIN Restaurant r2 ON r1.Site = r2.SimilarRest1 or r1.Site = r2.SimilarRest2 or r1.Site = r2.SimilarRest3;''',db.cnx)
    df_match['Match'] = np.ones(len(df_match))

    # Negative examples: distinct, unlinked restaurants within ~0.0007
    # degrees latitude and longitude of each other.
    df_nomatch = pandas.io.sql.read_frame('''
       SELECT r1.RestaurantType as r1Type, r2.RestaurantType as r2Type, ABS(r1.RestaurantsPriceRange2 - r2.RestaurantsPriceRange2) as PriceDiff,
       ABS(r1.Rating - r2.Rating) as RatingDiff, r1.GoodForMeal=r2.GoodForMeal as MealSame,
       r1.RestaurantsTableService=r2.RestaurantsTableService as TableSame, r1.Favorites as r1Food, r2.Favorites as r2Food
       FROM Restaurant r1 JOIN Restaurant r2 ON r1.Site != r2.SimilarRest1 AND r1.Site != r2.SimilarRest2 AND r1.Site != r2.SimilarRest3 AND r1.Site != r2.Site
       AND ABS(r1.Latitude - r2.Latitude) < 0.0007 and ABS(r1.Longitude - r2.Longitude) < 0.0007
       ;''',db.cnx)
    df_nomatch['Match'] = np.zeros(len(df_nomatch))