def extract_by_phone(phone_list=None):
    """Look up businesses on Yelp by phone number and store exact matches.

    Each phone number is normalized (spaces and underscores removed; an
    11-character number loses its first character) and skipped when it is
    already present in ``tb_log_phone`` or was already handled earlier in
    this call.  Otherwise the ``/phone_search`` endpoint is queried and the
    first returned business is kept only if its own ``phone`` field equals
    the normalized number.  The result is transformed with
    ``json_transform_phone``, loaded into ``tb_yelp_phone``, and the phone is
    recorded in ``tb_log_phone`` with the extraction timestamp.

    Args:
        phone_list: Iterable of phone numbers.  When ``None``, the ``PHONE``
            column of ``tb_insp_norm`` is read and used instead.
    """
    print("====Begin extraction by phone.")
    columns_list = mysqldao.column_names(db_name, tb_yelp_phone)

    if phone_list is None:
        rows = mysqldao.select(db_name, tb_insp_norm, ['PHONE'])
        phone_list = [row[0] for row in rows]

    exist_phone_tuple = mysqldao.select_unique_column(
        db_name, tb_log_phone, 'phone')
    # The log snapshot above is not refreshed inside the loop, so remember
    # what this run has handled to avoid querying a repeated number twice.
    handled_this_run = set()

    for phone in phone_list:
        phone = str(phone).replace(' ', '').replace('_', '')
        if len(phone) == 11:
            # presumably drops a leading country-code digit -- TODO confirm
            phone = phone[1:]
        if phone in exist_phone_tuple or phone in handled_this_run:
            continue
        handled_this_run.add(phone)

        # NOTE(review): the API credential is hard-coded here; consider
        # loading it from configuration instead of source code.
        url_params = {"phone": phone, 'ywsid': 'bxtstnNlHgO8c6W4X2yuYA'}
        biz_data = yelp_api.request(
            API_HOST, '/phone_search', url_params=url_params)[BUSINESS]

        # Only the first hit is considered, and only on an exact phone match.
        biz_list = []
        if biz_data:
            data_phone = dic_look_up(biz_data[0], 'phone')
            if data_phone == phone:
                biz_list.append(biz_data[0])
        print("Phone: %s count %d data" % (phone, len(biz_list)))

        # NOTE(review): load + log run once per queried phone, including when
        # no business matched -- confirm this nesting is the intended one.
        tran_list = json_transform_phone(biz_list, columns_list)
        load_data_db(db_name, tb_yelp_phone, tran_list)

        extracttime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        log_para_list = [{'extracttime': extracttime, 'phone': phone}]
        mysqldao.insert(db_name, tb_log_phone, log_para_list)
def extract_by_phone(phone_list=None):
    """Look up businesses on Yelp by phone number and store exact matches.

    Each phone number is normalized (spaces and underscores removed; an
    11-character number loses its first character) and skipped when it is
    already present in ``tb_log_phone`` or was already handled earlier in
    this call.  Otherwise the ``/phone_search`` endpoint is queried and the
    first returned business is kept only if its own ``phone`` field equals
    the normalized number.  The result is transformed with
    ``json_transform_phone``, loaded into ``tb_yelp_phone``, and the phone is
    recorded in ``tb_log_phone`` with the extraction timestamp.

    Args:
        phone_list: Iterable of phone numbers.  When ``None``, the ``PHONE``
            column of ``tb_insp_norm`` is read and used instead.
    """
    print("====Begin extraction by phone.")
    columns_list = mysqldao.column_names(db_name, tb_yelp_phone)

    if phone_list is None:
        rows = mysqldao.select(db_name, tb_insp_norm, ['PHONE'])
        phone_list = [row[0] for row in rows]

    exist_phone_tuple = mysqldao.select_unique_column(
        db_name, tb_log_phone, 'phone')
    # The log snapshot above is not refreshed inside the loop, so remember
    # what this run has handled to avoid querying a repeated number twice.
    handled_this_run = set()

    for phone in phone_list:
        phone = str(phone).replace(' ', '').replace('_', '')
        if len(phone) == 11:
            # presumably drops a leading country-code digit -- TODO confirm
            phone = phone[1:]
        if phone in exist_phone_tuple or phone in handled_this_run:
            continue
        handled_this_run.add(phone)

        # NOTE(review): the API credential is hard-coded here; consider
        # loading it from configuration instead of source code.
        url_params = {"phone": phone, 'ywsid': 'bxtstnNlHgO8c6W4X2yuYA'}
        biz_data = yelp_api.request(
            API_HOST, '/phone_search', url_params=url_params)[BUSINESS]

        # Only the first hit is considered, and only on an exact phone match.
        biz_list = []
        if biz_data:
            data_phone = dic_look_up(biz_data[0], 'phone')
            if data_phone == phone:
                biz_list.append(biz_data[0])
        print("Phone: %s count %d data" % (phone, len(biz_list)))

        # NOTE(review): load + log run once per queried phone, including when
        # no business matched -- confirm this nesting is the intended one.
        tran_list = json_transform_phone(biz_list, columns_list)
        load_data_db(db_name, tb_yelp_phone, tran_list)

        extracttime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        log_para_list = [{'extracttime': extracttime, 'phone': phone}]
        mysqldao.insert(db_name, tb_log_phone, log_para_list)
def extract_by_zipcode(url_params=url_params):
    """Run ``extract_full`` for every zipcode that has not been logged yet.

    Zipcodes are read from ``tb_zipcode``; those already present in
    ``tb_log_zipcode`` are reported as existing and skipped.  For the rest,
    the search parameters get a ``location`` of the form
    ``"New York, <zipcode>, NY"`` and a ``zipcode`` entry before being passed
    to ``extract_full``.

    Args:
        url_params: Base search parameters.  Defaults to the module-level
            ``url_params`` dict as bound when this function was defined.
            The mapping is never modified; each zipcode works on a copy.
    """
    print("====Begin extraction by zipcode.")
    zip_tuple_all = mysqldao.select(db_name, tb_zipcode, ['zipcode'])
    exist_zipcode_tuple = mysqldao.select_unique_column(
        db_name, tb_log_zipcode, 'zipcode')

    for code in zip_tuple_all:
        zipcode = str(code[0])
        if zipcode in exist_zipcode_tuple:
            print("%s existed." % zipcode)
            continue

        # A real (shallow) copy: plain assignment would alias the shared
        # default dict, leaking 'location'/'zipcode' and anything the callee
        # adds into later iterations and into the module-level dict.
        url_params_copy = dict(url_params)
        url_params_copy['location'] = "New York, " + zipcode + ", NY"
        url_params_copy['zipcode'] = zipcode
        extract_full(url_params_copy)
def extract_by_zipcode(url_params=url_params):
    """Run ``extract_full`` for every zipcode that has not been logged yet.

    Zipcodes are read from ``tb_zipcode``; those already present in
    ``tb_log_zipcode`` are reported as existing and skipped.  For the rest,
    the search parameters get a ``location`` of the form
    ``"New York, <zipcode>, NY"`` and a ``zipcode`` entry before being passed
    to ``extract_full``.

    Args:
        url_params: Base search parameters.  Defaults to the module-level
            ``url_params`` dict as bound when this function was defined.
            The mapping is never modified; each zipcode works on a copy.
    """
    print("====Begin extraction by zipcode.")
    zip_tuple_all = mysqldao.select(db_name, tb_zipcode, ['zipcode'])
    exist_zipcode_tuple = mysqldao.select_unique_column(
        db_name, tb_log_zipcode, 'zipcode')

    for code in zip_tuple_all:
        zipcode = str(code[0])
        if zipcode in exist_zipcode_tuple:
            print("%s existed." % zipcode)
            continue

        # A real (shallow) copy: plain assignment would alias the shared
        # default dict, leaking 'location'/'zipcode' and anything the callee
        # adds into later iterations and into the module-level dict.
        url_params_copy = dict(url_params)
        url_params_copy['location'] = "New York, " + zipcode + ", NY"
        url_params_copy['zipcode'] = zipcode
        extract_full(url_params_copy)
import json
import sys
import time
import urllib

import MySQLdb as mdb
import requests

import mysqldao

# SQL select-expression that joins the address parts into one string,
# aliased as ``address``.
# NOTE(review): as written, REPLACE(..., ' ', ' ') swaps a single space for a
# single space, i.e. it is a no-op.  Presumably it was meant to collapse the
# double space left by an empty address2 -- confirm and fix the SQL.
address_expr = (
    " REPLACE( CONCAT(address1, ' ',address2, ' ', city, ' ', state,' ', zip),"
    " ' ', ' ') address"
)

# presumably selects the rows whose longitude and latitude are both 0 (not
# geocoded yet) -- verify against mysqldao.select's filter arguments.
yelp_query = mysqldao.select('yelp', 'yelp_phone',
                             ['phone', address_expr, 'zip'],
                             ['longitude', 'latitude'],
                             [{'longitude': 0, 'latitude': 0}])
print(len(yelp_query))

# [phone, address] pairs for every row except the first.  Slicing (instead of
# ``del address[0]``) keeps the script alive when the query returns no rows.
# NOTE(review): the reason the first row is discarded is not visible here.
address = [[row[0], row[1]] for row in yelp_query[1:]]
entries = []
little = []

con = mdb.connect(host='localhost', user='******', passwd='', charset='utf8')
import json
import urllib

import requests

import mysqldao

# Geocodes ONE row of dwdproject.yelp_phone (the second row returned by the
# query) through the Google geocoding endpoint and writes the coordinates
# back, then prints a fixed sample row.  Python 2 script (urllib.quote).

GEOCODE_URL = 'http://maps.googleapis.com/maps/api/geocode/json?address="%s"&sensor=true'
TARGET_ROW = 1  # index of the single row this script geocodes

# SQL select-expression that joins the address parts, aliased as ``address``.
# NOTE(review): REPLACE(..., ' ', ' ') as written is a no-op -- confirm intent.
address_expr = (
    " REPLACE( CONCAT(address1, ' ',address2, ' ', city, ' ', state,' ', zip),"
    " ' ', ' ') address"
)
yelp_query = mysqldao.select('dwdproject', 'yelp_phone',
                             ['phone', address_expr, 'zip'])

# [phone, address] pairs and the matching geocoding request URLs.
address = [[row[0], row[1]] for row in yelp_query]
urls = [GEOCODE_URL % urllib.quote(adr[1]) for adr in address]

if len(address) <= TARGET_ROW:
    raise SystemExit("need at least %d rows to geocode, got %d"
                     % (TARGET_ROW + 1, len(address)))

resp = requests.get(urls[TARGET_ROW])
resp.raise_for_status()  # fail loudly on HTTP errors instead of parsing them
data = json.loads(resp.text)
if not data.get("results"):
    # An empty result list would otherwise surface as a bare IndexError.
    raise SystemExit("geocoding returned no results for %r (status: %s)"
                     % (address[TARGET_ROW][1], data.get("status")))

geo = data["results"][0]["geometry"]["location"]
data_entry = {
    'longitude': geo["lng"],
    'latitude': geo["lat"],
    'phone': address[TARGET_ROW][0],
}
# presumably sets longitude/latitude on the row matching ``phone`` -- verify
# against mysqldao.update's argument order.
mysqldao.update('dwdproject', 'yelp_phone', ['longitude', 'latitude'],
                ['phone'], [data_entry])

print(mysqldao.select('dwdproject', 'yelp_phone', ['*'], ['phone'],
                      [{'phone': '7188924968'}]))
import json
import urllib

import requests

import mysqldao

# Exploratory script: geocodes ONE row of yelp.yelp_phone (the second row
# returned by the query) through the Google geocoding endpoint and builds the
# would-be update record without writing it.  Python 2 (urllib.quote).

GEOCODE_URL = 'http://maps.googleapis.com/maps/api/geocode/json?address="%s"&sensor=true'
TARGET_ROW = 1  # index of the single row this script geocodes

# SQL select-expression that joins the address parts, aliased as ``address``.
# NOTE(review): REPLACE(..., ' ', ' ') as written is a no-op -- confirm intent.
address_expr = (
    " REPLACE( CONCAT(address1, ' ',address2, ' ', city, ' ', state,' ', zip),"
    " ' ', ' ') address"
)
yelp_query = mysqldao.select('yelp', 'yelp_phone',
                             ['phone', address_expr, 'zip'])
# Bare expression: echoes the value in an interactive session, has no effect
# when run as a script.
yelp_query

# [phone, address] pairs and the matching geocoding request URLs.
address = [[row[0], row[1]] for row in yelp_query]
urls = [GEOCODE_URL % urllib.quote(adr[1]) for adr in address]

if len(address) <= TARGET_ROW:
    raise SystemExit("need at least %d rows to geocode, got %d"
                     % (TARGET_ROW + 1, len(address)))

resp = requests.get(urls[TARGET_ROW])
resp.raise_for_status()  # fail loudly on HTTP errors instead of parsing them
data = json.loads(resp.text)
if not data.get("results"):
    # An empty result list would otherwise surface as a bare IndexError.
    raise SystemExit("geocoding returned no results for %r (status: %s)"
                     % (address[TARGET_ROW][1], data.get("status")))

geo = data["results"][0]["geometry"]["location"]
data_entry = {
    'longitude': geo["lng"],
    'latitude': geo["lat"],
    'phone': address[TARGET_ROW][0],
}
# Bare expression: interactive echo only (see above).
data_entry
import mysqldao import urllib import json import requests address = """ REPLACE( CONCAT(address1, ' ',address2, ' ', city, ' ', state,' ', zip), ' ', ' ') address""" yelp_query = mysqldao.select('dwdproject', 'yelp_phone', ['phone', address, 'zip']) address = [[query[0], query[1]] for query in yelp_query] urls = [] for adr in address: url = 'http://maps.googleapis.com/maps/api/geocode/json?address="%s"&sensor=true' % urllib.quote( adr[1]) urls.append(url) resp = requests.get(urls[1]) data = json.loads(resp.text) geo = data["results"][0]["geometry"]["location"] data_entry = { 'longitude': geo["lng"], 'latitude': geo["lat"], 'phone': address[1][0] } mysqldao.update('dwdproject', 'yelp_phone', ['longitude', 'latitude'],
import mysqldao import urllib address = """ REPLACE( CONCAT(address1, ' ',address2, ' ', city, ' ', state,' ', zip), ' ', ' ') address""" yelp_query = mysqldao.select('yelp', 'yelp_phone', ['phone', address, 'zip']) # address = urllib.quote(yelp_query[1][1]) yelp_query import json import requests address = [[query[0], query[1]] for query in yelp_query] urls = [] for adr in address: url = 'http://maps.googleapis.com/maps/api/geocode/json?address="%s"&sensor=true' % urllib.quote( adr[1]) urls.append(url) resp = requests.get(urls[1]) data = json.loads(resp.text) geo = data["results"][0]["geometry"]["location"] data_entry = { 'longitude': geo["lng"], 'latitude': geo["lat"],
import yelp_web
import yelp_etl
import mysqldao
import requests

# Fills in scraped attributes for restaurants whose ``price_range`` column is
# the empty string: fetches each business page, cuts out the attribute
# section, parses it with yelp_web.attribute_match and updates the row by id.

BIZ_URL = "http://www.yelp.com/biz/"
keyword = 'price_range'

# All restaurant ids, then only those still missing the keyword column.
all_biz_id = mysqldao.select(
    yelp_etl.db_name, yelp_etl.tb_yelp_restaurant, ['id'])
list_biz_id = mysqldao.select(
    yelp_etl.db_name, yelp_etl.tb_yelp_restaurant,
    ['id'], [keyword], [{keyword: ''}])

total_biz = len(all_biz_id)
left_biz = len(list_biz_id)
count = 0

for biz in list_biz_id:
    bizid = biz[0]

    # Download the business page and split it into lines.
    url = BIZ_URL + bizid
    response = requests.get(url).text.encode('utf-8')
    linelist = str(response).split("\n")

    # Bounds of the attribute section, as reported by yelp_web.
    index = yelp_web.target_line_range(
        linelist, yelp_web.beginAttributeReg, yelp_web.endAttributeReg)
    section = linelist[index[0]:index[1]]

    attr_dict = yelp_web.attribute_match(section)
    attr_dict['id'] = bizid
    mysqldao.update(
        yelp_etl.db_name, yelp_etl.tb_yelp_restaurant,
        yelp_web.list_attribute, ['id'], [attr_dict])

    count += 1
    # Progress line: id, then how many businesses are still to be processed.
    print("%s %s %s %s" % (bizid, 'updated', "left ", left_biz - count))
import yelp_web
import yelp_etl
import mysqldao
import requests

# Fills in scraped attributes for restaurants whose ``price_range`` column is
# the empty string: fetches each business page, cuts out the attribute
# section, parses it with yelp_web.attribute_match and updates the row by id.

BIZ_URL = "http://www.yelp.com/biz/"
keyword = 'price_range'

# All restaurant ids, then only those still missing the keyword column.
all_biz_id = mysqldao.select(
    yelp_etl.db_name, yelp_etl.tb_yelp_restaurant, ['id'])
list_biz_id = mysqldao.select(
    yelp_etl.db_name, yelp_etl.tb_yelp_restaurant,
    ['id'], [keyword], [{keyword: ''}])

total_biz = len(all_biz_id)
left_biz = len(list_biz_id)
count = 0

for biz in list_biz_id:
    bizid = biz[0]

    # Download the business page and split it into lines.
    url = BIZ_URL + bizid
    response = requests.get(url).text.encode('utf-8')
    linelist = str(response).split("\n")

    # Bounds of the attribute section, as reported by yelp_web.
    index = yelp_web.target_line_range(
        linelist, yelp_web.beginAttributeReg, yelp_web.endAttributeReg)
    section = linelist[index[0]:index[1]]

    attr_dict = yelp_web.attribute_match(section)
    attr_dict['id'] = bizid
    mysqldao.update(
        yelp_etl.db_name, yelp_etl.tb_yelp_restaurant,
        yelp_web.list_attribute, ['id'], [attr_dict])

    count += 1
    # Progress line: id, then how many businesses are still to be processed.
    print("%s %s %s %s" % (bizid, 'updated', "left ", left_biz - count))
import json
import time
import urllib

import requests

import mysqldao

# SQL select-expression that joins the address parts into one string,
# aliased as ``address``.
# NOTE(review): as written, REPLACE(..., ' ', ' ') swaps a single space for a
# single space, i.e. it is a no-op.  Presumably it was meant to collapse the
# double space left by an empty address2 -- confirm and fix the SQL.
address_expr = (
    " REPLACE( CONCAT(address1, ' ',address2, ' ', city, ' ', state,' ', zip),"
    " ' ', ' ') address"
)

# presumably selects the rows whose longitude and latitude are both 0 (not
# geocoded yet) -- verify against mysqldao.select's filter arguments.
yelp_query = mysqldao.select('yelp', 'yelp_phone',
                             ['phone', address_expr, 'zip'],
                             ['longitude', 'latitude'],
                             [{'longitude': 0, 'latitude': 0}])
print(len(yelp_query))

# [phone, address] pairs for every row except the first.  Slicing (instead of
# ``del address[0]``) keeps the script alive when the query returns no rows.
# NOTE(review): the reason the first row is discarded is not visible here.
address = [[row[0], row[1]] for row in yelp_query[1:]]
entries = []
little = []