def extract_full(url_params=url_params,limit=LIMIT): url_params['offset']=0 print "Extract in full mode with parameters", url_params extracttime = str(datetime.datetime.now())[0:19] first_extract = search_parameter(url_params) data=json.dumps(first_extract) data=json.loads(data) total = data['total'] zipcode='Unknown' if 'zipcode' in url_params: zipcode=url_params['zipcode'] log_para_list = [{'extracttime':extracttime, 'total':total, 'location':url_params['location'], 'zipcode':zipcode}] mysqldao.insert(db_name,tb_log_zipcode,log_para_list) columns_list = mysqldao.column_names(db_name, tb_yelp_zipcode) num_extract = total / limit +1 data_entry_list = [] print "Total:", str(total), "extract until 1000 data" count=0 for i in range(0, num_extract): #print "----No."+str(i+1)+" extraction" url_params['offset'] = i*OFFSET if i==49: break else: biz_list_i = search_parameter(url_params)[BUSINESS] tran_list = json_transform(biz_list_i,columns_list) count+=load_data_db(db_name, tb_yelp_zipcode, tran_list) print "Insert",count,"in the last run"
def extract_full(url_params=url_params, limit=LIMIT): url_params['offset'] = 0 print "Extract in full mode with parameters", url_params extracttime = str(datetime.datetime.now())[0:19] first_extract = search_parameter(url_params) data = json.dumps(first_extract) data = json.loads(data) total = data['total'] zipcode = 'Unknown' if 'zipcode' in url_params: zipcode = url_params['zipcode'] log_para_list = [{ 'extracttime': extracttime, 'total': total, 'location': url_params['location'], 'zipcode': zipcode }] mysqldao.insert(db_name, tb_log_zipcode, log_para_list) columns_list = mysqldao.column_names(db_name, tb_yelp_zipcode) num_extract = total / limit + 1 data_entry_list = [] print "Total:", str(total), "extract until 1000 data" count = 0 for i in range(0, num_extract): #print "----No."+str(i+1)+" extraction" url_params['offset'] = i * OFFSET if i == 49: break else: biz_list_i = search_parameter(url_params)[BUSINESS] tran_list = json_transform(biz_list_i, columns_list) count += load_data_db(db_name, tb_yelp_zipcode, tran_list) print "Insert", count, "in the last run"
def extract_by_phone(phone_list=None): print "====Begin extraction by phone." columns_list = mysqldao.column_names(db_name, tb_yelp_phone) biz_list = [] if phone_list == None: phone_list = [] temp_phone_list = mysqldao.select(db_name, tb_insp_norm, ['PHONE']) for p in temp_phone_list: phone_list.append(p[0]) exist_phone_list = [] exist_phone_tuple = mysqldao.select_unique_column(db_name, tb_log_phone, 'phone') count = 0 for phone in phone_list: biz_data = {} biz_list = [] phone = str(phone).replace(' ', '').replace('_', '') if len(phone) == 11: phone = phone[1:] if phone not in exist_phone_tuple: url_params = {"phone": phone, 'ywsid': 'bxtstnNlHgO8c6W4X2yuYA'} biz_data = yelp_api.request(API_HOST, '/phone_search', url_params=url_params)[BUSINESS] if len(biz_data) != 0: data_phone = dic_look_up(biz_data[0], 'phone') if data_phone == phone: biz_list.append(biz_data[0]) print "Phone:", phone, "count", len(biz_list), 'data' tran_list = json_transform_phone(biz_list, columns_list) load_data_db(db_name, tb_yelp_phone, tran_list) extracttime = str(datetime.datetime.now())[0:19] log_para_list = [{'extracttime': extracttime, 'phone': phone}] mysqldao.insert(db_name, tb_log_phone, log_para_list)
def extract_by_phone(phone_list=None): print "====Begin extraction by phone." columns_list = mysqldao.column_names(db_name, tb_yelp_phone) biz_list=[] if phone_list==None: phone_list=[] temp_phone_list=mysqldao.select(db_name, tb_insp_norm, ['PHONE']) for p in temp_phone_list: phone_list.append(p[0]) exist_phone_list=[] exist_phone_tuple = mysqldao.select_unique_column(db_name,tb_log_phone,'phone') count = 0 for phone in phone_list: biz_data={} biz_list=[] phone=str(phone).replace(' ','').replace('_','') if len(phone) ==11: phone = phone[1:] if phone not in exist_phone_tuple: url_params={ "phone":phone, 'ywsid':'bxtstnNlHgO8c6W4X2yuYA' } biz_data = yelp_api.request(API_HOST, '/phone_search', url_params=url_params)[BUSINESS] if len(biz_data) != 0: data_phone=dic_look_up(biz_data[0],'phone') if data_phone == phone: biz_list.append(biz_data[0]) print "Phone:",phone,"count",len(biz_list),'data' tran_list=json_transform_phone(biz_list,columns_list) load_data_db(db_name, tb_yelp_phone, tran_list) extracttime = str(datetime.datetime.now())[0:19] log_para_list = [{'extracttime':extracttime, 'phone':phone}] mysqldao.insert(db_name,tb_log_phone,log_para_list)
def get_review_html(bizid, start): url = BIZ_URL + bizid + PAGE_OFFSET + str(start) print url response = requests.get(url).text.encode("utf-8") linelist = str(response).split("\n") index = yelp_web.target_line_range(linelist, yelp_web.beginReviewReg, yelp_web.endReviewReg) reviewlist = linelist[index[0] : index[1]] review_dict_list = yelp_web.review_match(reviewlist, bizid) extracted = len(review_dict_list) if extracted > 0: mysqldao.insert(dbname, tb_yelp_review, review_dict_list) return extracted
def get_review_html(bizid, start): url = BIZ_URL + bizid + PAGE_OFFSET + str(start) print url response = requests.get(url).text.encode('utf-8') linelist = str(response).split("\n") index = yelp_web.target_line_range(linelist, yelp_web.beginReviewReg, yelp_web.endReviewReg) reviewlist = linelist[index[0]:index[1]] review_dict_list = yelp_web.review_match(reviewlist, bizid) extracted = len(review_dict_list) if extracted > 0: mysqldao.insert(dbname, tb_yelp_review, review_dict_list) return extracted
def load_data_db(dbname, tbname, data_entry_list): id_key = 'id' insert_list = [] update_list = [] exist_id_tuple = mysqldao.select_unique_column(dbname, tbname, id_key) for data in data_entry_list: if data[id_key] in exist_id_tuple: update_list.append(data) else: insert_list.append(data) #print 'Update exist '+str(len(update_list))+" data." #mysqldao.update(dbname, tbname, '', ['id'], update_list) #print 'Insert new '+str(len(insert_list))+" data." mysqldao.insert(dbname, tbname, insert_list) return len(insert_list)
def load_data_db(dbname, tbname, data_entry_list): id_key='id' insert_list=[] update_list=[] exist_id_tuple=mysqldao.select_unique_column(dbname, tbname, id_key) for data in data_entry_list: if data[id_key] in exist_id_tuple: update_list.append(data) else: insert_list.append(data) #print 'Update exist '+str(len(update_list))+" data." #mysqldao.update(dbname, tbname, '', ['id'], update_list) #print 'Insert new '+str(len(insert_list))+" data." mysqldao.insert(dbname,tbname,insert_list) return len(insert_list)
dbname='dwdproject' tb_yelp_user='******' USER_URL='http://www.yelp.com/user_details?userid=' target_user_list_query=""" SELECT userid from dwdproject.log_user WHERE userid not in (select userid from dwdproject.yelp_user) """ target_user_list_query2="""SELECT userid, count(userid) num from dwdproject.yelp_review WHERE userid not in (select userid from dwdproject.yelp_user) group by userid ORDER by num desc """ #target_user='******' print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" target_user_list=mysqldao.execute_query(target_user_list_query) for user in target_user_list: userid = user[0] url=USER_URL+str(userid) response=requests.get(url).text.encode('utf-8') linelist=str(response).split("\n") index=yelp_web.target_line_range(linelist, yelp_web.beginUserAttrReg, yelp_web.endUserAttrReg) attr_dict=yelp_web.user_attribute(linelist[index[0]:index[1]]) attr_dict['userid']=userid mysqldao.insert(dbname,tb_yelp_user,[attr_dict]) print "=======Insert user", userid