def extract_full(url_params=url_params,limit=LIMIT):
    url_params['offset']=0
    print "Extract in full mode with parameters", url_params
    extracttime = str(datetime.datetime.now())[0:19]
    first_extract = search_parameter(url_params)
    data=json.dumps(first_extract)
    data=json.loads(data)
    total = data['total']
    zipcode='Unknown'
    if 'zipcode' in url_params:
        zipcode=url_params['zipcode']
    log_para_list = [{'extracttime':extracttime, 'total':total, 'location':url_params['location'], 'zipcode':zipcode}]
    mysqldao.insert(db_name,tb_log_zipcode,log_para_list)
    columns_list = mysqldao.column_names(db_name, tb_yelp_zipcode)
    num_extract = total / limit +1
    data_entry_list = []
    print "Total:", str(total), "extract until 1000 data"
    count=0
    for i in range(0, num_extract):
        #print "----No."+str(i+1)+" extraction"
        url_params['offset'] = i*OFFSET
        if i==49:
            break
        else:
            biz_list_i = search_parameter(url_params)[BUSINESS]
            tran_list = json_transform(biz_list_i,columns_list)
            count+=load_data_db(db_name, tb_yelp_zipcode, tran_list)
    print "Insert",count,"in the last run"
예제 #2
0
def extract_full(url_params=url_params, limit=LIMIT):
    url_params['offset'] = 0
    print "Extract in full mode with parameters", url_params
    extracttime = str(datetime.datetime.now())[0:19]
    first_extract = search_parameter(url_params)
    data = json.dumps(first_extract)
    data = json.loads(data)
    total = data['total']
    zipcode = 'Unknown'
    if 'zipcode' in url_params:
        zipcode = url_params['zipcode']
    log_para_list = [{
        'extracttime': extracttime,
        'total': total,
        'location': url_params['location'],
        'zipcode': zipcode
    }]
    mysqldao.insert(db_name, tb_log_zipcode, log_para_list)
    columns_list = mysqldao.column_names(db_name, tb_yelp_zipcode)
    num_extract = total / limit + 1
    data_entry_list = []
    print "Total:", str(total), "extract until 1000 data"
    count = 0
    for i in range(0, num_extract):
        #print "----No."+str(i+1)+" extraction"
        url_params['offset'] = i * OFFSET
        if i == 49:
            break
        else:
            biz_list_i = search_parameter(url_params)[BUSINESS]
            tran_list = json_transform(biz_list_i, columns_list)
            count += load_data_db(db_name, tb_yelp_zipcode, tran_list)
    print "Insert", count, "in the last run"
예제 #3
0
def extract_by_phone(phone_list=None):
    print "====Begin extraction by phone."
    columns_list = mysqldao.column_names(db_name, tb_yelp_phone)
    biz_list = []
    if phone_list == None:
        phone_list = []
        temp_phone_list = mysqldao.select(db_name, tb_insp_norm, ['PHONE'])
        for p in temp_phone_list:
            phone_list.append(p[0])
    exist_phone_list = []
    exist_phone_tuple = mysqldao.select_unique_column(db_name, tb_log_phone,
                                                      'phone')
    count = 0
    for phone in phone_list:
        biz_data = {}
        biz_list = []
        phone = str(phone).replace(' ', '').replace('_', '')
        if len(phone) == 11:
            phone = phone[1:]
        if phone not in exist_phone_tuple:
            url_params = {"phone": phone, 'ywsid': 'bxtstnNlHgO8c6W4X2yuYA'}
            biz_data = yelp_api.request(API_HOST,
                                        '/phone_search',
                                        url_params=url_params)[BUSINESS]
            if len(biz_data) != 0:
                data_phone = dic_look_up(biz_data[0], 'phone')
                if data_phone == phone:
                    biz_list.append(biz_data[0])
            print "Phone:", phone, "count", len(biz_list), 'data'
            tran_list = json_transform_phone(biz_list, columns_list)
            load_data_db(db_name, tb_yelp_phone, tran_list)
            extracttime = str(datetime.datetime.now())[0:19]
            log_para_list = [{'extracttime': extracttime, 'phone': phone}]
            mysqldao.insert(db_name, tb_log_phone, log_para_list)
def extract_by_phone(phone_list=None):
    print "====Begin extraction by phone."
    columns_list = mysqldao.column_names(db_name, tb_yelp_phone)
    biz_list=[]
    if phone_list==None:
        phone_list=[]
        temp_phone_list=mysqldao.select(db_name, tb_insp_norm, ['PHONE'])
        for p in temp_phone_list:
            phone_list.append(p[0])
    exist_phone_list=[]
    exist_phone_tuple = mysqldao.select_unique_column(db_name,tb_log_phone,'phone')
    count = 0
    for phone in phone_list:
        biz_data={}
        biz_list=[]
        phone=str(phone).replace(' ','').replace('_','')
        if len(phone) ==11:
            phone = phone[1:]
        if phone not in exist_phone_tuple:
            url_params={
            "phone":phone,
            'ywsid':'bxtstnNlHgO8c6W4X2yuYA'
            }
            biz_data = yelp_api.request(API_HOST, '/phone_search', url_params=url_params)[BUSINESS]
            if len(biz_data) != 0:
                data_phone=dic_look_up(biz_data[0],'phone')
                if data_phone == phone:
                    biz_list.append(biz_data[0])
            print "Phone:",phone,"count",len(biz_list),'data'
            tran_list=json_transform_phone(biz_list,columns_list)
            load_data_db(db_name, tb_yelp_phone, tran_list)
            extracttime = str(datetime.datetime.now())[0:19]
            log_para_list = [{'extracttime':extracttime, 'phone':phone}]
            mysqldao.insert(db_name,tb_log_phone,log_para_list)
def get_review_html(bizid, start):
    url = BIZ_URL + bizid + PAGE_OFFSET + str(start)
    print url
    response = requests.get(url).text.encode("utf-8")
    linelist = str(response).split("\n")
    index = yelp_web.target_line_range(linelist, yelp_web.beginReviewReg, yelp_web.endReviewReg)
    reviewlist = linelist[index[0] : index[1]]
    review_dict_list = yelp_web.review_match(reviewlist, bizid)
    extracted = len(review_dict_list)
    if extracted > 0:
        mysqldao.insert(dbname, tb_yelp_review, review_dict_list)
    return extracted
def get_review_html(bizid, start):
    url = BIZ_URL + bizid + PAGE_OFFSET + str(start)
    print url
    response = requests.get(url).text.encode('utf-8')
    linelist = str(response).split("\n")
    index = yelp_web.target_line_range(linelist, yelp_web.beginReviewReg,
                                       yelp_web.endReviewReg)
    reviewlist = linelist[index[0]:index[1]]
    review_dict_list = yelp_web.review_match(reviewlist, bizid)
    extracted = len(review_dict_list)
    if extracted > 0:
        mysqldao.insert(dbname, tb_yelp_review, review_dict_list)
    return extracted
예제 #7
0
def load_data_db(dbname, tbname, data_entry_list):
    id_key = 'id'
    insert_list = []
    update_list = []
    exist_id_tuple = mysqldao.select_unique_column(dbname, tbname, id_key)
    for data in data_entry_list:
        if data[id_key] in exist_id_tuple:
            update_list.append(data)
        else:
            insert_list.append(data)
    #print 'Update exist '+str(len(update_list))+" data."
    #mysqldao.update(dbname, tbname, '', ['id'], update_list)
    #print 'Insert new '+str(len(insert_list))+" data."
    mysqldao.insert(dbname, tbname, insert_list)
    return len(insert_list)
def load_data_db(dbname, tbname, data_entry_list):
    id_key='id'
    insert_list=[]
    update_list=[]
    exist_id_tuple=mysqldao.select_unique_column(dbname, tbname, id_key)
    for data in data_entry_list:
        if data[id_key] in exist_id_tuple:
            update_list.append(data)
        else:
            insert_list.append(data)
    #print 'Update exist '+str(len(update_list))+" data."
    #mysqldao.update(dbname, tbname, '', ['id'], update_list)
    #print 'Insert new '+str(len(insert_list))+" data."
    mysqldao.insert(dbname,tbname,insert_list)
    return len(insert_list)
예제 #9
0
dbname='dwdproject'
tb_yelp_user='******'


USER_URL='http://www.yelp.com/user_details?userid='
target_user_list_query="""
    SELECT userid from dwdproject.log_user
    WHERE userid not in (select userid from dwdproject.yelp_user)
    """
target_user_list_query2="""SELECT userid, count(userid)  num 
from dwdproject.yelp_review
WHERE userid not in
(select userid from dwdproject.yelp_user)
group by userid
ORDER by num desc
"""
#target_user='******'
print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
target_user_list=mysqldao.execute_query(target_user_list_query)
for user in target_user_list:
	userid = user[0]
	url=USER_URL+str(userid)
	response=requests.get(url).text.encode('utf-8')
	linelist=str(response).split("\n")
	index=yelp_web.target_line_range(linelist, yelp_web.beginUserAttrReg, yelp_web.endUserAttrReg)
	attr_dict=yelp_web.user_attribute(linelist[index[0]:index[1]])
	attr_dict['userid']=userid
	mysqldao.insert(dbname,tb_yelp_user,[attr_dict])
	print "=======Insert user", userid