def parse():
    """Fetch the start page and crawl every airport link listed on it."""
    response = request.get(start_url)
    if not response:
        return
    doc = etree.HTML(response.content)
    for link in doc.xpath("//div[@class='link-n clearfix']/a/@href"):
        get_airport(request.get(link))
def parse():
    """Crawl OpenRice district tag pages and scrape every shop listed.

    For a slice of the district tags found on the landing page, walks up to
    17 result pages per district, builds a shop item per listing, and hands
    it to shop_detail() for the detail crawl.
    """
    response = request.get(openrice_url)
    if response:
        html = etree.HTML(response.content)
        # District names come from the filter buttons' data-tag attributes.
        districts = html.xpath(
            "//div[@class='flex-wrap js-flex-wrap']/div[contains(@class, 'btn')]/@data-tag"
        )
        # NOTE(review): magic slice — presumably resumes a partial crawl at
        # tag 94 and stops at 167; confirm the intended range.
        for district in districts[94:167]:
            print('%s---开始' % district)
            for i in range(1, 18):
                # Skip aggregate tags such as '所有…' ("all ...").
                if district.startswith('所有'):
                    continue
                # Normalise "name (area)" tags into the URL form "name-area".
                # NOTE(review): this rebinds the loop variable, so the
                # replacement is re-applied on every page iteration.
                district = district.replace(' (', '-').replace(')', '')
                url = openrice_cuisine_url % (district, i)
                response = request.get(url)
                if response:
                    html = etree.HTML(response.content)
                    shops = html.xpath("//div[@class='content-cell-wrapper']")
                    for shop in shops:
                        # `item` is a module-level template dict; copy per shop.
                        shop_item = item.copy()
                        name = shop.xpath(
                            ".//h2[@class='title-name']/a/text()")[0]
                        if exist(name):
                            print(name + '已经存在')
                            continue
                        recomm_dish = get_node_list(
                            shop.xpath(
                                ".//ul[@class='dish-list']/li[@class='dish']/text()"
                            ))
                        book_status = get_book_status(
                            shop.xpath(
                                ".//a[@class='info-button info-offer-button']/text()"
                            ))
                        preferential = get_node_list(
                            shop.xpath(
                                ".//span[@class='info-text info-offer-text']/text()"
                            ))
                        shop_url = shop.xpath(
                            ".//h2[@class='title-name']/a/@href")[0]
                        shop_item['vendor_name'] = name
                        shop_item['recomm_dish'] = recomm_dish
                        shop_item['book_status'] = book_status
                        shop_item['preferential'] = handle_emjoy(preferential)
                        shop_detail(shop_url, shop_item)
                    print('第%s页完成' % i)
                else:
                    print('请求失败%s' % url)
            print('%s完成' % district)
def get_business_hours(shop_id):
    """Fetch opening-hour info for an OpenRice shop.

    :param shop_id: numeric OpenRice POI id.
    :return: dict with key 'business_hours' (raw decoded payload) and, when
        the payload contains parseable time ranges, 'open_time'/'close_time'
        as "HH:MM" strings; None when the HTTP request fails.
    """
    url = 'https://www.openrice.com/api/poi/status?uiCity=hongkong&uiLang=zh-cn&poiId=%s' % shop_id
    response = request.get(url)
    if not response:
        return None
    opening_info = {}
    business_hours = json.loads(response.content)
    opening_info['business_hours'] = business_hours
    open_times = set()  # BUGFIX: was misspelled `open_tiem`
    if business_hours:
        for entry in business_hours.get('openingHourInfo', {}).get("normalHours", []):
            for slot in entry.get('times', {}):
                time_list = slot['timeDisplayString'].split('-')
                # BUGFIX: require exactly two parts; `start, end = time_list`
                # raised ValueError for any other split length.
                if len(time_list) == 2:
                    start, end = time_list
                    if end == '00:00':
                        end = '24:00'  # treat midnight closing as end of day
                    open_times.update([start, end])
        # BUGFIX: original called min()/max() unconditionally and raised
        # ValueError when no time range was parsed.
        if open_times:
            def _as_minutes(t):
                # Order "HH:MM" strings numerically; whitespace-tolerant.
                h, _, m = t.strip().partition(':')
                return int(h) * 60 + int(m or 0)
            # BUGFIX: original round-tripped through float ("09:30" -> 9.3 ->
            # "9:3"), mangling the stored strings; keep the original text.
            opening_info['open_time'] = min(open_times, key=_as_minutes)
            opening_info['close_time'] = max(open_times, key=_as_minutes)
    return opening_info
def get_shop_detail_fields(html):
    pass


def get_shop_url(url, name, headers):
    """Fetch a search-result page and print the first matching shop's fields.

    Extracts title, detail URL, average price, cuisine, area tag, address and
    group-buy info for the first result, then prints the assembled dict.
    """
    response = request.get(url, headers=headers)
    if not response:
        return
    html = etree.HTML(response.content)
    shop_names = html.xpath("//div[@class='txt']/div[@class='tit']/a")
    # and shop_names[0].xpath('h4/text()')[0] == name
    if not shop_names:
        return
    first = shop_names[0]
    title = first.xpath('h4/text()')[0]
    shop_url = first.xpath('@href')[0]
    price_texts = html.xpath("//a[@class='mean-price']/b/text()")
    avg_price = price_texts[0] if price_texts else ''
    cuisine = (html.xpath("//span[@class='tag']/text()")[0]
               if html.xpath("//span[@class='tag']") else '')
    addr_links = html.xpath("//div[@class='tag-addr']/a")
    if len(addr_links) > 2:
        tag_addr = addr_links[1].xpath("span[@class='tag']/text()")[0]
    else:
        tag_addr = ''
    addr_texts = html.xpath("//div[@class='tag-addr']/span[@class='addr']/text()")
    addr = addr_texts[0] if addr_texts else ''
    preferences = (html.xpath("//div[@class='svr-info']/a[contains(@class,'tuan')]/text()")[0]
                   if html.xpath("//div[@class='svr-info']/a[contains(@class,'tuan')]") else '')
    item = {
        'title': title,
        'shop_url': shop_url,
        'avg_price': avg_price,
        'cuisine': cuisine,
        'tag-addr': tag_addr,
        'addr': addr,
        'preferences': preferences
    }
    print(item)
def get_json(type, kw, url):
    """Fetch *url* and return its decoded JSON body, or None on failure.

    NOTE: *type* and *kw* are accepted for interface compatibility but are
    currently unused; the request is sent with empty params.
    """
    response = request.get(url, params='')
    return response.json() if response else None
def parse(item):
    """Search Ctrip for the item's vendor and crawl the first matching shop.

    Compares the (simplified, lowercased) name and address against the item;
    a name mismatch aborts, an address mismatch only clears the match flag.
    """
    url = ctrip_search_shop_url % item['vendor_name']
    response = request.get(url)
    if not response:
        print('请求失败:%s' % url)
        return
    html = etree.HTML(response.content)
    boxes = html.xpath("//div[@class='rdetailbox']")
    if not boxes:
        return
    box = boxes[0]
    # Normalise both sides to simplified Chinese, lowercase, for comparison.
    name = switch_lang.Traditional2Simplified(
        box.xpath("./dl/dt/a/text()")[0].decode('utf-8').lower())
    address = switch_lang.Traditional2Simplified(
        box.xpath("./dl/dd[@class='ellipsis']/text()")[0].decode('utf-8').lower())
    check_name = switch_lang.Traditional2Simplified(item['vendor_name'].lower())
    check_address = switch_lang.Traditional2Simplified(item['address'].lower())
    item['ctrip_address'] = address
    if check_name not in name:
        return
    flag = check_address in address
    shop_url = box.xpath("./dl/dt/a/@href")[0]
    get_shop_detail(shop_url, item, flag)
def get_positons_list(url, item, cookies):
    """Crawl a Lagou position-list URL and every pagination page after it."""
    response = request.get(url, cookies=cookies)
    if not response:
        return
    doc = etree.HTML(response.content)
    # Lagou serves the generic search-page title when the list URL is invalid.
    if doc.xpath('//title/text()')[0] == '找工作-互联网招聘求职网-拉勾网':
        print(url + ' error ')
        return
    get_positions_urls(response, item, cookies)
    doc = etree.HTML(response.content)
    totals = doc.xpath("//span[@class='span totalNum']/text()")
    last_page = int(totals[0]) if totals else 1
    # Pages 2..N share the base URL with the page number appended.
    for num in range(2, last_page + 1):
        page_response = request.get('%s%d/' % (url, num), cookies=cookies)
        get_positions_urls(page_response, item, cookies)
def get_positions_urls(response, item, cookies):
    """Extract every position from a list page and crawl each detail page.

    Skips positions whose URL already exists in the database; otherwise fills
    *item* with the listing fields and delegates to get_position_detail().
    """
    if not response:
        return
    doc = etree.HTML(response.content)
    titles = doc.xpath('//title/text()')
    print(titles[0] if titles else 'title error')
    for position in doc.xpath("//ul[@class='item_con_list']/li"):
        publish_date = switch_publish_date(
            position.xpath(".//span[@class='format-time']/text()")[0])
        url = position.xpath(".//a[@class='position_link']/@href")[0]
        # Skip URLs that were crawled before.
        if db_operate.isexist_url(url):
            print('此url%s已经存在!' % url)
            continue
        # "other" looks like "经验x年/学历"-style slash-separated text.
        other = position.xpath(".//div[@class='li_b_l']/text()")[2].strip()
        location = position.xpath(".//span[@class='add']/em/text()")[0]
        item['position_name'] = position.xpath("@data-positionname")[0]
        item['publish_date'] = publish_date
        item['salary'] = position.xpath("@data-salary")[0]
        item['education'] = other.split('/')[1]
        item['work_year'] = other.split('/')[0][2:]
        item['city'] = location.split('·')[0]
        item['company_name'] = position.xpath("@data-company")[0]
        item['url'] = url
        detail_response = request.get(url, cookies=cookies)
        get_position_detail(detail_response, item)
def parse():
    """Crawl Qunar food-list pages 1000-3299 and scrape every shop on each.

    Builds a shop item per listing (name, score, sub-info, URL) and hands it
    to shop_detail(); skips names that already exist in storage.
    """
    for page in range(1000, 3300):
        print('%s页---开始' % page)
        page_url = quna_url % page
        response = request.get(page_url)
        if response:
            html = etree.HTML(response.content)
            shop_list = html.xpath(
                "//ul[@class='list_item clrfix']/li[@class='item']")
            for shop in shop_list:
                shop_item = item.copy()
                name = shop.xpath(".//span[@class='cn_tit']/text()")[0]
                # BUGFIX: the original used an absolute '//' xpath here, which
                # searched the whole document and always returned the FIRST
                # shop's score for every shop on the page.
                score_details = shop.xpath(
                    ".//span[@class='cur_score']/text()")[0]
                if exist(name):
                    print(name + '已经存在')
                    continue
                sub_info = shop.xpath(
                    ".//div[@class='sublistbox']/dl[@class='sublist_item clrfix']"
                )
                get_sub_info(sub_info, shop_item)
                shop_url = shop.xpath("./a[@data-beacon='poi']/@href")[0]
                shop_item['vendor_name'] = name
                shop_item['score_details'] = score_details
                shop_item['vendor_url'] = shop_url
                shop_detail(shop_url, shop_item)
            print('第%s页完成' % page)
        else:
            # BUGFIX: the original printed shop_url here, a NameError when the
            # page request fails before any shop was parsed.
            print('请求失败%s' % page_url)
def start_spider():
    """Entry point: fetch the start page, then crawl a shop-search URL per name."""
    response = request.get(start_url)
    if response:
        html = etree.HTML(response.content)
        # NOTE(review): this consumes and discards the first value of *names*
        # before the loop below iterates the remainder — confirm whether
        # skipping the first name is intentional. `.next()` is Python 2
        # iterator syntax.
        name = names.next()
        # Append the next start number to the session cookie.
        headers['Cookie'] = headers['Cookie'] + str(start_num.next())
        for name in names:
            get_shop_url(second_url % name, name, headers)
    else:
        print(response)
def get_dish(city, shop_id):
    """Fetch the recommended-dish tag list for a Dianping shop and print it.

    :param city: dict with at least 'code' (city id) and 'name' (English name).
    :param shop_id: Dianping shop id.
    """
    # BUGFIX: the original built this URL with a backslash line continuation
    # inside the string literal, embedding literal whitespace into the URL.
    dish_url = ('http://www.dianping.com/overseas/shop/ajax/allReview'
                '?categoryURLName=food&power=5&shopType=10'
                '&shopId=%s&cityId=%s&cityEnName=%s'
                % (shop_id, city.get('code'), city.get('name')))
    response = request.get(dish_url)
    # BUGFIX: request.get may return None on failure; the original then
    # crashed on response.text.
    if not response:
        return
    dish = json.loads(response.text).get('dishTagStrList', [])
    if dish is None:  # key present but explicitly null in the payload
        dish = []
    print(';'.join(dish))
def get_dianping_city():
    """Download Dianping's city list, cache it to disk, and return {name: id}.

    Writes the raw payload to data/dianping_city.json, then flattens the
    grouped cityMap into a single name -> id mapping. Returns None when the
    request fails.
    """
    response = request.get(city_list_url)
    if not response:
        return
    citys = json.loads(response.content)
    with open('data/dianping_city.json', 'w') as f:
        f.write(json.dumps(citys, ensure_ascii=False, indent=2))
    return {
        city.get('cityName'): city.get('cityId')
        for group in citys.get('cityMap').values()
        for city in group
    }
def tripadvisor(item):
    """Enrich *item* with TripAdvisor data (cuisine, character, environment).

    Searches TripAdvisor for the vendor, verifies both name and address match
    before trusting the result, scrapes the detail rows, and finally persists
    the item via save_date() regardless of outcome.
    """
    url = tripadvisor_url % item['vendor_name']
    response = request.get(url)
    if response:
        html = etree.HTML(response.content)
        title = html.xpath("//div[@class='title']/span/text()")
        address = html.xpath("//div[@class='address']/text()")
        if title and address:
            name = item['vendor_name'].lower()
            # '楼' (floor) suffixes are stripped so addresses compare loosely.
            address2 = item['address'].lower().replace('楼', '')
            if title[0].lower().find(name) == -1 or address[0].lower().find(
                    address2) == -1:
                # Name or address mismatch: do not trust this search result.
                pass
            else:
                # The detail URL is embedded as the 4th quoted argument of the
                # result's onclick handler.
                url = html.xpath("//div[@class='title']/@onclick")
                url = re.findall(r'\'(.*?)\'', url[0])[3]
                url = tripadvisor_host + url
                response = request.get(url)
                if response:
                    html = etree.HTML(response.content)
                    rows = html.xpath("//div[@class='row']")
                    for row in rows:
                        row_title = row.xpath("./div[@class='title']/text()")
                        if row_title:
                            row_title = row_title[0].strip()
                            row_content = row.xpath(
                                "./div[@class='content']//text()")
                            row_content = handle_node(row_content)
                            if row_title == '菜系':
                                item['tripadvisor_cuisine'] = row_content
                            elif row_title == '餐厅特点':
                                item['tripadvisor_character'] = row_content
                            elif row_title == '就餐氛围':
                                item['environment'] = row_content
                    item['tripadvisor_url'] = url
                else:
                    print('请求失败' + url)
    else:
        print('请求失败' + url)
    # Persist whatever was gathered, even on mismatch or request failure.
    save_date(item)
def update_price():
    """Refresh the average price for every vendor_miqilin_price row.

    Looks up each vendor's Dianping shop page (shop id is the URL suffix),
    fetches the review/star ajax payload, and writes the extracted price back
    (flag=0) or marks the row as failed (flag=1).
    """
    # CLEANUP: removed two large blocks of dead commented-out SQL experiments
    # that obscured the live logic.
    sql1 = "select vendor_id,name,vendor_url,vendor_city from vendor_miqilin_price"
    results = dbmysql.fetchall(sql1)
    print(len(results))
    city_list = get_dianping_city()
    # Cities absent from Dianping's city map, added manually.
    city_list[u'纽约'] = 2395
    city_list[u'伦敦'] = 2464
    for item in results:
        shop_url = item[2]
        shop_id = shop_url.split('/')[-1]
        city_code = city_list.get(item[3])
        # BUGFIX: the original used a backslash line continuation inside the
        # string literal, embedding literal whitespace into the URL.
        avg_price_url = ('http://www.dianping.com/overseas/shop/ajax/reviewAndStar'
                         '?shopId=%s&cityId=%s&mainCategoryId=102'
                         % (shop_id, city_code))
        response = request.get(avg_price_url)
        price = get_avg_price(response)
        if price:
            print(price)
            sql3 = "update vendor_miqilin_price set price = :price,flag = 0 where vendor_id=:vendor_id"
            dbmysql.edit(sql3, params={'price': price, 'vendor_id': item[0]})
        else:
            sql3 = "update vendor_miqilin_price set flag = 1 where vendor_id=:vendor_id"
            dbmysql.edit(sql3, params={'vendor_id': item[0]})
            print(str(item[0]) + ' ' + item[1])
def get_dish(url, item, headers, cookies):
    '''
    Fetch the dish tag list for a shop and store it on *item*.

    :param url: allReview ajax endpoint for the shop.
    :param item: mutable dict; 'dish' is set to a ';'-joined tag string.
    :param headers: request headers to reuse.
    :param cookies: session cookies to reuse.
    :return: the same *item*, for chaining.
    '''
    response = request.get(url, headers=headers, cookies=cookies)
    # BUGFIX: request.get may return None on failure; the original crashed on
    # response.text. Fall back to an empty dish list instead.
    if response:
        dish = json.loads(response.text).get('dishTagStrList', [])
        if dish is None:  # key present but explicitly null
            dish = []
    else:
        dish = []
    item['dish'] = ';'.join(dish)
    return item
def shop_detail(url, item):
    """Scrape description, business hours and phone from a shop page, then save."""
    response = request.get(url)
    if not response:
        print('请求失败%s' % url)
        save_date(item)
        return
    html = etree.HTML(response.content)
    item['description'] = handle_node(
        html.xpath("//div[@class='e_db_content_box']/p/text()"))
    item['business_hours'] = handle_node(
        html.xpath("//dl[@class='m_desc_right_col']/dd/span/p/text()"))
    item['phone'] = handle_node(
        html.xpath("//td[@class='td_l']/dl[2]/dd/span/text()"))
    save_date(item)
def get_shop(url, item, cookies, city):
    """Resolve a Dianping search page to a shop detail page.

    If the first result's title matches item['name'] exactly, delegates to
    get_shopinfo() (which saves the item); otherwise saves the bare item.
    """
    response = request.get(url, headers=headers, cookies=cookies)
    if response:
        doc = etree.HTML(response.text)
        titles = doc.xpath("//div[@class='tit']/a/@title")
        matched = titles and titles[0] == item['name']
        if matched:
            hrefs = doc.xpath("//div[@class='tit']/a/@href")
            if hrefs:
                shop_url = hrefs[0]
                item['url'] = shop_url
                get_shopinfo(shop_url, item, response.cookies, city,
                             shop_url.split('/')[-1])
                return
    save(item)
def get_shop_detail(url, item, flag):
    """Scrape a Ctrip shop page into *item* and persist it.

    *flag* True means the address matched upstream, so the description fills
    the main 'description' field; otherwise it is kept as 'ctrip_description'.
    """
    full_url = ctrip_host + url
    response = request.get(full_url)
    if not response:
        print('shop 请求失败%s' % full_url)
        return
    html = etree.HTML(response.content)
    description = get_one(html.xpath("//div[@itemprop='description']/text()"))
    dish = get_shop_dish(html.xpath("//div[@class='text_style']/p/text()"))
    price = get_one(html.xpath("//em[@class='price']/text()")).replace('¥', '')
    item['description' if flag else 'ctrip_description'] = description
    item['ctrip_dish'] = switch_lang.Traditional2Simplified(dish)
    item['ctrip_url'] = full_url
    item['ctrip_price'] = price
    save_data(item)
def update_latlng():
    """Geocode each Hong Kong vendor's address via Baidu and store lat/lng.

    On geocoding success writes lat/lng; on API error or request failure sets
    lat_flag=1 so the row can be retried later.
    """
    # NOTE(review): 'where vendor_id' has no comparison — this matches all
    # rows with a truthy vendor_id; confirm the intended filter.
    sql = 'select * from vendor_miqilin_hongkong where vendor_id'
    data = dbmysql.fetchall(sql)
    for item in data:
        address = item['address']
        result = request.get(baidu_address % (address, '香港'))
        # BUGFIX: request.get may return None on failure; the original then
        # crashed on result.content. Treat it like a geocoding failure.
        if not result:
            update_sql = 'update vendor_miqilin set lat_flag = 1 where vendor_id=:vendor_id'
            dbmysql.edit(update_sql, {'vendor_id': item['vendor_id']})
            print(str(item['vendor_id']))
            continue
        result = json.loads(result.content)
        if result.get('status') == 0:
            update_sql = 'update vendor_miqilin set lat=:lat,lng=:lng where vendor_id=:vendor_id'
            params = {
                'lat': result['result']['location'].get('lat'),
                'lng': result['result']['location'].get('lng'),
                'vendor_id': item['vendor_id']
            }
            dbmysql.edit(update_sql, params)
        else:
            update_sql = 'update vendor_miqilin set lat_flag = 1 where vendor_id=:vendor_id'
            dbmysql.edit(update_sql, {'vendor_id': item['vendor_id']})
            print(str(item['vendor_id']))
def start_spider():
    """Walk the Lagou category menu and start a list crawl per position link."""
    response = request.get(start_url)
    if not response:
        return
    cookies = response.cookies
    doc = etree.HTML(response.content)
    menu = doc.xpath("//div[@class='menu_sub dn']")[0]
    positions_dict = {}
    for category in menu.xpath("dl"):
        type_name = category.xpath("dt/span/text()")[0]
        positions_dict[type_name] = {}
        for position in category.xpath("dd/a"):
            position_name = position.text
            position_url = position.xpath('@href')[0]
            positions_dict[type_name][position_name] = position_url
            # NOTE: first_type/second_type mapping preserved from the original.
            position_data = {'first_type': position_name,
                             'second_type': type_name}
            get_positons_list(position_url, position_data, cookies)
def parse():
    """Scrape list pages 1-23 and insert one price row per restaurant."""
    for page in range(1, 24):
        response = request.get(start_url + '?page=%d' % page)
        if not response:
            continue
        html = etree.HTML(response.content)
        for shop in html.xpath("//li[@class='poi-item poi-item-restaurant']"):
            name = shop.xpath("div[@class='poi-item-name truncate']/a/text()")[0]
            raw = [t for t in
                   shop.xpath(".//div[@class='poi-item-price']//text()")
                   if t.strip()]
            # Average the low/high bounds (dropping the leading currency sign)
            # and convert with the hard-coded exchange rate 6.3279.
            price = int(raw[1][1:]) + int(raw[-1][1:])
            price = int(round(price / 2 * 6.3279))
            record = dict(name=name, price=price, row_price=' '.join(raw))
            dbmysql.edit(
                'insert into price(name,price,row_price) values(:name,:price,:row_price)',
                record)
def shop_detail(url, item):
    """Scrape a Mafengwo POI page (description, tips, address) into *item*, then save."""
    full_url = mafengwo_host + url
    item['vendor_url'] = full_url
    response = request.get(full_url, headers={'user-agent': user_agent})
    if response:
        html = etree.HTML(response.content)
        description = html.xpath(
            "//div[@class='tips']/p[@style='max-height: 3.9em;overflow: hidden;']/text()"
        )
        # Secondary tip block and address block both go through get_sub_info.
        get_sub_info(
            html.xpath(
                "//div[@class='tips']/p[@style='max-height: 2.5em;overflow: hidden;']"
            ), item)
        get_sub_info(
            html.xpath("//div[@class='maps']/ul[@class='context']/li"), item)
        item['description'] = handle_desctrip(description)
    else:
        print('请求失败%s' % full_url)
    save_date(item)
def start_parse(city):
    """Seed one empty item per restaurant name in *city* and start the crawl."""
    response = request.get(start_url, headers=headers)
    if not response:
        return
    names = get_names(city.get('name'))
    cookies = response.cookies
    # Every field the downstream pipeline may fill, initialised to ''.
    # (The 'pyment' spelling is preserved — downstream code uses this key.)
    empty_fields = (
        'book_status', 'facilities', 'description', 'pyment', 'low_price',
        'detail_price', 'price', 'michelin_star', 'cuisine', 'service_time',
        'address', 'phone', 'district', 'landmark', 'quality', 'brand',
        'recomm_dishes', 'taste', 'characters', 'category', 'people_group',
        'lat', 'lng', 'country', 'other_name', 'dish',
    )
    for name in names:
        url = dianping_shop_url % (city.get('code'), name)
        item = dict.fromkeys(empty_fields, '')
        item['name'] = name
        item['links'] = url
        item['city'] = city.get('name')
        get_shop(url, item, cookies, city)
def parse():
    """Crawl Mafengwo POI ajax list pages 22-3849 and scrape every shop.

    Each page returns JSON whose 'html' field holds the list markup; a shop
    item (name, score width, cuisine, comment) is built per entry and handed
    to shop_detail().
    """
    for page in range(22, 3850):
        print('%s页---开始' % page)
        headers = {
            'user-agent': user_agent,
            'x-requested-with': 'XMLHttpRequest',
        }
        page_url = mafengwo_url % page
        response = request.get(page_url, headers=headers)
        if response:
            try:
                payload = json.loads(response.content)
            except Exception as e:
                print(e)
                payload = {}
            html = etree.HTML(payload.get('html', ''))
            # BUGFIX: etree.HTML('') returns None and the original crashed on
            # html.xpath; skip the page instead.
            if html is None:
                print('请求失败%s' % page_url)
                continue
            shop_list = html.xpath("//section[@class='poi-list']/div")
            for shop in shop_list:
                shop_item = raw_item.copy()
                name = shop.xpath(
                    "./a[@class='poi-li']/div[@class='hd']/text()")[0]
                score_details = shop.xpath(
                    ".//div[@class='star']/span/@style")[0]
                shop_url = shop.xpath("./a[@class='poi-li']/@href")[0]
                cuisine = shop.xpath(".//p[@class='m-t']/strong/text()")
                comment = shop.xpath(".//div[@class='comment']/text()")
                if exist(name):
                    print(name + '已经存在')
                    continue
                shop_item['vendor_name'] = name
                # Star rating is encoded as a CSS width percentage.
                shop_item['score_details'] = score_details.replace(
                    'width:', '').replace('%;', '')
                shop_item['cuisine'] = cuisine[0].replace(
                    ' ', '') if cuisine else ''
                shop_item['comment'] = comment[0] if comment else ''
                shop_detail(shop_url, shop_item)
            print('第%s页完成' % page)
        else:
            # BUGFIX: original referenced shop_url here (undefined when the
            # page request fails).
            print('请求失败%s' % page_url)
def get_shopinfo(url, item, cookies, city, shop_id): response = request.get(url, headers=headers, cookies=cookies) # 菜单的URL shopId: cityId=342(澳门) 341(香港) cityEnName:macau hongkong categoryURLName=food&power=5&shopType=10 dish_url = 'http://www.dianping.com/overseas/shop/ajax/allReview?categoryURLName=food&power=5&shopType=10\ &shopId=%s&cityId=%s&cityEnName=%s' % (shop_id, city.get('code'), city.get('name')) item = get_dish(dish_url, item, headers, cookies) if response and response.text: html = etree.HTML(response.text) address = html.xpath( "//div[@class='expand-info address']/span[@class='item']/text()") item['address'] = address[0].strip() if address else '' phone = html.xpath( "//p[@class='expand-info tel']/span[@class='item']/text()") item['phone'] = phone[0] if phone else '' intents = html.xpath("//p[@class='info info-indent']") for intent in intents: info_name = intent.xpath("span[@class='info-name']/text()") info = intent.xpath(".//span[@class='item']/text()") info_name = info_name[0] if info_name else '' info = info[0] if info else '' if info_name == '别 名:': item['other_name'] = info.strip() elif info_name == '营业时间:': item['service_time'] = info.strip() elif info_name == '餐厅简介:': item['description'] = info.strip() banner = html.xpath("//div[@class='breadcrumb']/a/text()") item['district'] = banner[1].strip() if len(banner) > 1 else '' item['cuisine'] = banner[2].strip() if len(banner) > 2 else '' price = html.xpath("//span[@id='avgPriceTitle']/text()") item['price'] = price[0].split(':')[1][:-1] if price else '' headers['Referer'] = url save(item) else: save(item)
def parse():
    """Crawl airport list pages 1-284, pausing 1-3 s between requests."""
    for page in range(1, 285):
        # Randomised delay keeps the request rate polite.
        time.sleep(random.randint(1, 3))
        get_airport(request.get(start_url % page))
def shop_detail(url, item):
    """Scrape an OpenRice shop detail page into *item*, then hand off to tripadvisor().

    Extracts the dish list and the embedded JSON-LD metadata from the raw
    HTML text, plus description/features/payment/facilities via xpath, and
    merges business hours from the status API.
    """
    url = openrice_host + url
    # The shop id is the numeric suffix after the last '-r' in the URL.
    shop_id = url.rsplit('-r')[-1]
    response = request.get(url)
    if response:
        text = response.text
        html = etree.HTML(response.content)
        text = text.replace('\n', '').replace('\r', '')
        # Python 2: decode to unicode so the CJK regexes below match.
        text = text.decode('utf-8')
        # "主要菜式包括 ..." = "main dishes include ...".
        dish = re.search(r'主要菜式包括 (.*?), 。'.decode('utf-8'), text)
        if dish:
            dish = dish.group(1).replace(', ', ';')
            item['dish'] = dish
        # Structured metadata embedded as a JSON-LD script block.
        application_json = re.search(r'"application/ld\+json">(.*?)</script>',
                                     text)
        if application_json:
            try:
                application_json = application_json.group(1)
                application_json = json.loads(application_json)
                cuisine = application_json.get('servesCuisine', '')
                price_range = application_json.get('priceRange', '')
                phone = application_json.get('telephone', '')
                url = application_json.get('url', '')
                lat = application_json['geo']['latitude']
                lng = application_json['geo']['longitude']
                district = application_json['address']['addressLocality']
                address = district + application_json['address'][
                    'streetAddress']
                item['cuisine'] = cuisine
                item['price_range'] = price_range
                item['price'] = handle_price(price_range)
                item['price_class'] = get_price_class(item['price'])
                item['phone'] = phone
                item['lat'] = lat
                item['lng'] = lng
                item['district'] = district
                item['address'] = address
                item['openrich_url'] = url
            except Exception as e:
                # Malformed/partial JSON-LD: keep whatever was set so far.
                print(e)
        description = handle_node(
            html.xpath(
                "//section[@class='introduction-section']/div[@class='content js-text-wrapper']/text()"
            ))
        characters = handle_node(
            html.xpath(
                "//section[@class='good-for-section']/div[@class='content']/text()"
            ))
        payment = handle_node(
            html.xpath(
                "//div[@id='pois-filter-expandable-features']//div[@class='comma-tags']//text()"
            ))
        # Facilities are split into ticked (available) and crossed (absent).
        facilities = handle_node(
            html.xpath(
                "//span[@class='or-sprite-inline-block d_sr2_lhs_tick_desktop']/following-sibling::span/text()"
            ))
        none_facilities = html.xpath(
            "//span[@class='or-sprite-inline-block d_sr2_lhs_cross_desktop']/following-sibling::span/text()"
        )
        category = handle_node(
            html.xpath(
                "//div[@class='header-poi-categories dot-separator']/div/a[contains(@href,'/type/')]/text()"
            ))
        score_details = html.xpath(
            "//div[@class='header-score-details-right-item']")
        score_details = get_score_details(score_details)
        business_hours = get_business_hours(shop_id)
        if business_hours:
            item['business_hours'] = json.dumps(
                business_hours['business_hours'], ensure_ascii=False, indent=2)
            item['open_time'] = business_hours.get('open_time')
            item['close_time'] = business_hours.get('close_time')
        # book_status: 0 = bookable (per facilities), 2 = explicitly not bookable.
        if handle_book(facilities.split(';')):
            item['book_status'] = 0
        if handle_no_book(none_facilities):
            item['book_status'] = 2
        item['description'] = description
        item['characters'] = characters
        item['payment'] = payment
        item['facilities'] = facilities
        item['category'] = category
        item['score_details'] = json.dumps(score_details,
                                           ensure_ascii=False,
                                           indent=2)
        # print(item)
        tripadvisor(item)
    else:
        print('请求失败%s' % url)