def elong_parser(content, url, other_info):
    """Parse an eLong hotel detail page into a ``HotelNewBase`` record.

    Each field is extracted best-effort: a failed lookup leaves the field at
    its fallback value (``'NULL'`` / ``-1``) instead of aborting the parse.

    :param content: raw page body; it is decoded as UTF-8 before DOM parsing
        and also searched directly with regular expressions.
    :param url: detail page URL; stored as ``hotel_url``, used to extract the
        numeric source id when ``other_info['hid']`` is truthy, and URLs
        starting with ``http://hotel`` trigger the ``HotelDetailController``
        overrides.
    :param other_info: dict read for ``hid``, ``source_id`` and ``city_id``.
    :return: the populated ``HotelNewBase`` instance.
    :raises KeyError: if ``other_info`` lacks ``city_id``, or lacks
        ``source_id`` when ``hid`` is not set.
    :raises AttributeError: if ``hid`` is set but ``url`` has no ``/<digits>/``
        segment.
    """
    hotel = HotelNewBase()

    # DOM tree and JS runtime. Both may stay None; every later use sits inside
    # a try block, so a missing object simply selects the fallback value.
    root = None
    phantom_js = None
    page_js = None
    try:
        root = HTML.fromstring(content.decode('utf-8'))
        phantom_js = execjs.get('PhantomJS')
    except Exception:
        pass
    # Compile the first inline script that defines a known controller object.
    for controller in ('window.newDetailController', 'HotelDetailController'):
        try:
            js_str = root.xpath(
                '//script[contains(text(),"%s")]/text()' % controller)[0]
            page_js = phantom_js.compile(
                js_str[js_str.index(controller):][:-1])
            break
        except Exception:
            pass

    # Hotel names: the title attribute looks like "<name>(<english name>)".
    try:
        temp_name = root.xpath(
            '//div[@class="t24"]/@title')[0].strip().encode('utf-8')
        k = temp_name.find('(')
        j = temp_name.find(')')
        if k != -1 and j > k:
            hotel.hotel_name = temp_name[:k]
            hotel.hotel_name_en = temp_name[k + 1:j]
        else:
            # No parenthesised part: keep the whole title for both names
            # (slicing with -1 used to chop the last byte off the name); the
            # de-duplication below then decides which one to keep.
            hotel.hotel_name = temp_name
            hotel.hotel_name_en = temp_name
    except Exception:
        try:
            hotel.hotel_name = root.find_class(
                'hrela_name-cn')[0].xpath('./text()')[0].strip()
            hotel.hotel_name_en = root.find_class(
                'hrela_name-en')[0].xpath('./text()')[0].strip()
        except Exception:
            pass

    # When both names are identical keep only one of them: the local name if
    # it contains CJK characters, otherwise the English name.
    if hotel.hotel_name == hotel.hotel_name_en:
        name_text = hotel.hotel_name or u''
        if isinstance(name_text, bytes):
            name_text = name_text.decode('utf8')
        if any(u'\u4e00' <= ch <= u'\u9fa5' for ch in name_text):
            hotel.hotel_name_en = 'NULL'
        else:
            hotel.hotel_name = 'NULL'

    # Address: primary layout first, then the "icon-address" layout.
    try:
        temp = root.xpath('//span[@class="mr5 left"]/text()')
        hotel.address = temp[0].encode('utf-8').strip().split(':')[1]
    except Exception:
        hotel.address = 'NULL'
    if hotel.address == 'NULL':
        try:
            hotel.address = root.xpath(
                '//span[@class="icon-address"]/text()'
            )[0].replace('地址:', '').strip()
        except Exception:
            hotel.address = 'NULL'

    # Coordinates, stored as "lon,lat".
    try:
        lat = re.findall(r'"lat":"([-+\d\.]*)"', content)[0]
        lon = re.findall(r'"lon":"([-+\d\.]*)"', content)[0]
        hotel.map_info = '{},{}'.format(lon, lat)
    except Exception:
        try:
            map_infos = page_js.eval('HotelDetailController').get(
                'AjaxHotelInfo', {}).get('HotelGeoInfo', {})
            lat = map_infos.get('Lat', None)
            lon = map_infos.get('Long', None)
            # The original raised a bool here (a TypeError), so this fallback
            # could never yield a value; reject only missing coordinates.
            if lat is None or lon is None:
                hotel.map_info = 'NULL'
            else:
                hotel.map_info = '{0},{1}'.format(lon, lat)
        except Exception:
            hotel.map_info = 'NULL'

    # Star rating: last character of the "icon_stars..." CSS class.
    try:
        star_temp = root.xpath(
            '//b[contains(@class, "icon_stars")]/@class')[0].encode('utf-8')
        hotel.star = star_temp[-1]
        if hotel.star == ' ':
            hotel.star = -1
    except Exception:
        try:
            star_temp = page_js.eval('window.newDetailController').get(
                'RecommendHotelRequest', {}).get('starLevel', '')
            star_levels = json.loads(star_temp)
            if star_levels:
                hotel.star = star_levels[0]
        except Exception:
            hotel.star = -1

    # Review score: JS score info, then two DOM layouts.
    try:
        hotel.grade = page_js.eval('window.newDetailController').get(
            'scoreInfo', {}).get('comment_score', '')
    except Exception:
        try:
            grade = root.xpath('//div[@id="hover-hrela"]/p[1]')
            hotel.grade = float(
                re.search(r'[0-9\.]+', grade[0].text).group(0))
        except Exception:
            try:
                tp = root.xpath(
                    '//div[contains(@class, "pertxt_num")]/text()'
                )[0].encode('utf-8')
                hotel.grade = float(tp)
            except Exception:
                hotel.grade = 'NULL'

    # Review count.
    try:
        hotel.review_num = page_js.eval('window.newDetailController').get(
            'scoreInfo', {}).get('comment_count', '')
    except Exception:
        try:
            review_num_str = root.find_class(
                'fl sum-txt')[0].text_content().strip().encode('utf-8')
            hotel.review_num = int(grade_pat.findall(review_num_str)[0])
        except Exception:
            hotel.review_num = -1

    # Description: "title:text" pairs joined with '|'; falls back to the plain
    # text of the first paragraph when no titled paragraph exists.
    try:
        p_tags = root.find_class('dview_info')[0].xpath('dl[1]/dd/p')
        parts = []
        for p in p_tags:
            b_text = p.xpath('./b/text()')  # title
            p_text = p.xpath('./text()')  # description
            if len(b_text):
                parts.append(
                    b_text[0].strip().decode('utf-8') + ':' +
                    p_text[1].strip().decode('utf-8'))
        hotel.description = '|'.join(parts).encode('utf-8')
        if hotel.description == '':
            hotel.description = p_tags[0].text_content().strip().encode(
                'utf-8')
    except Exception:
        hotel.description = 'NULL'

    # Check-in / check-out times share one text node.
    try:
        temp_time = root.xpath(
            '//div[@id="iscrollNewAmenities"]/div/dl/dd/text()'
        )[0].encode('utf-8').strip()
        hotel.check_in_time = temp_time.split(',')[0]
        checkout_marker = '退房时间:'
        k = temp_time.find(checkout_marker)
        if k != -1:
            # Skip exactly the marker instead of a hard-coded byte count.
            hotel.check_out_time = temp_time[k + len(checkout_marker):]
    except Exception:
        hotel.check_out_time = 'NULL'

    # Services, plus the accepted payment cards found among them.
    accept_card = None
    try:
        service = ''
        accept_card = []
        service_list = root.xpath('//*[@id="serverall"]/li/text()')
        for each in service_list:
            service += each.encode('utf-8').strip() + '|'
            if '卡' in each:
                accept_card.append(each.strip())
        hotel.service = service[:-1]
    except Exception:
        hotel.service = 'NULL'
    if accept_card:
        hotel.accepted_cards = '|'.join(accept_card).encode('utf-8')

    first_img = None
    try:
        pattern_img = root.xpath(
            '//div[@class="newdetaiL-img imgMore"]/@style')[0]
        first_img = re.search(r'url\(([^)]+)\)', pattern_img).group(1)
    except Exception:
        pass

    city_name = 'NULL'
    try:
        city_name = page_js.eval(
            'window.newDetailController')['Region']['RegionName']
    except Exception:
        pass
    hotel.city = city_name
    # NOTE(review): this value is replaced unconditionally at the end of the
    # function, so city_name / first_img / hid never reach the caller.
    hotel.others_info = json.dumps({
        'city_name': city_name,
        'first_img': first_img,
        'hid': other_info.get('hid', 'NULL')
    })

    # Image list: thumbnails from the DOM, else the JS image tag list.
    try:
        img_list = root.xpath('//ul[@class="hrela_spic_list"]/li/img/@src')
        img_items = ''.join(
            img_src.replace('306', '307') + '|' for img_src in img_list)
        hotel.img_items = img_items[:-1]
        if not img_items:
            # Only touch the JS object when the DOM had no images, so a JS
            # failure can no longer discard images that were already found.
            controller_obj = page_js.eval('window.newDetailController')
            base_url = urljoin(
                controller_obj.get('BaseUrl'), 'ihotel_848_470_all/')
            url_groups = controller_obj.get(
                'HotelImageTagList', {}).get("urlList", {})
            img_lists = []
            for key in url_groups.keys():
                img_lists.extend(
                    url_groups.get(key, {}).get('tagUrlList', {}).values())
            hotel.img_items = '|'.join(
                [base_url + img for img in img_lists]).encode('utf-8')
    except Exception:
        hotel.img_items = 'NULL'

    # Pages on the "http://hotel" host expose HotelDetailController, whose
    # values override what was scraped above.
    if url.startswith('http://hotel'):
        try:
            hotel_obj = page_js.eval('HotelDetailController')
            lat = hotel_obj.get('googleLat', None)
            lon = hotel_obj.get('googleLng', None)
            if lat is not None and lon is not None:
                hotel.map_info = '{0},{1}'.format(lon, lat)
            hotel.hotel_name = hotel_obj.get('hotelNameCn')
            hotel.hotel_name_en = hotel_obj.get('hotelNameEn')
            hotel.address = hotel_obj.get('hotelAddress')
            hotel.city = hotel_obj.get('cityNameCn') or hotel_obj.get(
                'cityNameEn')
            # 'starLevel' used to be written to hotel.grade, clobbering the
            # review score parsed above; it belongs in hotel.star.
            star_level = hotel_obj.get('starLevel')
            if star_level is not None:
                hotel.star = star_level
            hotel.has_wifi = hotel_obj.get('hasWifi')
            hotel.source_city_id = hotel_obj.get('cityId')
        except Exception:
            pass

    # The country is taken as the last space-separated token of the address.
    hotel.country = (hotel.address or 'NULL').split(' ')[-1]
    hotel.source = 'elong'
    hotel.hotel_url = url
    if other_info.get('hid'):
        hotel.source_id = re.search(r'/(\d+)/', url).groups()[0]
    else:
        hotel.source_id = other_info['source_id']
    hotel.city_id = other_info['city_id']
    hotel.others_info = json.dumps({
        'hotel_services_info': hotel.service,
    }, ensure_ascii=False)
    return hotel
def holiday_parser(content, url, other_info):
    """Parse Holiday Inn hotel detail payloads into a hotel dict.

    :param content: tuple of two or three payloads: the hotel-info JSON, the
        detail page HTML and, optionally, a second JSON document that is only
        used for the English hotel name.
    :param url: hotel detail page URL; also the fallback source of the hotel
        code when ``other_info`` has no ``source_id``.
    :param other_info: dict read for ``source_id``, ``city_id`` and
        ``source_city_id``.
    :return: the result of ``hotel.to_dict()`` for the populated
        ``HotelNewBase`` instance.
    :raises ValueError: if the first payload is not valid JSON.
    :raises KeyError: if the first payload has no ``hotelInfo`` key.
    """
    hotel = HotelNewBase()

    if len(content) == 3:
        content1, content2, content3 = content
    else:
        content1, content2 = content
        content3 = None

    # The English name is optional; any malformed third payload is ignored.
    hotel_name_en = ''
    if content3 is not None:
        try:
            hotel_name_en = json.loads(
                content3)['hotelInfo']['profile']['name']
        except (ValueError, KeyError, TypeError):
            pass

    re_match = re.search(r'/hotels/cn/zh/(\w+)/hoteldetail', url)
    hotel_code = re_match.group(1) if re_match else ''

    resp = json.loads(content1)['hotelInfo']
    # Missing sections default to empty dicts. The original defaulted them to
    # '' and then called .get() on the string, raising AttributeError.
    profile = resp.get('profile') or {}
    address = resp.get('address') or {}
    policies = resp.get('policies') or {}
    brand_info = resp.get('brandInfo') or {}

    hotel.hotel_url = url
    hotel.hotel_name = profile.get('name', '')
    hotel.hotel_name_en = hotel_name_en
    hotel.source = 'holiday'
    hotel.source_id = other_info.get('source_id', '') or hotel_code
    hotel.brand_name = brand_info.get('brandName', '')
    # Stored as "lon,lat".
    hotel.map_info = str(profile.get('longitude', '')) + ',' + str(
        profile.get('latitude', ''))
    hotel.address = get_all_street(resp)
    hotel.city = address.get('city', '')
    hotel.country = (address.get('country') or {}).get('name', '')
    hotel.city_id = other_info.get('city_id', '')
    hotel.postal_code = address.get('zip', '')
    hotel.star = '-1'
    hotel.grade = profile.get('averageReview', '')
    hotel.review_num = profile.get('totalReviews', '')
    hotel.check_in_time = policies.get('checkinTime', '')
    hotel.check_out_time = policies.get('checkoutTime', '')

    first_img = (profile.get('primaryImageUrl') or {}).get('originalUrl', '')
    hotel.Img_first = first_img
    hotel.description = (profile.get('longDescription') or '') + '\n' + (
        profile.get('shortDescription') or '')

    # Output key -> keyword searched for in each facility name.
    facilities_dict = {
        'Swimming_Pool': '泳池',
        'gym': '健身',
        'SPA': 'SPA',
        'Bar': '酒吧',
        'Coffee_house': '咖啡厅',
        'Tennis_court': '网球场',
        'Golf_Course': '高尔夫球场',
        'Sauna': '桑拿',
        'Mandara_Spa': '水疗中心',
        'Recreation': '儿童娱乐场',
        'Business_Centre': '商务中心',
        'Lounge': '行政酒廊',
        'Wedding_hall': '婚礼礼堂',
        'Restaurant': '餐厅',
        'Parking': '停车',
        'Airport_bus': '机场班车',
        'Valet_Parking': '代客泊车',
        'Call_service': '叫车服务',
        'Rental_service': '租车服务',
        'Room_wifi': '无线互联网',
        'Room_wired': '有线互联网',
        'Public_wifi': '无线互联网',
        'Public_wired': '有线互联网'
    }
    service_dict = {
        'Luggage_Deposit': '行李寄存',
        'front_desk': '24小时前台',
        'Lobby_Manager': '24小时大堂经理',
        '24Check_in': '24小时办理入住',
        'Security': '24小时安保',
        'Protocol': '礼宾服务',
        'wake': '叫醒服务',
        'Chinese_front': '中文前台',
        'Postal_Service': '邮政服务',
        'Fax_copy': '传真/复印',
        'Laundry': '洗衣服务',
        'polish_shoes': '擦鞋服务',
        'Frontdesk_safe': '保险',
        'fast_checkin': '快速办理入住',
        'ATM': '自动柜员机(ATM)/银行服务',
        'child_care': '儿童看护',
        'Food_delivery': '送餐服务'
    }

    for each in resp.get("facilities") or []:
        if each['id'] in ('NO_PETS_ALLOWED', 'PETS_ALLOWED'):
            hotel.pet_type = each['name']
        # Iterate key/keyword pairs directly. The original went through a
        # value->key reverse dict, which collapses keys that share a keyword
        # (Room_wifi/Public_wifi, Room_wired/Public_wired), so only one key of
        # each pair could ever be filled.
        for fac_key, fac_value in facilities_dict.items():
            if fac_value in each['name']:
                hotel.facility_content[fac_key] = each['name']
        for ser_key, ser_value in service_dict.items():
            if ser_value in each['name']:
                hotel.service_content[ser_key] = each['name']

    fea_str = get_api_server(resp)
    tree = etree.HTML(content2)
    ser_str = get_ota_server(tree, '上网', '互联网', '泳', '退房', '餐', '预定', '停车',
                             '健身', '运动', '泳池', '特色', '服务')
    hotel_services_info = fea_str + ser_str
    hotel.others_info = json.dumps({
        # NOTE(review): the original read 'city'/'country' from a dict that
        # never received those keys, so both have always been ''. Confirm
        # whether hotel.city / hotel.country were intended.
        'city': '',
        'country': '',
        'first_img': first_img,
        'source_city_id': other_info.get('source_city_id', ''),
        'hotel_services_info': hotel_services_info
    })
    hotel.img_items = get_all_pics(tree)
    hotel.hotel_zip_code = hotel.postal_code
    return hotel.to_dict()
def gha_parser(total_content, url, other_info):
    """Parse a GHA hotel detail page into a hotel dict.

    Static data comes from two blobs embedded in the page: the argument of the
    ``pins.gha_hotel.push(...)`` call and the ``application/ld+json`` script.
    Review score and count are fetched over HTTP from the first
    protocol-relative ``<script src="//...">`` URL found in the page.

    :param total_content: page HTML.
    :param url: detail page URL, stored as ``hotel_url``.
    :param other_info: dict read for ``city_id`` (defaults to ``"NULL"``).
    :return: the result of ``hotel.to_dict()``.
    :raises IndexError: if either embedded JSON blob is missing from the page.
    :raises KeyError: if a required key is missing from those blobs.
    """
    hotel = HotelNewBase()
    hotel.city_id = other_info.get("city_id", "NULL")
    select = etree.HTML(total_content)

    info_pat = re.compile("pins\.gha_hotel\.push\((.*?)\)", re.S)
    ld_json_pat = re.compile(
        "<script type=\"application/ld\+json\">(.*?)</script>", re.S)
    address = json.loads(
        ld_json_pat.findall(total_content)[0].replace(' ', ''))
    info = json.loads(info_pat.findall(total_content)[0])

    hotel.hotel_name = info["title"]
    hotel.hotel_name_en = address["name"]
    hotel.source = "gha"
    hotel.source_id = info["id"]
    hotel.brand_name = info["brand_name"]
    # Stored as "lon,lat".
    hotel.map_info = str(info["lon"]) + "," + str(info["lat"])
    # NOTE(review): "//adress" is kept as written; confirm the page really
    # uses that tag name and it is not a typo for <address>.
    hotel.address = ''.join(select.xpath("//adress/text()")).strip()
    hotel.country = address["address"]["addressCountry"]
    hotel.city = address["address"]["addressLocality"]
    hotel.postal_code = address["address"]["postalCode"]
    hotel.star = '5'
    # xpath() returns a list; store only the first image URL rather than the
    # whole list, matching what the other parsers put in Img_first.
    first_imgs = select.xpath("//div[@class='FlexEmbed-item']/span/img/@src")
    hotel.Img_first = first_imgs[0] if first_imgs else 'NULL'
    hotel.hotel_phone = address.get("telephone", 'NULL')
    hotel.hotel_zip_code = address["address"]["postalCode"]

    service = select.xpath('//ul[@class="prop-Amenities"]/li/span/text()')
    servicestr = ''.join(service)
    description = select.xpath("//div[@id='content-about-hotel']/p/text()")
    hotel.description = ''.join(description)

    # Keyword-driven facility / feature flags.
    if u'无线' in servicestr:
        hotel.facility["Room_wifi"] = u'无线上网'
        hotel.facility["Public_wifi"] = u'无线上网'
    if u'泳' in servicestr:
        hotel.facility["Swimming_Pool"] = u'泳池'
    if u'健身' in servicestr:
        hotel.facility["gym"] = u"健身中心"
    if u'水疗' in servicestr:
        hotel.facility['Mandara_Spa'] = u"水疗中心"
    if u'酒吧' in hotel.description:
        hotel.facility["Bar"] = u'酒吧'
    if u'儿童俱乐部' in hotel.description:
        hotel.facility["Recreation"] = u"儿童俱乐部"
    if u'餐' in servicestr:
        hotel.facility["Restaurant"] = u"餐饮"
    if u'商务中心' in servicestr:
        hotel.facility["Business_Centre"] = u'商务中心'
    if u'亲子' in servicestr:
        hotel.feature["Parent_child"] = u'亲子'

    # Banner images are CSS background URLs inside style attributes; a style
    # without a url(...) is skipped instead of raising IndexError.
    img_url_pat = re.compile("url\('(.*?)'\)")
    imgurl_list = []
    for style in select.xpath(
            '//div[@class="RotateBanner-itemImg"]/span/@style'):
        imgurl_list.extend(img_url_pat.findall(style)[:1])
    hotel.img_items = '|'.join(imgurl_list)

    hotel.check_in_time = '14:00'
    hotel.check_out_time = '12:00'
    hotel.hotel_url = url

    script_urls = re.compile('<script src="//(.*?)"').findall(total_content)
    # The original indexed script_urls[0] unguarded, so a page with no
    # matching script raised IndexError instead of taking this branch.
    if not (script_urls and script_urls[0]):
        hotel.grade = '0.0'
        hotel.review_num = 0
        return hotel.to_dict()

    comment = requests.get("http://" + script_urls[0]).content
    # The review widget is an escaped HTML string, hence the literal
    # backslashes in both patterns.
    grade_pat = re.compile(
        '<div class=\\\\"rating-value\\\\">\\\\n(.*?)%', re.S)
    try:
        # Percentage score converted to a 0-10 scale.
        hotel.grade = str(float(grade_pat.findall(comment)[0].strip()) / 10)
    except (IndexError, ValueError):
        hotel.grade = '0.0'
    review_pat = re.compile(
        '<div class=\\\\"review-count\\\\">\\\\n(.*?)reviews', re.S)
    try:
        hotel.review_num = review_pat.findall(comment)[0].strip()
    except IndexError:
        hotel.review_num = 0

    result = hotel.to_dict()
    print(result)
    return result
def booking_parser(content, url, other_info):
    """Parse a Booking.com hotel detail page into a hotel dict.

    Names, coordinates, city and postal code come from regular expressions
    over the raw page; address, country, score, review count and first image
    come from the ``application/ld+json`` block; the remaining fields come
    from XPath lookups.

    :param content: page HTML.
    :param url: detail page URL, stored UTF-8 encoded as ``hotel_url``.
    :param other_info: dict providing ``source_id`` and ``city_id``.
    :return: the result of ``hotel.to_dict()``.
    :raises IndexError: if a mandatory pattern (name, coordinates, ld+json
        block, city, postal code, check-in/out caption) is missing.
    :raises KeyError: if ``other_info`` or the ld+json block lacks a required
        key.
    """
    hotel = HotelNewBase()
    try:
        root = HTML.fromstring(content)
    except Exception as e:
        # Without a DOM nothing below can work. The original printed and
        # carried on, only to die later with a NameError on ``root``.
        print(e)
        raise

    def clean_texts(expr):
        """Return the non-empty text nodes for expr, newlines/blanks removed."""
        return [
            i.replace('\n', '').strip() for i in root.xpath(expr)
            if i.replace('\n', '').strip()
        ]

    hotel.hotel_name = re.findall(r'b_hotel_name:.*?\'(.+?)\',',
                                  content)[0].strip()
    hotel.hotel_name_en = re.findall(r'hotelName:.*?\"(.+?)\",',
                                     content)[0].strip()
    hotel.source = 'booking'
    hotel.source_id = other_info['source_id']
    latitude = re.findall(r'b_map_center_latitude = (.*?);',
                          content)[0].strip()
    longitude = re.findall(r'b_map_center_longitude = (.*?);',
                           content)[0].strip()
    # Stored as "lon,lat" like every other parser in this file; this block
    # used to emit "lat,lon".
    hotel.map_info = '{},{}'.format(longitude, latitude)
    location_dict = json.loads(
        re.findall(r'<script type="application/ld\+json">(.*?)</script>',
                   content, re.S)[0].replace('\n', '').strip())
    hotel.address = location_dict['address']['streetAddress']
    hotel.city = re.findall(r'city_name:.*?\'(.*?)\'', content)[0].strip()
    hotel.country = location_dict['address']['addressCountry']
    hotel.city_id = other_info['city_id']
    hotel.postal_code = re.findall(r'"postalCode".*?\"(.*?)\"', content,
                                   re.S)[0].strip()
    try:
        hotel.star = root.xpath(
            '//*[@id="wrap-hotelpage-top"]/div[@class="hp__hotel-title"]/span/span[@class="hp__hotel_ratings__stars nowrap"]/i/@title'
        )[0].encode('utf-8').replace('星级酒店', '')
    except IndexError as e:
        print('Parser ERROR, NO Star Infomation.The reason follows: %s' % e)
    hotel.grade = location_dict['aggregateRating']['ratingValue']
    hotel.review_num = location_dict['aggregateRating']['reviewCount']
    hotel.Img_first = location_dict['image']

    try:
        hotel.traffic = ','.join([
            root.xpath('//*[@id="public_transport_options"]/div/text()')
            [1].strip('\n').strip(),
            root.xpath(
                '//*[@id="public_transport_options"]/ul/li/div[1]/text()')
            [1].strip('\n').strip(),
            root.xpath(
                '//*[@id="public_transport_options"]/ul/li/div[2]/text()')
            [0].strip('\n').strip()
        ])
    except IndexError as e:
        print('Parser ERROR, NO Traffic Infomation.The reason follows: %s' % e)

    hotel.chiled_bed_type = ''.join(clean_texts(
        '//*[@id="children_policy"]/p[position()>1]//text()|//*[@id="general-child-policy"]/p[position()>1]//text()'
    ))
    hotel.pet_type = ''.join(clean_texts(
        '//*[@id="hotelPoliciesInc"]/div[@class="description"]/p[position()>1]//text()'
    ))

    # Facility box section ids: -2 pets, 1 general, 2 activities, 3 services,
    # 5 bathroom, 6 media/technology, 7 food and drink, 11 internet,
    # 13 outdoors, 16 parking, 17 bedroom, 21 pool and wellness, 27 business.
    hot_facilities = clean_texts(
        '//*[@id="hp_facilities_box"]/div[@class="facilities-sliding-keep"]/div/div[@class="important_facility "]//text()'
    )
    wifi = ''.join(clean_texts(
        '//*[@id="hp_facilities_box"]//div[@data-section-id=11]/ul/li[@class="policy"]/p/span//text()'
    ))
    if u'免费无线网络连接' in hot_facilities or u'免费!住宿方于各处提供WiFi(免费)。' in wifi:
        hotel.facility_content['Public_wifi'] = wifi
    elif u'免费!住宿方于客房提供WiFi(免费)。' in wifi:
        hotel.facility_content['Room_wifi'] = wifi
    elif u'客房' in wifi and u'有线网络' in wifi:
        hotel.facility_content['Room_wired'] = wifi
    # Parenthesised on purpose: ``and`` binds tighter than ``or``, so the
    # original test fired on any text containing u'公共', wired or not.
    elif (u'公共' in wifi or u'各处' in wifi) and u'有线网络' in wifi:
        hotel.facility_content['Public_wired'] = wifi

    hotel.facility_content['Parking'] = ''.join(clean_texts(
        '//*[@id="hp_facilities_box"]//div[@data-section-id=16]//p//text()'
    ))

    # Add a new entry here to have that facility matched automatically.
    facilities_dict = {
        'Swimming_Pool': ['游泳池'],
        'gym': ['健身房'],
        'SPA': ['SPA'],
        'Bar': ['酒吧'],
        'Coffee_house': ['咖啡厅'],
        'Tennis_court': ['网球场'],
        'Golf_Course': ['高尔夫球场'],
        'Sauna': ['桑拿'],
        'Mandara_Spa': ['水疗中心'],
        'Recreation': ['儿童娱乐场', '儿童游乐场'],
        'Business_Centre': ['商务中心'],
        'Lounge': ['行政酒廊'],
        'Wedding_hall': ['婚礼礼堂'],
        'Restaurant': ['餐厅'],
        'Airport_bus': ['机场班车', '班车服务', '班车服务(收费)'],
        'Valet_Parking': ['代客泊车'],
        'Call_service': ['叫车服务'],
        'Rental_service': ['租车服务']
    }
    # Materialised as a list because it is iterated and joined several times.
    part_facilities = [
        x.encode('utf-8').replace('\n', '').strip() for x in root.xpath(
            '//*[@id="hp_facilities_box"]/div[@class="facilitiesChecklist"]/div/ul/li/span[@data-name-en]/text()'
        )
    ]
    parser_list = []
    for every in part_facilities:
        # Normalise page wording to the keywords used in facilities_dict.
        value = every.replace('咖啡', '咖啡厅').replace('网球', '网球场').replace(
            '健身', '健身房').replace('儿童娱乐', '儿童游乐').upper()
        for keys, faci in facilities_dict.items():
            # any(): an item matching several synonyms of one key is recorded
            # once, not once per synonym.
            if any(fac in value for fac in faci):
                if keys in hotel.facility_content:
                    hotel.facility_content[
                        keys] = hotel.facility_content[keys] + ',' + every
                else:
                    hotel.facility_content[keys] = every
                parser_list.append(every)
    print('酒店设施:{}'.format(', '.join(part_facilities)))
    print('已解析出:%s' % ', '.join(parser_list))

    service_list = [
        x.encode('utf-8').replace('\n', '').strip() for x in root.xpath(
            '//*[@id="hp_facilities_box"]//div[@data-section-id=3]/ul/li/span[1]/text()'
        )
    ]
    # Add a new entry here to have that service matched automatically.
    service_dict = {
        'Luggage_Deposit': '行李寄存',
        'front_desk': '24小时前台',
        'Lobby_Manager': '24小时大堂经理',
        '24Check_in': '24小时办理入住',
        'Security': '24小时安保',
        'Protocol': '礼宾服务',
        'wake': '叫醒服务',
        'Chinese_front': '中文前台',
        'Postal_Service': '邮政服务',
        'Fax_copy': '传真/复印',
        'Laundry': '洗衣服务',
        'polish_shoes': '擦鞋服务',
        'Frontdesk_safe': '前台保险柜',
        'fast_checkin': '快速办理入住/退房',
        'ATM': '自动柜员机(ATM)/银行服务',
        'child_care': '儿童看护服务',
        'Food_delivery': '送餐服务'
    }
    parser_sevice_list = []
    # NOTE(review): services are matched against part_facilities, while
    # service_list is only printed below. Confirm which list was intended.
    for every in part_facilities:
        for serv_key, serv in service_dict.items():
            if serv.replace('服务', '') in every:
                hotel.service_content[serv_key] = every
                parser_sevice_list.append(every)
    print('酒店服务:{}'.format(
        ', '.join(service_list) or '如果你看见了这句话请不要好奇,它表示酒店服务项目是空的'))
    print('已解析出:%s' % ', '.join(parser_sevice_list))

    hotel.img_items = '|'.join(
        root.xpath('//*[@id="photos_distinct"]/a[position()<last()-1]/@href'))
    if not hotel.img_items:
        hotel.img_items = '|'.join(
            root.xpath('//div[@class="bh-photo-grid-thumb-cell"]/a/@href'))
    hotel.description = '\n'.join(
        [x.strip() for x in root.xpath('//*[@id="summary"]/p/text()')])
    hotel.accepted_cards = '|'.join(root.xpath(
        '//*[@class="jq_tooltip payment_methods_overall"]/button/@aria-label|'
        '//div[contains(@class, "payment_promotion_labels")]/label/span/text()'
    ))

    # The captions embed <script> blocks that have to be stripped.
    hotel.check_in_time = re.sub(
        pattern=r'<script.+?script>',
        repl='',
        string=root.xpath('//*[@id="checkin_policy"]/p/span/@data-caption')
        [0].encode('utf-8'),
        flags=re.S).strip()
    hotel.check_out_time = re.sub(
        pattern=r'<script.+?script>',
        repl='',
        string=root.xpath('//*[@id="checkout_policy"]/p/span/@data-caption')
        [0].encode('utf-8'),
        flags=re.S).strip()
    hotel.hotel_url = url.encode('utf-8')

    result = hotel.to_dict()
    print(json.dumps(result, ensure_ascii=False))
    return result
def parse_hotel(self, req, resp):
    """Build the Hyatt hotel record from ``self.hotel_test``.

    Fixed fields are copied from ``self.hotel_test``; every entry of
    ``self.hotel_test['services']`` is lower-cased and filed under the first
    matching keyword rule into ``hotel.service``, ``hotel.facility`` or
    ``hotel.feature``.

    :param req: unused.
    :param resp: unused.
    :return: ``json.loads`` of ``hotel.to_dict()``.
    :raises KeyError: if ``self.hotel_test`` lacks one of the keys read here.
    """
    info = self.hotel_test
    hotel = Hotel_New()
    hotel.hotel_name = 'NULL'
    hotel.hotel_name_en = info['hotel_name_en']
    hotel.source = 'hyatt'
    hotel.source_id = info['source_id']
    hotel.brand_name = 'NULL'
    hotel.map_info = info['map_info']
    hotel.address = info['address']
    hotel.city = info['hotel_city']
    hotel.country = info['hotel_country']
    hotel.postal_code = info['hotel_postal_code']
    hotel.star = 5
    hotel.grade = 'NULL'
    hotel.review_num = 'NULL'
    hotel.Img_first = info['Img_first']
    hotel.hotel_phone = info['hotel_phone']
    hotel.hotel_zip_code = info['hotel_postal_code']
    hotel.traffic = ''
    hotel.chiled_bed_type = info['chiled_bed_type']
    hotel.pet_type = ''
    if info['has_wifi']:
        hotel.facility['Room_wifi'] = info['has_wifi']

    # (keywords, target attribute, key); the first matching rule wins, so more
    # specific keywords come before the generic ones they contain. All
    # keywords are lower-case because entries are lower-cased before matching.
    # The original compared capitalised needles ('sereno Spa',
    # "China_Friendly", ...) against the lower-cased text, which never match.
    rules = (
        (('faxing',), 'service', 'Fax_copy'),
        (('postal',), 'service', 'Postal_Service'),
        (('laundry',), 'service', 'Laundry'),
        (('room service',), 'service', 'Food_delivery'),
        (('concierge service',), 'service', 'Protocol'),
        (('babysitting',), 'service', 'child_care'),
        (('shoeshine',), 'service', 'polish_shoes'),
        (('valet parking',), 'facility', 'Valet_Parking'),
        (('parking',), 'facility', 'Parking'),
        (('wifi', 'wi-fi'), 'facility', 'Room_wifi'),
        (('pool',), 'facility', 'Swimming_Pool'),
        (('gym',), 'facility', 'gym'),
        (('bar',), 'facility', 'Bar'),
        # Key aligned with the 'Coffee_house' key used by the other parsers.
        (('coffee',), 'facility', 'Coffee_house'),
        # Must precede the generic 'spa' rule to be reachable.
        (('sereno spa',), 'facility', 'Mandara_Spa'),
        (('spa',), 'facility', 'SPA'),
        (('golf',), 'facility', 'Golf_Course'),
        (('restaurant',), 'facility', 'Restaurant'),
        (('sauna',), 'facility', 'Sauna'),
        (('service to airport', 'shuttle airport'), 'facility',
         'Airport_bus'),
        (('wedding',), 'facility', 'Wedding_hall'),
        (('business centre',), 'facility', 'Business_Centre'),
        (('tennis',), 'facility', 'Tennis_court'),
        (('china_friendly',), 'feature', 'China_Friendly'),
        (('romantic_lovers',), 'feature', 'Romantic_lovers'),
        (('parent_child',), 'feature', 'Parent_child'),
        (('beach_scene',), 'feature', 'Beach_Scene'),
        (('hot_spring',), 'feature', 'Hot_spring'),
        (('japanese_hotel',), 'feature', 'Japanese_Hotel'),
        (('vacation',), 'feature', 'Vacation'),
    )
    for one in info['services']:
        one = one.lower()
        for keywords, target, key in rules:
            if any(word in one for word in keywords):
                getattr(hotel, target)[key] = one
                break

    hotel.accepted_cards = 'NULL'
    hotel.check_in_time = info['check_in_time']
    hotel.check_out_time = info['check_out_time']
    hotel.hotel_url = self.url_en

    res = hotel.to_dict()
    res = json.loads(res)
    return res
def _agoda_first_of(getters, default):
    """Return the result of the first getter that does not raise.

    Args:
        getters: iterable of zero-argument callables, tried in order.
        default: value returned when every getter raises.
    """
    for getter in getters:
        try:
            return getter()
        except Exception:
            continue
    return default


def agoda_parser(content, url, other_info):
    """Parse an Agoda hotel detail page into a ``HotelNewBase`` record.

    Most fields are read from the ``propertyPageParams`` JavaScript object
    embedded in the page; each field falls back to scraping the HTML (or to a
    sentinel such as ``'NULL'`` / ``-1``) when the preferred source is missing.
    Services, description and check-in/out times come from an extra HTTP call
    to Agoda's ``AboutHotel`` API.

    Args:
        content: page source; decoded as UTF-8 when it is a byte string.
        url: URL of the hotel page, stored (UTF-8 encoded) on the record.
        other_info: dict read for ``'hid'`` (optional), ``'source_id'``
            (required when ``'hid'`` is absent) and ``'city_id'``.

    Returns:
        The populated ``HotelNewBase`` instance (not a dict).

    Raises:
        Exception: if the page cannot be parsed or does not contain the
            ``propertyPageParams`` script; ``KeyError`` if ``other_info`` or
            ``page_params`` lacks a required key.
    """
    hotel = HotelNewBase()

    # Decode separately from parsing: previously a decode failure was
    # swallowed and left `root` unbound, so the next line died with NameError.
    try:
        content = content.decode('utf-8')
    except (UnicodeError, AttributeError):
        # Already text, or not valid UTF-8: parse the content as given.
        pass
    root = HTML.fromstring(content)

    ph_runtime = execjs.get('PhantomJS')
    page_js = ph_runtime.compile(
        root.xpath('//script[contains(text(),"propertyPageParams")]/text()')[0])
    page_params = page_js.eval('propertyPageParams')

    # Hotel name: JS params first, then the #hotelname node, then <title>.
    hotel_name = _agoda_first_of([
        lambda: page_params['hotelInfo']['name'],
        lambda: root.xpath('//*[@id="hotelname"]/text()')[0].encode(
            'utf-8').strip(),
        lambda: root.xpath('//title/text()')[0].split('-')[0][:-1],
    ], None)

    # "Local name (English name)": text before '(' is the name, text inside
    # the parentheses is the English name. Without '(' both get the full text.
    try:
        paren = hotel_name.find('(')
        if paren == -1:
            hotel.hotel_name = hotel_name
            hotel.hotel_name_en = hotel_name
        else:
            hotel.hotel_name = hotel_name[:paren]
            hotel.hotel_name_en = hotel_name[paren + 1:-1]
    except Exception:
        hotel.hotel_name = 'NULL'
        hotel.hotel_name_en = 'NULL'

    # Address: use 'full' when it already contains the street part.
    try:
        address = page_params['hotelInfo']['address']
        if address['address'] in address['full']:
            hotel.address = address['full']
        else:
            hotel.address = address['address'] + address['full']
    except Exception:
        hotel.address = "NULL"

    # Star rating is the numeric suffix of the icon name.
    try:
        hotel.star = int(
            page_params['hotelInfo']['starRating']['icon'].split('-')[-1])
    except Exception:
        hotel.star = -1
    if hotel.star > 5:
        # Values above 5 are kept only when divisible by 5, then scaled down
        # by ten (e.g. 45 -> 4); anything else is treated as unknown.
        if hotel.star % 5 == 0:
            hotel.star = int(hotel.star / 10)
        else:
            hotel.star = -1

    # Coordinates from the latitude/longitude meta tags, stored as "lon,lat".
    try:
        lat_pat = re.compile(r'latitude\" content=(.*?) \/>', re.S)
        lon_pat = re.compile(r'longitude\" content=(.*?) \/>', re.S)
        lon_text = lon_pat.findall(content)[0][1:-1]  # strip the quotes
        lat_text = lat_pat.findall(content)[0][1:-1]
        hotel.map_info = lon_text + ',' + lat_text
    except Exception:
        hotel.map_info = 'NULL'

    # Review score.
    hotel.grade = _agoda_first_of([
        lambda: float(page_params['reviews']['score']),
        lambda: root.find_class('review-score-value')[0].text,
        lambda: page_params['masterRoomInfo'][0]['demographics'][
            'grades'][0]['score'],
    ], -1)

    # Review count.
    def _review_num_from_html():
        review_num = root.find_class('review-based-on-section')[0].xpath(
            './strong/text()')[0].encode('utf8').strip()
        return review_num_pat.findall(review_num)[0]

    hotel.review_num = _agoda_first_of([
        lambda: page_params['reviews']['reviewsCount'],
        _review_num_from_html,
        lambda: page_params['masterRoomInfo'][0]['demographics']['count'],
    ], -1)

    # Lead image.
    try:
        first_img = page_params.get("mosaicInitData", {}).get(
            'images', [])[0].get('Location', 'NULL')
        first_img = urljoin('http:', first_img)
    except Exception:
        first_img = 'NULL'

    # Image list: mosaic data, then per-room images, then the room grid,
    # and finally a regex over the raw page source.
    def _mosaic_images():
        locations = [
            'http:' + img['Location'].split('?')[0]
            for img in page_params['mosaicInitData']['images']
        ]
        return '|'.join(
            loc for loc in locations if 'hotel' in loc).encode('utf-8')

    def _master_room_images():
        img_lists = []
        for room in page_params['masterRoomInfo']:
            img_lists.extend(room['images'])
        return '|'.join(
            urljoin('http:', img) for img in img_lists).encode('utf-8')

    def _room_grid_images():
        return '|'.join([
            image
            for room in page_params['roomGridData']['masterRooms']
            for image in room['images']
        ])

    def _regex_images():
        img_json = images_url_pat.findall(content)[0]
        location_pat = re.compile(r'"Location":"(.*?)",', re.S)
        return '|'.join('http:' + loc for loc in location_pat.findall(img_json))

    hotel.img_items = _agoda_first_of([
        _mosaic_images,
        _master_room_images,
        _room_grid_images,
        _regex_images,
    ], 'NULL')

    # Fetch the AboutHotel payload once; it feeds services, description and
    # check-in/out times. An empty dict keeps the later lookups well-defined
    # when the request or the JSON decoding fails.
    # NOTE(review): this request has no timeout and can block indefinitely -
    # confirm whether a timeout should be passed here.
    json_data = {}
    try:
        service_url = "https://www.agoda.com/api/zh-cn/Hotel/AboutHotel?hotelId={0}".format(
            page_params['hotelId'])
        json_data = json.loads(requests.get(service_url).content)
    except Exception:
        json_data = {}

    def _about_services():
        return '|'.join([
            feature['name']
            for group in json_data['featureGroups']
            for feature in group['feature']
            if feature['available']
        ]).encode('utf-8')

    def _loved_features():
        return '|'.join([
            service['text'].strip()
            for services in page_params['featuresYouLove']['features']
            for service in services
        ])

    hotel.service = _agoda_first_of([_about_services, _loved_features], 'NULL')

    try:
        hotel.description = json_data['hotelDesc']['overview'].strip().replace(
            '<BR>', '').encode('utf-8')
    except Exception:
        hotel.description = 'NULL'

    # Check-in / check-out times from the "useful info" group whose name
    # contains the check-in/check-out label; only the first such group is used.
    try:
        for checkInOut in json_data['usefulInfoGroups']:
            if '入住/退房' in checkInOut['name']:
                for item in checkInOut['items']:
                    if '入住办理起始' in item['title']:
                        hotel.check_in_time = item['description']
                        break
                for item in checkInOut['items']:
                    if '退房办理截止' in item['title']:
                        hotel.check_out_time = item['description']
                        break
                break
    except Exception:
        pass

    # Alternative payload layout, used only when neither time was found.
    if hotel.check_in_time == 'NULL' and hotel.check_out_time == 'NULL':
        try:
            in_and_out = json_data.get("CheckInOutInfo", {})
            hotel.check_in_time = in_and_out.get("CheckInAndOutTime", {}).get(
                "CheckInTime", {}).get("From", {}).get("Description")
            hotel.check_out_time = in_and_out.get("CheckInAndOutTime", {}).get(
                "CheckOutTime", {}).get("Until", {}).get("Description")
        except Exception:
            pass

    # City / country information from the hotel page (all-or-nothing).
    try:
        country_id = page_params['hotelSearchCriteria']['countryId']
        country_name = page_params['hotelInfo']['address']['countryName']
        city_name = page_params['hotelInfo']['address']['cityName']
        city_id = page_params['hotelInfo']['address']['cityId']
    except Exception:
        country_id = 'NULL'
        country_name = 'NULL'
        city_name = 'NULL'
        city_id = 'NULL'

    hotel.others_info = json.dumps(
        {
            'country_id': country_id,
            'country_name': country_name,
            'city_name': city_name,
            'city_id': city_id,
            'first_img': first_img,
            'hid': other_info.get('hid'),
            'hotel_services_info': hotel.service
        },
        ensure_ascii=False)
    hotel.country = page_params['hotelInfo'].get('address', {}).get(
        'countryName', '')
    hotel.city = page_params['hotelInfo'].get('address', {}).get('cityName', '')
    hotel.accepted_cards = 'NULL'

    hotel.source = 'agoda'
    hotel.hotel_url = url.encode('utf-8')

    if other_info.get('hid'):
        # With an 'hid' supplied, the source id is read from the page itself.
        # re.search returns None when the marker is absent, so guard it
        # instead of crashing on `.groups()`.
        match = re.search(r'hotelId: ?(\d+),', content)
        if match:
            hotel.source_id = match.group(1)
        elif page_params.get('hotelId') is not None:
            # Same id the AboutHotel request above is built from.
            hotel.source_id = str(page_params['hotelId'])
        else:
            hotel.source_id = other_info.get('source_id', 'NULL')
    else:
        hotel.source_id = other_info['source_id']
    hotel.city_id = other_info['city_id']

    return hotel
def bestwestern_parser(content, url, other_info):
    """Parse a Best Western hotel page into a serialized hotel record.

    Args:
        content: two-item sequence; ``content[0]`` is passed to
            ``get_map_info`` for the coordinates and ``content[1]`` is the
            page HTML.
        url: hotel page URL; the text after the last ``-`` is used as the
            source id.
        other_info: dict providing ``'city_id'``.

    Returns:
        The result of ``hotel.to_dict()``.

    Raises:
        IndexError: when a required node is missing from the page (only the
            review count has a fallback).
        KeyError: when ``other_info`` has no ``'city_id'``.
    """
    lng_lat = content[0]
    html = etree.HTML(content[1])
    hotel = HotelNewBase()

    # XPath prefixes shared by the header/address lookups below.
    header_xpath = '//div[contains(@class,"hotelImagebloc")]'
    address_xpath = header_xpath + '//div[contains(@class,"addressContainer")]'
    slider_xpath = "//div[contains(@class, 'hotelImageSlider')]//li/img/@src"

    # Hotel name; the page offers a single name, reused as the English name.
    hotel.hotel_name = html.xpath(
        header_xpath + '//h1[@id="hotel-name"]/a/text()')[0]
    hotel.hotel_name_en = hotel.hotel_name
    # Source and source id.
    hotel.source = 'bestwestern'
    hotel.source_id = url.split('-')[-1]
    # Brand name.
    hotel.brand_name = get_brand_name(html)
    # Longitude / latitude.
    hotel.map_info = get_map_info(lng_lat)
    # Address: all address spans concatenated.
    hotel.address = "".join(html.xpath(address_xpath + '/span/text()'))
    # City.
    hotel.city = html.xpath(
        address_xpath + '/span[@id="address-1-city-state-zip"]/text()')[0]
    # Country: text of the last address span.
    hotel.country = html.xpath(address_xpath + '/span')[-1].text
    # City id supplied by the caller.
    hotel.city_id = other_info['city_id']
    # Postal code.
    hotel.postal_code = html.xpath(
        address_xpath + '//span[@class="postalCode"]/text()')[0]
    # Star rating is hard-coded.
    hotel.star = 5
    # Grade: leading part of the rating image's file name.
    hotel.grade = html.xpath('//div[@class="tripAdvisorOwl"]/img/@src'
                             )[0].split("/")[-1].split('-')[0]
    # Review count: first run of digits in the rating-count text, else 0.
    try:
        hotel.review_num = re.search(
            r'\d+',
            html.xpath(
                '//div[@class="hotelDetailsContainer"]//div[@id="hotel-reviews"]//div[@class="reviewRatingCount"]/text()'
            )[0]).group()
    except Exception:
        hotel.review_num = 0
    # Lead image.
    hotel.Img_first = html.xpath(slider_xpath)[0]
    # Phone number and the second entry of the same list.
    # NOTE(review): hotel_zip_code is filled from the phone-number links,
    # not from a postal-code node - confirm this is intended.
    phone_numbers = html.xpath(
        '//div[@class="phoneNumbers"]//p[@class="phoneNumber"]/a/text()')
    hotel.hotel_phone = phone_numbers[0]
    hotel.hotel_zip_code = phone_numbers[1]
    # Traffic information is not parsed.
    hotel.traffic = 'NULL'
    # Child / extra-bed policy is not parsed.
    hotel.chiled_bed_type = 'NULL'
    # Pet policy.
    hotel.pet_type = html.xpath(
        '//div[@class="policyContent uk-margin-small-left"]/text()')[0]
    # Features, facilities and services are filled in by helpers.
    get_feature(hotel, html)
    get_facility(hotel, html)
    get_service(hotel, html)
    # All slider images, comma separated.
    hotel.img_items = ",".join(html.xpath(slider_xpath))
    # Description.
    hotel.description = html.xpath(
        '//div[@class="hotelOverviewDetailSection"]/div[@class="overviewText"]/text()'
    )[0].strip()
    # Accepted payment cards are not parsed.
    hotel.accepted_cards = 'NULL'
    # Check-in time.
    hotel.check_in_time = html.xpath(
        '//div[@class="uk-width-3-10 checkInPositionContainer addressCheckInTableCell"]/p[2]/text()'
    )[0]
    # Check-out time.
    hotel.check_out_time = html.xpath(
        '//div[@class="phoneNumbers"]/div[contains(@class,"phonesRow")][1]/div[2]/p[2]/text()'
    )[0]
    # Hotel URL.
    hotel.hotel_url = url
    hotel_service_info = __get_hotel_service(html)
    hotel.others_info = json.dumps({"hotel_services_info": hotel_service_info})

    # The stray debug print was removed; the record is serialized once.
    return hotel.to_dict()