def HtmlRoom(self, page, area, filename, list): Dict = defaultdict(dict) room = Room() for every in list: md5 = hashlib.md5(str(every).encode('utf-8')).hexdigest() try: con_res = self.com_HDfunc(page, every, filename, area, 'hotel_name') if con_res == 'error': Dict[md5]['hotel_name'] = room.hotel_name else: Dict[md5]['hotel_name'] = con_res except Exception, e: Dict[md5]['hotel_name'] = room.hotel_name try: con_res = self.com_HDfunc(page, every, filename, area, 'city') if con_res == 'error': Dict[md5]['city'] = room.city else: Dict[md5]['city'] = con_res except Exception, e: Dict[md5]['city'] = room.city
def parseRoom(content, check_in, check_out, hotel_id): room_info = [] room = Room() content = content.replace('\n', '') content = replace_char(content) if content == '' or len(content) < CONTENT_LEN: return room_info try: each_type_room_list = each_room_type_pat.findall(content) if len(each_type_room_list) == 0: return room_info except Exception, e: return room_info
def JsonRoom(self,filename, list): Dict = defaultdict(dict) room = Room() for every in list: md5 = hashlib.md5(str(every).encode('utf-8')).hexdigest() Dict[md5]['currency'] = self.GetConfValues(filename, 'section', 'currency') Dict[md5]['source'] = self.GetConfValues(filename, 'section', 'source') try: roomtypepath = self.GetConfValues(filename, 'section', 'room_type') Dict[md5]['room_type'] = self.GetUnder(every, roomtypepath) except Exception, e: Dict[md5]['room_type'] = room.room_type try: hotel_namepath = self.GetConfValues(filename, 'section', 'hotel_name') Dict[md5]['hotel_name'] = self.GetUnder(every, hotel_namepath) except Exception, e: Dict[md5]['hotel_name'] = room.hotel_name
def parseRoom(content, hotel_name, city_name_zh, check_in, check_out, hotel_id): room_list = [] if content == '' or len(content) < 100: return room_list try: content_json = json.loads(content)['value']['hotelRoomList'] except Exception, e: logger.info('elongHotelParser: Cannot load json' + str(e)) return room_list for each_hotel in content_json: room = Room() try: room_type = str(each_hotel['RoomName']) num_temp1 = room_type.find(',') if num_temp1 > 0: room.room_type = room_type[:num_temp1] else: room.room_type = room_type except Exception, e: logger.error('Cannot paese room type of this hotel!' + str(e)) try: room.source_roomid = each_hotel['RoomId'] except Exception, e: logger.info('Cannot parse this room id with error: ' + str(e))
return url def parseRoom(content, hotel_id, hotel_name, city, check_in, check_out): rooms = [] try: all_info = all_room_info_pat.findall(content)[0] each_room_info_list = each_room_info_pat.findall(all_info) print len(each_room_info_list) time.sleep(3) except Exception, e: logger.error('Can not parse rooms info!' + str(e)) return rooms for each_room_info in each_room_info_list: room = Room() room.hotel_name = hotel_name.replace('_', ' ') room.city = city room.source_hotelid = hotel_id room.currency = 'CNY' room.source = 'biyi' room.check_in = check_in room.check_out = check_out try: room.real_source = real_source_pat.findall(each_room_info)[0] room.price = price_pat.findall(each_room_info)[0].replace( ' ', '').replace(',', '') except Exception, e: #logger.error('Can not parse important info of this room! Detail: ' + str(e)) return rooms
star = taskcontent.split('&')[2] ipathid = taskcontent.split('&')[1] city = taskcontent.split('&')[3] country = taskcontent.split('&')[4] from_date_temp = taskcontent.split('&')[5] from_date = from_date_temp[:4] + '-' + from_date_temp[4:6] + '-' \ + from_date_temp[6:] to_date_temp = datetime.datetime(int(from_date_temp[:4]), int(from_date_temp[4:6]), \ int(from_date_temp[6:8])) to_date = str(to_date_temp + datetime.timedelta(days = 1))[:10] except Exception,e: logger.info('youzhanHotel: Wrong Content Format with %s'%taskcontent) result['error'] = TASK_ERROR return result room = Room() price_url = get_price_url(hotel_id,ipathid,from_date,to_date) i = 0 content_len = 0 while i < 5 and content_len < CONTENT_LEN: #p = get_proxy() p = get_proxy(source='youzhanHotel') #print p if p == None: result['error'] = PROXY_NONE return result url = price_url + str(int(time.time() * 1000)) price_page = crawl_single_page(url,proxy=p,n=1) content_len = len(price_page)
return result def parseRoom(content,hotel_name,city_name_zh,check_in,check_out,hotel_id): room_list = [] if content == '' or len(content) < 100: return room_list try: content_json = json.loads(content)['value']['hotelRoomList'] except Exception, e: logger.info('elongHotelParser: Cannot load json' + str(e)) return room_list for each_hotel in content_json: room = Room() try: room_type = str(each_hotel['RoomName']) num_temp1 = room_type.find(',') if num_temp1 > 0: room.room_type = room_type[:num_temp1] else: room.room_type = room_type except Exception,e: logger.error('Cannot paese room type of this hotel!' + str(e)) try: room.source_roomid = each_hotel['RoomId'] except Exception, e: logger.info('Cannot parse this room id with error: ' + str(e))
def youzhan_task_parser(taskcontent):
    """Crawl one youzhan hotel and return its hotel record plus room records.

    ``taskcontent`` is an ``&``-separated task string whose fields are, in
    order: hotel id, ipath id, star, city, country and check-in date written
    as ``YYYYMMDD``.  The check-out date is the day after check-in.

    Returns ``[[hotel_tuple], room_list]`` on success.  Returns ``[]`` when
    the map page yields no hotel name / map info, when the info page is
    empty, or when the info page cannot be parsed.

    Raises ``IndexError`` when the task string has fewer than six fields and
    ``ValueError`` when the date field is not a valid ``YYYYMMDD`` date.
    """
    # Split once instead of re-splitting the task string for every field.
    fields = taskcontent.encode('utf-8').strip().split('&')
    hotel_id = fields[0]
    ipathid = fields[1]
    star = fields[2]
    city = fields[3]
    country = fields[4]
    from_date_temp = fields[5]

    year = from_date_temp[:4]
    month = from_date_temp[4:6]
    day = from_date_temp[6:]
    from_date = year + '-' + month + '-' + day
    check_in_day = datetime.datetime(int(year), int(month), int(day))
    # str(datetime) is 'YYYY-MM-DD HH:MM:SS'; keep only the date part.
    to_date = str(check_in_day + datetime.timedelta(days=1))[:10]

    # Get a proxy; the same one is used for every page of this hotel.
    p = get_proxy()

    hotel = Hotel()

    rating_page = crawl_single_page(get_rating_url(hotel_id), proxy=p)
    grade_str = grade_parser(rating_page)
    if grade_str != '':
        # The last character of the parsed grade is dropped.
        hotel.grade = grade_str[:-1]

    map_page = crawl_single_page(get_map_url(hotel_id), proxy=p)
    map_info_list = staticmap_parser(map_page)
    if map_info_list == []:
        logger.error('youzhanHotel: Map info do not have hotel name and map_info')
        return []
    hotel.hotel_name = map_info_list[1]
    if is_alphabet(hotel.hotel_name.decode('utf-8')) == True:
        hotel.hotel_name_en = hotel.hotel_name
    else:
        hotel.hotel_name_en = 'NULL'
    hotel.map_info = map_info_list[0]

    info_page = crawl_single_page(get_info_url(hotel_id, from_date, to_date),
                                  proxy=p)
    if info_page == '':
        return []
    info_list = info_parser(info_page)
    if info_list == []:
        return []

    hotel.country = country
    hotel.city = city
    hotel.address = info_list[1]
    # NOTE(review): the second literal was garbled into an unterminated quote
    # in this copy of the file (a syntax error).  '&#39;' (the HTML apostrophe
    # entity) is the presumed original -- confirm against the upstream source.
    hotel_desc_temp = info_list[3].replace('<br/>', '').replace('&#39;', '')
    hotel.description = hotel_desc_temp if hotel_desc_temp != '' else 'NULL'
    hotel.service = info_list[4]
    if '停车场' in hotel.service:
        hotel.has_parking = 'Yes'
    if '无线网络' in hotel.service or 'wifi' in hotel.service:
        hotel.has_wifi = 'Yes'

    hotel.source = 'youzhan'
    hotel.source_id = hotel_id
    hotel.star = star

    price_page = crawl_single_page(
        get_price_url(hotel_id, ipathid, from_date, to_date), proxy=p)
    price_list = price_parser(price_page, hotel_id)

    room_list = []
    for each_room in price_list:
        # Entries are indexed as (price, breakfast, real source, description);
        # shorter ones cannot be used.
        if len(each_room) <= 3:
            continue

        # A fresh Room per entry: reusing one instance let a room without a
        # price inherit the price parsed for the previous room.
        room = Room()
        room.city = city
        room.occupancy = 2
        room.hotel_name = hotel.hotel_name
        room.room_desc = each_room[3]
        room.real_source = each_room[2]

        # The room type is the text before the first '-', if there is one;
        # anything 20 characters or longer is not treated as a type name.
        dash = each_room[3].find('-')
        type_text = each_room[3][:dash] if dash > 0 else each_room[3]
        room.room_type = type_text if len(type_text) < 20 else 'NULL'

        if each_room[0] != u'nbsp;':
            room.price = each_room[0]
        room.has_breakfast = each_room[1]

        if '免费WiFi' in room.room_desc:
            hotel.is_wifi_free = 'Yes'
        if '免费取消' in room.room_desc:
            # The flag belongs on the room: the room tuple below is the only
            # place is_cancel_free is emitted (it used to be set on the hotel,
            # where nothing read it).
            room.is_cancel_free = 'Yes'

        room.currency = 'CNY'
        room.source = 'youzhan'
        room.source_hotelid = hotel_id
        room.check_in = from_date
        room.check_out = to_date

        room_list.append((
            room.hotel_name, room.city, room.source, room.source_hotelid,
            room.source_roomid, room.real_source, room.room_type,
            room.occupancy, room.bed_type, room.size, room.floor,
            room.check_in, room.check_out, room.price, room.tax,
            room.currency, room.is_extrabed, room.is_extrabed_free,
            room.has_breakfast, room.is_breakfast_free, room.is_cancel_free,
            room.room_desc))

    # Built after the loop because room descriptions can set is_wifi_free.
    hotel_tuple = (
        hotel.hotel_name, hotel.hotel_name_en, hotel.source, hotel.source_id,
        hotel.brand_name, hotel.map_info, hotel.address, hotel.city,
        hotel.country, hotel.postal_code, hotel.star, hotel.grade,
        hotel.has_wifi, hotel.is_wifi_free, hotel.has_parking,
        hotel.is_parking_free, hotel.service, hotel.img_items,
        hotel.description)

    return [[hotel_tuple], room_list]
def youzhan_task_parser(taskcontent):
    """Fetch rating, map, info and price pages for one hotel and parse them.

    Args:
        taskcontent: ``&``-separated task string.  Field 0 is the hotel id,
            1 the ipath id, 2 the star rating, 3 the city, 4 the country and
            5 the check-in date as ``YYYYMMDD``.  Check-out is one day later.

    Returns:
        ``[[hotel_tuple], room_list]`` where ``room_list`` holds one tuple per
        usable price entry, or ``[]`` if the map page has no hotel name / map
        info, the info page is empty, or the info page cannot be parsed.

    Raises:
        IndexError: the task string has fewer than six fields.
        ValueError: the date field is not a valid ``YYYYMMDD`` date.
    """
    # One split is enough; every field is read from the same list.
    parts = taskcontent.encode('utf-8').strip().split('&')
    hotel_id, ipathid, star = parts[0], parts[1], parts[2]
    city, country = parts[3], parts[4]
    from_date_temp = parts[5]

    from_date = '-'.join(
        [from_date_temp[:4], from_date_temp[4:6], from_date_temp[6:]])
    to_date_temp = datetime.datetime(int(from_date_temp[:4]),
                                     int(from_date_temp[4:6]),
                                     int(from_date_temp[6:]))
    # Keep only the 'YYYY-MM-DD' prefix of the datetime's string form.
    to_date = str(to_date_temp + datetime.timedelta(days=1))[:10]

    # Get a proxy and reuse it for all four page requests.
    p = get_proxy()

    hotel = Hotel()

    # --- rating -----------------------------------------------------------
    rating_url = get_rating_url(hotel_id)
    grade_str = grade_parser(crawl_single_page(rating_url, proxy=p))
    if grade_str != '':
        # Drop the trailing character of the parsed grade string.
        hotel.grade = grade_str[:-1]

    # --- name and map position --------------------------------------------
    map_url = get_map_url(hotel_id)
    map_info_list = staticmap_parser(crawl_single_page(map_url, proxy=p))
    if map_info_list == []:
        logger.error(
            'youzhanHotel: Map info do not have hotel name and map_info')
        return []
    hotel.hotel_name = map_info_list[1]
    if is_alphabet(hotel.hotel_name.decode('utf-8')) == True:
        hotel.hotel_name_en = hotel.hotel_name
    else:
        hotel.hotel_name_en = 'NULL'
    hotel.map_info = map_info_list[0]

    # --- address, description, services ------------------------------------
    info_url = get_info_url(hotel_id, from_date, to_date)
    info_page = crawl_single_page(info_url, proxy=p)
    if info_page == '':
        return []
    info_list = info_parser(info_page)
    if info_list == []:
        return []

    hotel.country = country
    hotel.city = city
    hotel.address = info_list[1]
    # NOTE(review): in this copy of the file the second replace() argument was
    # garbled into an unterminated quote, which does not parse.  '&#39;' (HTML
    # entity for an apostrophe) is assumed to be the original -- please verify.
    hotel_desc_temp = info_list[3].replace('<br/>', '').replace('&#39;', '')
    if hotel_desc_temp != '':
        hotel.description = hotel_desc_temp
    else:
        hotel.description = 'NULL'
    hotel.service = info_list[4]
    if '停车场' in hotel.service:
        hotel.has_parking = 'Yes'
    if '无线网络' in hotel.service or 'wifi' in hotel.service:
        hotel.has_wifi = 'Yes'

    hotel.source = 'youzhan'
    hotel.source_id = hotel_id
    hotel.star = star

    # --- rooms and prices ---------------------------------------------------
    price_url = get_price_url(hotel_id, ipathid, from_date, to_date)
    price_list = price_parser(crawl_single_page(price_url, proxy=p), hotel_id)

    room_list = []
    for each_room in price_list:
        if len(each_room) > 3:
            # Build a new Room for each entry.  Sharing one instance across
            # iterations carried the previous room's price over to any entry
            # whose own price was the 'nbsp;' placeholder.
            room = Room()
            room.city = city
            room.occupancy = 2
            room.hotel_name = hotel.hotel_name
            room.real_source = each_room[2]
            room.room_desc = each_room[3]

            # Text before the first '-' (or the whole description when there
            # is none) is the room type, unless it is 20+ characters long.
            num = each_room[3].find('-')
            candidate = each_room[3][:num] if num > 0 else each_room[3]
            if len(candidate) < 20:
                room.room_type = candidate
            else:
                room.room_type = 'NULL'

            if each_room[0] != u'nbsp;':
                room.price = each_room[0]
            room.has_breakfast = each_room[1]

            if '免费WiFi' in room.room_desc:
                hotel.is_wifi_free = 'Yes'
            if '免费取消' in room.room_desc:
                # Previously written to hotel.is_cancel_free, which nothing
                # reads; the room tuple is what reports is_cancel_free.
                room.is_cancel_free = 'Yes'

            room.currency = 'CNY'
            room.source = 'youzhan'
            room.source_hotelid = hotel_id
            room.check_in = from_date
            room.check_out = to_date

            room_tuple = (
                room.hotel_name, room.city, room.source, room.source_hotelid,
                room.source_roomid, room.real_source, room.room_type,
                room.occupancy, room.bed_type, room.size, room.floor,
                room.check_in, room.check_out, room.price, room.tax,
                room.currency, room.is_extrabed, room.is_extrabed_free,
                room.has_breakfast, room.is_breakfast_free,
                room.is_cancel_free, room.room_desc)
            room_list.append(room_tuple)

    # The hotel tuple is assembled last: is_wifi_free may be set by the loop.
    hotel_tuple = (
        hotel.hotel_name, hotel.hotel_name_en, hotel.source, hotel.source_id,
        hotel.brand_name, hotel.map_info, hotel.address, hotel.city,
        hotel.country, hotel.postal_code, hotel.star, hotel.grade,
        hotel.has_wifi, hotel.is_wifi_free, hotel.has_parking,
        hotel.is_parking_free, hotel.service, hotel.img_items,
        hotel.description)

    return [[hotel_tuple], room_list]
return url def parseRoom(content,hotel_id,hotel_name,city,check_in,check_out): rooms = [] try: all_info = all_room_info_pat.findall(content)[0] each_room_info_list = each_room_info_pat.findall(all_info) print len(each_room_info_list) time.sleep(3) except Exception, e: logger.error('Can not parse rooms info!' + str(e)) return rooms for each_room_info in each_room_info_list: room = Room() room.hotel_name = hotel_name.replace('_',' ') room.city = city room.source_hotelid = hotel_id room.currency = 'CNY' room.source = 'biyi' room.check_in = check_in room.check_out = check_out try: room.real_source = real_source_pat.findall(each_room_info)[0] room.price = price_pat.findall(each_room_info)[0].replace(' ','').replace(',','') except Exception, e: #logger.error('Can not parse important info of this room! Detail: ' + str(e)) return rooms
def __init__(self):
    """Store a newly constructed ``Room`` on the instance as ``self.room``."""
    self.room = Room()
content = xmlescape(content_temp).replace('\n', '') #print content_temp try: each_hotel_content_list = rateplans_pat.findall(content) if len(each_hotel_content_list) == 0: return all_price _INFO('ctripHotel::parseRoom', ['Parse price failed because of no hotel found']) except Exception, e: _ERROR('ctripHotel::parseRoom', ['Parse price failed because of no hotel found', str(e)]) for each_hotel_content in each_hotel_content_list: room = Room() try: room.source_hotelid = pattern_search(hotelcode_pat, each_hotel_content) if room.source_hotelid == 'NULL': _INFO('ctripHotel::parseRoom', ['Cannot parse this hotel id']) except Exception, e: _ERROR( 'ctripHotel::parseRoom', ['Cannot parse this hotel', str(e)]) #print room.source_hotelid try: each_room_content_list = rateplan_pat.findall(each_hotel_content) if len(each_room_content_list) == 0:
hotel_name_real = hotel_name_en = hotel_name except: #logger.error('haodingHotel::Cannot parse hotel name') return room_list try: room_type_list = each_type_room_content_pat.findall(rooms_content) if len(room_type_list) == 0: return room_list except Exceprion, e: #logger.error('haodingHotel::Cannot parse rooms of this hotel [' + hotel_id + ']') logger.error('haodingHotel::' + str(e)) return room_list for each_type_room_content in room_type_list: room = Room() room.hotel_name = hotel_name_real room.city = city_name_zh room.source = 'hotels' room.source_hotelid = hotel_id room.real_source = 'hotels' room.currency = 'CNY' room.check_in = check_in room.check_out = check_out try: room_desc_temp = room_desc_pat.findall(each_type_room_content)[0].strip() room_desc_temp = '<' + room_desc_temp room.room_desc = re.sub('<.*?>','',room_desc_temp).replace('\n','').replace(' ',',') room.room_desc = room.room_desc.replace(',,','').replace(' ','').replace('。,','。')
hotel_id = infos[0] city_name = infos[1] ipathid = cities_dict[city_name.encode('utf-8')] #logger.info(ipathid) room_type = infos[2] checkin_date = infos[3].split('-')[0]#format:2014-05-05 checkout_date = infos[3].split('-')[1]#format:2014-05-06 real_source = infos[4].split('::')[-1] #logger.info('type' + room_type + ' source' + real_source) except Exception,e: logger.error('wrong content format' + str(e)) return -1 p = get_proxy() room = Room() price_url = get_price_url(hotel_id,ipathid,checkin_date,checkout_date) price_page = crawl_single_page(price_url,n=1,proxy=p) price_list = price_parser(price_page,hotel_id) result = 1000000#设置一个极大值 if price_list != []: for each_room in price_list: if len(each_room) > 3: #room.city = city #room.occupancy = 1 #room.hotel_name = hotel.hotel_name #print each_room #room.room_desc = each_room[3]
hotel_id = infos[0] city_name = infos[1] ipathid = cities_dict[city_name.encode('utf-8')] #logger.info(ipathid) room_type = infos[2] checkin_date = infos[3].split('-')[0] #format:2014-05-05 checkout_date = infos[3].split('-')[1] #format:2014-05-06 real_source = infos[4].split('::')[-1] #logger.info('type' + room_type + ' source' + real_source) except Exception, e: logger.error('wrong content format' + str(e)) return -1 p = get_proxy() room = Room() price_url = get_price_url(hotel_id, ipathid, checkin_date, checkout_date) price_page = crawl_single_page(price_url, n=1, proxy=p) price_list = price_parser(price_page, hotel_id) result = 1000000 #设置一个极大值 if price_list != []: for each_room in price_list: if len(each_room) > 3: #room.city = city #room.occupancy = 1 #room.hotel_name = hotel.hotel_name #print each_room #room.room_desc = each_room[3]