Пример #1
0
    def HtmlRoom(self, page, area, filename, list):
        """Collect per-item room fields extracted from an HTML page.

        For every item in ``list`` a key is derived from the MD5 hex digest of
        the item's string form, and each field is looked up through
        ``self.com_HDfunc``.  When the lookup returns the sentinel string
        ``'error'`` or raises, the default value of a fresh ``Room`` is stored
        instead.

        Args:
            page: Page content forwarded to ``self.com_HDfunc``.
            area: Area argument forwarded to ``self.com_HDfunc``.
            filename: Configuration file name forwarded to ``self.com_HDfunc``.
            list: Iterable of items to extract (name kept for compatibility
                with existing callers even though it shadows the builtin).

        NOTE(review): the visible part of this method only fills ``hotel_name``
        and ``city`` and has no return statement; the remainder of the
        original method is not visible here.
        """
        Dict = defaultdict(dict)
        room = Room()  # only used as the source of default field values

        for every in list:
            md5 = hashlib.md5(str(every).encode('utf-8')).hexdigest()

            for field in ('hotel_name', 'city'):
                fallback = getattr(room, field)
                try:
                    con_res = self.com_HDfunc(page, every, filename, area,
                                              field)
                    value = fallback if con_res == 'error' else con_res
                except Exception:
                    # Best effort: any extraction failure yields the default.
                    value = fallback
                Dict[md5][field] = value
Пример #2
0
def parseRoom(content, check_in, check_out, hotel_id):
    """Parse room-type chunks out of a hotel page.

    The content is stripped of newlines and passed through ``replace_char``
    before matching.  An empty list is returned when the cleaned content is
    empty or shorter than ``CONTENT_LEN``, when ``each_room_type_pat`` finds
    nothing, or when matching raises.

    NOTE(review): the visible part of this function ends right after the
    room-type match; ``room``, ``each_type_room_list`` and the date/id
    parameters are presumably consumed by code that is not visible here.
    """
    room_info = []
    room = Room()
    content = replace_char(content.replace('\n', ''))
    if content == '' or len(content) < CONTENT_LEN:
        return room_info

    try:
        each_type_room_list = each_room_type_pat.findall(content)
        if not each_type_room_list:
            return room_info
    except Exception:
        # Matching failed: report "no rooms" rather than propagating.
        return room_info
Пример #3
0
    def JsonRoom(self,filename, list):
        """Collect per-item room fields extracted from JSON-like items.

        For every item in ``list`` a key is derived from the MD5 hex digest of
        the item's string form.  ``currency`` and ``source`` are read straight
        from the ``'section'`` section of the configuration via
        ``self.GetConfValues``.  For the other fields the configured value is
        handed to ``self.GetUnder`` together with the item; if either step
        raises, the default value of a fresh ``Room`` is stored instead.

        Args:
            filename: Configuration file name passed to ``self.GetConfValues``.
            list: Iterable of items to extract (name kept for compatibility
                with existing callers even though it shadows the builtin).

        NOTE(review): the visible part of this method only fills
        ``currency``, ``source``, ``room_type`` and ``hotel_name`` and has no
        return statement; the remainder of the original method is not
        visible here.
        """
        Dict = defaultdict(dict)
        room = Room()  # only used as the source of default field values

        for every in list:
            md5 = hashlib.md5(str(every).encode('utf-8')).hexdigest()
            entry = Dict[md5]

            entry['currency'] = self.GetConfValues(filename, 'section', 'currency')
            entry['source'] = self.GetConfValues(filename, 'section', 'source')

            for field in ('room_type', 'hotel_name'):
                try:
                    path = self.GetConfValues(filename, 'section', field)
                    entry[field] = self.GetUnder(every, path)
                except Exception:
                    # Best effort: any lookup failure yields the default.
                    entry[field] = getattr(room, field)
Пример #4
0

def parseRoom(content, hotel_name, city_name_zh, check_in, check_out,
              hotel_id):
    """Parse rooms from a JSON payload with ``value.hotelRoomList``.

    Returns an empty list when ``content`` is missing or shorter than 100
    characters, or when the JSON cannot be loaded or lacks the expected keys.

    For every entry of ``hotelRoomList`` a ``Room`` is created; its
    ``room_type`` is the ``RoomName`` text before the first comma (the whole
    name when there is no comma or the comma is the first character) and its
    ``source_roomid`` is ``RoomId``.  Failures on a single field are logged
    and do not abort parsing.

    NOTE(review): the visible part of this function ends after
    ``source_roomid`` is set; it neither appends ``room`` nor returns
    ``room_list`` after the loop, and the remaining parameters are unused in
    the visible part.
    """
    room_list = []
    # `not content` also covers None, which previously crashed on len().
    if not content or len(content) < 100:
        return room_list

    try:
        content_json = json.loads(content)['value']['hotelRoomList']
    except Exception as e:
        logger.info('elongHotelParser: Cannot load json' + str(e))
        return room_list

    for each_hotel in content_json:
        room = Room()

        try:
            room_type = str(each_hotel['RoomName'])
            comma_at = room_type.find(',')
            room.room_type = room_type[:comma_at] if comma_at > 0 else room_type
        except Exception as e:
            logger.error('Cannot paese room type of this hotel!' + str(e))

        try:
            room.source_roomid = each_hotel['RoomId']
        except Exception as e:
            logger.info('Cannot parse this room id with error: ' + str(e))
Пример #5
0
    return url


def parseRoom(content, hotel_id, hotel_name, city, check_in, check_out):
    """Parse rooms of one hotel from a 'biyi' page.

    The first match of ``all_room_info_pat`` in ``content`` is split into
    per-room chunks with ``each_room_info_pat``.  Every chunk becomes a
    ``Room`` pre-filled with the hotel name (underscores replaced by spaces),
    city, hotel id, stay dates, currency ``'CNY'`` and source ``'biyi'``.

    Returns an empty list when the room section cannot be located.

    NOTE(review): the visible part of this function ends after the price is
    parsed; nothing is appended to ``rooms`` in the visible part.
    """
    rooms = []
    try:
        all_info = all_room_info_pat.findall(content)[0]
        each_room_info_list = each_room_info_pat.findall(all_info)
        # Parenthesised form prints identically on Python 2 and 3.
        print(len(each_room_info_list))
        # NOTE(review): a fixed 3 s pause inside a parser looks like leftover
        # debugging or throttling -- confirm before removing.
        time.sleep(3)
    except Exception as e:
        logger.error('Can not parse rooms info!' + str(e))
        return rooms

    for each_room_info in each_room_info_list:
        room = Room()
        room.hotel_name = hotel_name.replace('_', ' ')
        room.city = city
        room.source_hotelid = hotel_id
        room.currency = 'CNY'
        room.source = 'biyi'
        room.check_in = check_in
        room.check_out = check_out

        try:
            room.real_source = real_source_pat.findall(each_room_info)[0]
            raw_price = price_pat.findall(each_room_info)[0]
            room.price = raw_price.replace(' ', '').replace(',', '')
        except Exception:
            # A room without a real source or price silently ends parsing and
            # returns whatever has been collected so far.
            return rooms
Пример #6
0
        star = taskcontent.split('&')[2]
        ipathid = taskcontent.split('&')[1]
        city = taskcontent.split('&')[3]
        country = taskcontent.split('&')[4]
        from_date_temp = taskcontent.split('&')[5]
        from_date = from_date_temp[:4] + '-' + from_date_temp[4:6] + '-' \
                    + from_date_temp[6:]
        to_date_temp = datetime.datetime(int(from_date_temp[:4]), int(from_date_temp[4:6]), \
                                         int(from_date_temp[6:8]))
        to_date = str(to_date_temp + datetime.timedelta(days = 1))[:10]
    except Exception,e:
        logger.info('youzhanHotel: Wrong Content Format with %s'%taskcontent)
        result['error'] = TASK_ERROR
        return result

    room = Room()

    price_url = get_price_url(hotel_id,ipathid,from_date,to_date)
    i = 0
    content_len = 0
    while i < 5 and content_len < CONTENT_LEN:
        #p = get_proxy()
        p = get_proxy(source='youzhanHotel')
        #print p
        if p == None:
            result['error'] = PROXY_NONE
            return result

        url = price_url + str(int(time.time() * 1000))
        price_page = crawl_single_page(url,proxy=p,n=1)
        content_len = len(price_page)
Пример #7
0
    return result
    

def parseRoom(content,hotel_name,city_name_zh,check_in,check_out,hotel_id):
    """Parse rooms from a JSON payload with ``value.hotelRoomList``.

    Returns an empty list when ``content`` is missing or shorter than 100
    characters, or when the JSON cannot be loaded or lacks the expected keys.

    For every entry of ``hotelRoomList`` a ``Room`` is created; its
    ``room_type`` is the ``RoomName`` text before the first comma (the whole
    name when there is no comma or the comma is the first character) and its
    ``source_roomid`` is ``RoomId``.  Failures on a single field are logged
    and do not abort parsing.

    NOTE(review): the visible part of this function ends after
    ``source_roomid`` is set; it neither appends ``room`` nor returns
    ``room_list`` after the loop, and the remaining parameters are unused in
    the visible part.
    """
    room_list = []
    # `not content` also covers None, which previously crashed on len().
    if not content or len(content) < 100:
        return room_list

    try:
        content_json = json.loads(content)['value']['hotelRoomList']
    except Exception as e:
        logger.info('elongHotelParser: Cannot load json' + str(e))
        return room_list

    for each_hotel in content_json:
        room = Room()

        try:
            room_type = str(each_hotel['RoomName'])
            comma_at = room_type.find(',')
            room.room_type = room_type[:comma_at] if comma_at > 0 else room_type
        except Exception as e:
            logger.error('Cannot paese room type of this hotel!' + str(e))

        try:
            room.source_roomid = each_hotel['RoomId']
        except Exception as e:
            logger.info('Cannot parse this room id with error: ' + str(e))
Пример #8
0
def youzhan_task_parser(taskcontent):
    """Crawl one 'youzhan' hotel and return its hotel and room records.

    ``taskcontent`` is an ``&``-separated task string read positionally:
    hotel id, ipath id, star, city, country, check-in date (``YYYYMMDD``).
    Check-out is the day after check-in.

    The rating, map, info and price pages are fetched in that order through
    one proxy.

    Returns:
        ``[hotel_list, room_list]`` where ``hotel_list`` holds a single hotel
        tuple and ``room_list`` one tuple per price entry with more than
        three fields; ``[]`` when the map page yields no map info, the info
        page is empty, or the info page yields no info.

    Raises:
        IndexError: if ``taskcontent`` has fewer than six fields.
        ValueError: if the date field is not a valid numeric date.
    """
    room_list = []

    # Split once instead of re-splitting for every field.
    fields = taskcontent.encode('utf-8').strip().split('&')
    hotel_id = fields[0]
    ipathid = fields[1]
    star = fields[2]
    city = fields[3]
    country = fields[4]
    from_date_temp = fields[5]

    year = from_date_temp[:4]
    month = from_date_temp[4:6]
    day = from_date_temp[6:]
    from_date = year + '-' + month + '-' + day
    check_in_dt = datetime.datetime(int(year), int(month), int(day))
    to_date = str(check_in_dt + datetime.timedelta(days=1))[:10]

    # Acquire a proxy; the same one is used for every request below.
    p = get_proxy()

    hotel = Hotel()

    rating_page = crawl_single_page(get_rating_url(hotel_id), proxy=p)
    grade_str = grade_parser(rating_page)
    if grade_str != '':
        # The last character of the grade string is dropped.
        hotel.grade = grade_str[:-1]

    map_page = crawl_single_page(get_map_url(hotel_id), proxy=p)
    map_info_list = staticmap_parser(map_page)
    if not map_info_list:
        logger.error('youzhanHotel: Map info do not have hotel name and map_info')
        return []
    hotel.hotel_name = map_info_list[1]
    if is_alphabet(hotel.hotel_name.decode('utf-8')) == True:
        hotel.hotel_name_en = hotel.hotel_name
    else:
        hotel.hotel_name_en = 'NULL'
    hotel.map_info = map_info_list[0]

    info_page = crawl_single_page(get_info_url(hotel_id, from_date, to_date),
                                  proxy=p)
    if not info_page:
        return []
    info_list = info_parser(info_page)
    if not info_list:
        return []

    hotel.country = country
    hotel.city = city
    hotel.address = info_list[1]
    hotel_desc_temp = info_list[3].replace('&lt;br/&gt;', '').replace('&#039;', '')
    hotel.description = hotel_desc_temp if hotel_desc_temp != '' else 'NULL'
    hotel.service = info_list[4]

    if '停车场' in hotel.service:
        hotel.has_parking = 'Yes'
    if '无线网络' in hotel.service or 'wifi' in hotel.service:
        hotel.has_wifi = 'Yes'

    hotel.source = 'youzhan'
    hotel.source_id = hotel_id
    hotel.star = star

    price_url = get_price_url(hotel_id, ipathid, from_date, to_date)
    price_page = crawl_single_page(price_url, proxy=p)
    price_list = price_parser(price_page, hotel_id)

    for each_room in price_list or []:
        if len(each_room) <= 3:
            continue

        # A fresh Room per entry: reusing one instance let a room without a
        # price inherit the previous room's price.
        room = Room()
        room.city = city
        room.occupancy = 2
        room.hotel_name = hotel.hotel_name
        room.room_desc = each_room[3]
        room.real_source = each_room[2]

        # Room type is the description up to the first '-', and only kept
        # when it is shorter than 20 characters.
        dash_at = each_room[3].find('-')
        type_candidate = each_room[3][:dash_at] if dash_at > 0 else each_room[3]
        room.room_type = type_candidate if len(type_candidate) < 20 else 'NULL'

        if each_room[0] != u'nbsp;':
            room.price = each_room[0]
        room.has_breakfast = each_room[1]

        if '免费WiFi' in room.room_desc:
            hotel.is_wifi_free = 'Yes'

        # NOTE(review): this flag is set on the hotel, yet the hotel tuple has
        # no is_cancel_free field while the room tuple emits
        # room.is_cancel_free -- confirm whether the room was intended.
        if '免费取消' in room.room_desc:
            hotel.is_cancel_free = 'Yes'

        room.currency = 'CNY'
        room.source = 'youzhan'
        room.source_hotelid = hotel_id
        room.check_in = from_date
        room.check_out = to_date

        room_list.append((
            room.hotel_name, room.city, room.source, room.source_hotelid,
            room.source_roomid, room.real_source, room.room_type,
            room.occupancy, room.bed_type, room.size, room.floor,
            room.check_in, room.check_out, room.price, room.tax,
            room.currency, room.is_extrabed, room.is_extrabed_free,
            room.has_breakfast, room.is_breakfast_free, room.is_cancel_free,
            room.room_desc,
        ))

    hotel_tuple = (
        hotel.hotel_name, hotel.hotel_name_en, hotel.source, hotel.source_id,
        hotel.brand_name, hotel.map_info, hotel.address, hotel.city,
        hotel.country, hotel.postal_code, hotel.star, hotel.grade,
        hotel.has_wifi, hotel.is_wifi_free, hotel.has_parking,
        hotel.is_parking_free, hotel.service, hotel.img_items,
        hotel.description,
    )

    return [[hotel_tuple], room_list]
Пример #9
0
def youzhan_task_parser(taskcontent):
    """Crawl one 'youzhan' hotel and return its hotel and room records.

    ``taskcontent`` is an ``&``-separated task string read positionally:
    hotel id, ipath id, star, city, country, check-in date (``YYYYMMDD``).
    Check-out is the day after check-in.

    The rating, map, info and price pages are fetched in that order through
    one proxy.

    Returns:
        ``[hotel_list, room_list]`` where ``hotel_list`` holds a single hotel
        tuple and ``room_list`` one tuple per price entry with more than
        three fields; ``[]`` when the map page yields no map info, the info
        page is empty, or the info page yields no info.

    Raises:
        IndexError: if ``taskcontent`` has fewer than six fields.
        ValueError: if the date field is not a valid numeric date.
    """
    room_list = []

    # Split once instead of re-splitting for every field.
    fields = taskcontent.encode('utf-8').strip().split('&')
    hotel_id = fields[0]
    ipathid = fields[1]
    star = fields[2]
    city = fields[3]
    country = fields[4]
    from_date_temp = fields[5]

    year = from_date_temp[:4]
    month = from_date_temp[4:6]
    day = from_date_temp[6:]
    from_date = year + '-' + month + '-' + day
    check_in_dt = datetime.datetime(int(year), int(month), int(day))
    to_date = str(check_in_dt + datetime.timedelta(days=1))[:10]

    # Acquire a proxy; the same one is used for every request below.
    p = get_proxy()

    hotel = Hotel()

    rating_page = crawl_single_page(get_rating_url(hotel_id), proxy=p)
    grade_str = grade_parser(rating_page)
    if grade_str != '':
        # The last character of the grade string is dropped.
        hotel.grade = grade_str[:-1]

    map_page = crawl_single_page(get_map_url(hotel_id), proxy=p)
    map_info_list = staticmap_parser(map_page)
    if not map_info_list:
        logger.error(
            'youzhanHotel: Map info do not have hotel name and map_info')
        return []
    hotel.hotel_name = map_info_list[1]
    if is_alphabet(hotel.hotel_name.decode('utf-8')) == True:
        hotel.hotel_name_en = hotel.hotel_name
    else:
        hotel.hotel_name_en = 'NULL'
    hotel.map_info = map_info_list[0]

    info_page = crawl_single_page(get_info_url(hotel_id, from_date, to_date),
                                  proxy=p)
    if not info_page:
        return []
    info_list = info_parser(info_page)
    if not info_list:
        return []

    hotel.country = country
    hotel.city = city
    hotel.address = info_list[1]
    hotel_desc_temp = info_list[3].replace('&lt;br/&gt;',
                                           '').replace('&#039;', '')
    hotel.description = hotel_desc_temp if hotel_desc_temp != '' else 'NULL'
    hotel.service = info_list[4]

    if '停车场' in hotel.service:
        hotel.has_parking = 'Yes'
    if '无线网络' in hotel.service or 'wifi' in hotel.service:
        hotel.has_wifi = 'Yes'

    hotel.source = 'youzhan'
    hotel.source_id = hotel_id
    hotel.star = star

    price_url = get_price_url(hotel_id, ipathid, from_date, to_date)
    price_page = crawl_single_page(price_url, proxy=p)
    price_list = price_parser(price_page, hotel_id)

    for each_room in price_list or []:
        if len(each_room) <= 3:
            continue

        # A fresh Room per entry: reusing one instance let a room without a
        # price inherit the previous room's price.
        room = Room()
        room.city = city
        room.occupancy = 2
        room.hotel_name = hotel.hotel_name
        room.room_desc = each_room[3]
        room.real_source = each_room[2]

        # Room type is the description up to the first '-', and only kept
        # when it is shorter than 20 characters.
        dash_at = each_room[3].find('-')
        type_candidate = each_room[3][:dash_at] if dash_at > 0 else each_room[3]
        room.room_type = type_candidate if len(type_candidate) < 20 else 'NULL'

        if each_room[0] != u'nbsp;':
            room.price = each_room[0]
        room.has_breakfast = each_room[1]

        if '免费WiFi' in room.room_desc:
            hotel.is_wifi_free = 'Yes'

        # NOTE(review): this flag is set on the hotel, yet the hotel tuple has
        # no is_cancel_free field while the room tuple emits
        # room.is_cancel_free -- confirm whether the room was intended.
        if '免费取消' in room.room_desc:
            hotel.is_cancel_free = 'Yes'

        room.currency = 'CNY'
        room.source = 'youzhan'
        room.source_hotelid = hotel_id
        room.check_in = from_date
        room.check_out = to_date

        room_list.append((
            room.hotel_name, room.city, room.source, room.source_hotelid,
            room.source_roomid, room.real_source, room.room_type,
            room.occupancy, room.bed_type, room.size, room.floor,
            room.check_in, room.check_out, room.price, room.tax,
            room.currency, room.is_extrabed, room.is_extrabed_free,
            room.has_breakfast, room.is_breakfast_free, room.is_cancel_free,
            room.room_desc,
        ))

    hotel_tuple = (
        hotel.hotel_name, hotel.hotel_name_en, hotel.source, hotel.source_id,
        hotel.brand_name, hotel.map_info, hotel.address, hotel.city,
        hotel.country, hotel.postal_code, hotel.star, hotel.grade,
        hotel.has_wifi, hotel.is_wifi_free, hotel.has_parking,
        hotel.is_parking_free, hotel.service, hotel.img_items,
        hotel.description,
    )

    return [[hotel_tuple], room_list]
Пример #10
0
    return url


def parseRoom(content,hotel_id,hotel_name,city,check_in,check_out):
    """Parse rooms of one hotel from a 'biyi' page.

    The first match of ``all_room_info_pat`` in ``content`` is split into
    per-room chunks with ``each_room_info_pat``.  Every chunk becomes a
    ``Room`` pre-filled with the hotel name (underscores replaced by spaces),
    city, hotel id, stay dates, currency ``'CNY'`` and source ``'biyi'``.

    Returns an empty list when the room section cannot be located.

    NOTE(review): the visible part of this function ends after the price is
    parsed; nothing is appended to ``rooms`` in the visible part.
    """
    rooms = []
    try:
        all_info = all_room_info_pat.findall(content)[0]
        each_room_info_list = each_room_info_pat.findall(all_info)
        # Parenthesised form prints identically on Python 2 and 3.
        print(len(each_room_info_list))
        # NOTE(review): a fixed 3 s pause inside a parser looks like leftover
        # debugging or throttling -- confirm before removing.
        time.sleep(3)
    except Exception as e:
        logger.error('Can not parse rooms info!' + str(e))
        return rooms

    for each_room_info in each_room_info_list:
        room = Room()
        room.hotel_name = hotel_name.replace('_',' ')
        room.city = city
        room.source_hotelid = hotel_id
        room.currency = 'CNY'
        room.source = 'biyi'
        room.check_in = check_in
        room.check_out = check_out

        try:
            room.real_source = real_source_pat.findall(each_room_info)[0]
            raw_price = price_pat.findall(each_room_info)[0]
            room.price = raw_price.replace(' ','').replace(',','')
        except Exception:
            # A room without a real source or price silently ends parsing and
            # returns whatever has been collected so far.
            return rooms
        
Пример #11
0
 def __init__(self):
     """Initialise the instance with a new ``Room`` stored on ``self.room``."""
     self.room = Room()
Пример #12
0
    content = xmlescape(content_temp).replace('\n', '')

    #print content_temp
    try:
        each_hotel_content_list = rateplans_pat.findall(content)
        if len(each_hotel_content_list) == 0:
            return all_price
        _INFO('ctripHotel::parseRoom',
              ['Parse price failed because of no hotel found'])
    except Exception, e:
        _ERROR('ctripHotel::parseRoom',
               ['Parse price failed because of no hotel found',
                str(e)])

    for each_hotel_content in each_hotel_content_list:
        room = Room()

        try:
            room.source_hotelid = pattern_search(hotelcode_pat,
                                                 each_hotel_content)
            if room.source_hotelid == 'NULL':
                _INFO('ctripHotel::parseRoom', ['Cannot parse this hotel id'])
        except Exception, e:
            _ERROR(
                'ctripHotel::parseRoom',
                ['Cannot parse this hotel', str(e)])
        #print room.source_hotelid

        try:
            each_room_content_list = rateplan_pat.findall(each_hotel_content)
            if len(each_room_content_list) == 0:
Пример #13
0
            hotel_name_real = hotel_name_en = hotel_name
    except:
        #logger.error('haodingHotel::Cannot parse hotel name')
        return room_list

    try:
        room_type_list = each_type_room_content_pat.findall(rooms_content)
        if len(room_type_list) == 0:
            return room_list
    except Exceprion, e:
        #logger.error('haodingHotel::Cannot parse rooms of this hotel [' + hotel_id + ']')
        logger.error('haodingHotel::' + str(e))
        return room_list

    for each_type_room_content in room_type_list:
        room = Room()

        room.hotel_name = hotel_name_real
        room.city = city_name_zh
        room.source = 'hotels'
        room.source_hotelid = hotel_id
        room.real_source = 'hotels'
        room.currency = 'CNY'
        room.check_in = check_in
        room.check_out = check_out

        try:
            room_desc_temp = room_desc_pat.findall(each_type_room_content)[0].strip()
            room_desc_temp = '<' + room_desc_temp
            room.room_desc = re.sub('<.*?>','',room_desc_temp).replace('\n','').replace(' ',',')
            room.room_desc = room.room_desc.replace(',,','').replace(' ','').replace('。,','。')
Пример #14
0
        hotel_id = infos[0]
        city_name = infos[1]
        ipathid = cities_dict[city_name.encode('utf-8')]
        #logger.info(ipathid)
        room_type = infos[2]
        checkin_date = infos[3].split('-')[0]#format:2014-05-05
        checkout_date = infos[3].split('-')[1]#format:2014-05-06
        real_source = infos[4].split('::')[-1]
        #logger.info('type' + room_type + ' source' + real_source)
    except Exception,e:
        logger.error('wrong content format' + str(e))
        return -1
    
    p = get_proxy()
    
    room = Room()

    price_url = get_price_url(hotel_id,ipathid,checkin_date,checkout_date)
    price_page = crawl_single_page(price_url,n=1,proxy=p)
    price_list = price_parser(price_page,hotel_id)

    result = 1000000#设置一个极大值

    if price_list != []:
        for each_room in price_list:
            if len(each_room) > 3:
                #room.city = city
                #room.occupancy = 1
                #room.hotel_name = hotel.hotel_name
                #print each_room
                #room.room_desc = each_room[3]
Пример #15
0
        hotel_id = infos[0]
        city_name = infos[1]
        ipathid = cities_dict[city_name.encode('utf-8')]
        #logger.info(ipathid)
        room_type = infos[2]
        checkin_date = infos[3].split('-')[0]  #format:2014-05-05
        checkout_date = infos[3].split('-')[1]  #format:2014-05-06
        real_source = infos[4].split('::')[-1]
        #logger.info('type' + room_type + ' source' + real_source)
    except Exception, e:
        logger.error('wrong content format' + str(e))
        return -1

    p = get_proxy()

    room = Room()

    price_url = get_price_url(hotel_id, ipathid, checkin_date, checkout_date)
    price_page = crawl_single_page(price_url, n=1, proxy=p)
    price_list = price_parser(price_page, hotel_id)

    result = 1000000  #设置一个极大值

    if price_list != []:
        for each_room in price_list:
            if len(each_room) > 3:
                #room.city = city
                #room.occupancy = 1
                #room.hotel_name = hotel.hotel_name
                #print each_room
                #room.room_desc = each_room[3]