def parse_local_page(self, response):
    """Scrape one localized version of a place page and queue the next one.

    ``response.meta['remain']`` holds the languages still to be processed;
    its first entry is the language of ``response``. The localized fields
    are appended as a dict to ``item['place']``.

    Returns a request for the next queued language (carrying ``remain``,
    ``links`` and ``item`` in its meta), or the item itself once no
    language is left.
    """
    queue = response.meta['remain']
    current_lang, remain = queue[0], queue[1:]
    sel = Selector(response)
    item = response.meta['item']

    # (key in the place dict, xpath expression) for every list-valued field.
    xpath_fields = (
        ('name', helper.SEL_HOTEL_NAME),
        ('address_area_name', helper.SEL_AREA_NAME),
        ('address_street', helper.SEL_AREA_STREET),
        ('address_locality', helper.SEL_AREA_LOCALITY),
        ('address_region', helper.SEL_AREA_REGION),
        ('address_zip', helper.SEL_AREA_ZIP),
        ('amenity', helper.SEL_AMENITIES),
    )
    place = {'lang': current_lang}
    for key, query in xpath_fields:
        place[key] = sel.xpath(query).extract()
    place['page_body'] = helper.get_body(sel)
    item['place'].append(place)

    # Last language done: hand the finished item back.
    if not remain:
        return item

    from ghost_spider.settings import REQUEST_HEADERS
    links = response.meta['links']
    next_lang = remain[0]
    request = Request(
        links[next_lang],
        headers=REQUEST_HEADERS[next_lang],
        callback=self.parse_local_page,
        errback=self.parse_err
    )
    request.meta.update({'remain': remain, 'links': links, 'item': item})
    return request
def parse_place(self, response):
    """Parse the main place page and start crawling its localized versions.

    Fills a ``HotelItem`` with the fields scraped from ``response``, resolves
    the Spanish, Japanese and Chinese page links, and returns a request for
    the Japanese page. The request meta carries the language queue
    (``remain``), the resolved ``links`` and the ``item``.

    Returns ``None`` (after printing a message) when any of the language
    links cannot be found on the page.
    """
    if response.meta.get('area_name'):
        # The fallback must apply to the level itself: the previous form
        # `"-----" * level or 1` multiplied first and failed on a None level.
        depth = response.meta.get('area_level') or 1
        print("%s> %s" % ("-----" * depth, response.meta['area_name']))

    sel = Selector(response)
    item = HotelItem()
    item['page_url'] = response.url
    item['page_breadcrumbs'] = sel.xpath(helper.SEL_BREADCRUMBS).extract()
    item['name'] = sel.xpath(helper.SEL_HOTEL_NAME).extract()
    item['phone'] = sel.xpath(helper.SEL_PHONE_NUMBER).extract()
    item['address_area_name'] = sel.xpath(helper.SEL_AREA_NAME).extract()
    item['address_street'] = sel.xpath(helper.SEL_AREA_STREET).extract()
    item['address_locality'] = sel.xpath(helper.SEL_AREA_LOCALITY).extract()
    item['address_region'] = sel.xpath(helper.SEL_AREA_REGION).extract()
    item['address_zip'] = sel.xpath(helper.SEL_AREA_ZIP).extract()
    item['amenity'] = sel.xpath(helper.SEL_AMENITIES).extract()
    item['rating'] = sel.xpath(helper.SEL_RATING).re(r'(.*)\s*of 5')
    item['popularity'] = sel.xpath(helper.SEL_PERCENT).re(r'(.*)\s*%')
    item['page_body'] = helper.get_body(sel)

    # Keep only the first match for each language link.
    link_queries = (
        ('es', helper.SEL_SPANISH_PAGE),
        ('ja', helper.SEL_JAPANESE_PAGE),
        ('zh', helper.SEL_CHINESE_PAGE),
    )
    links = {}
    for lang, query in link_queries:
        found = sel.xpath(query).extract()
        if not found:
            # Without every link the language chain cannot complete, so the
            # page is skipped instead of failing with an IndexError.
            print("couldn't index this page | %s" % response.url)
            return None
        links[lang] = found[0]

    request = Request(links['ja'], callback=self.parse_local_page)
    request.meta['remain'] = ['ja', 'es', 'zh']
    request.meta['links'] = links
    request.meta['item'] = item
    return request
def parse_place(self, response):
    """Parse hotel/restaurant/spot page.

    Builds a ``HotelItem`` from ``response`` with the language-independent
    fields plus an English entry in ``item['place']``. The Japanese page is
    always queued; a French or (otherwise) Spanish page is added when
    ``need_french_page`` / ``need_spanish_page`` say so for the breadcrumbs.

    Returns a request for the Japanese page whose meta carries the language
    queue (``remain``), the resolved ``links`` and the ``item``, or ``None``
    when one of the required language links is missing from the page.
    """
    # NOTE(review): self.log is used both as .msg(..., level=.INFO) and as
    # .error(...); confirm the object bound there exposes both.
    if response.meta.get('area_name') and self.log:
        # The fallback must apply to the level itself: the previous form
        # `"-----" * level or 1` multiplied first and failed on a None level.
        depth = response.meta.get('area_level') or 1
        self.log.msg(
            u'%s> %s' % ("-----" * depth, response.meta['area_name']),
            level=self.log.INFO
        )

    sel = Selector(response)
    item = HotelItem()
    item['page_url'] = response.url
    item['page_breadcrumbs'] = sel.xpath(helper.SEL_BREADCRUMBS).extract()
    item['name'] = sel.xpath(helper.SEL_HOTEL_NAME).extract()
    item['phone'] = sel.xpath(helper.SEL_PHONE_NUMBER).extract()
    item['rating'] = sel.xpath(helper.SEL_RATING).re(r'(.*)\s*of 5')
    item['popularity'] = sel.xpath(helper.SEL_PERCENT).re(r'(.*)\s*%')
    item['region'] = sel.xpath(helper.SEL_AREA_REGION).extract()

    place = {
        'lang': 'en',
        'name': item['name'],
        'address_area_name': sel.xpath(helper.SEL_AREA_NAME).extract(),
        'address_street': sel.xpath(helper.SEL_AREA_STREET).extract(),
        'address_locality': sel.xpath(helper.SEL_AREA_LOCALITY).extract(),
        'address_region': sel.xpath(helper.SEL_AREA_REGION).extract(),
        'address_zip': sel.xpath(helper.SEL_AREA_ZIP).extract(),
        'amenity': sel.xpath(helper.SEL_AMENITIES).extract(),
        'page_body': helper.get_body(sel)
    }
    # save list of places by language
    item['place'] = [place]

    links = {
        'ja': sel.xpath(helper.SEL_JAPANESE_PAGE).extract(),
    }
    remain = ['ja']
    if self.need_french_page(item['page_breadcrumbs']):
        links['fr'] = sel.xpath(helper.SEL_FRENCH_PAGE).extract()
        remain.append('fr')
    elif self.need_spanish_page(item['page_breadcrumbs']):
        links['es'] = sel.xpath(helper.SEL_SPANISH_PAGE).extract()
        remain.append('es')

    # `remain` lists exactly the keys of `links`; walking it avoids mutating
    # the dict while iterating over it.
    for lang in remain:
        candidates = links[lang]
        if not candidates:
            # Guarded like the log call above so a missing logger does not
            # turn a skipped page into an AttributeError.
            if self.log:
                self.log.error("couldn't index this page | %s" % response.url)
            return None
        links[lang] = candidates[0]

    request = Request(links['ja'], callback=self.parse_local_page)
    request.meta['remain'] = remain
    request.meta['links'] = links
    request.meta['item'] = item
    return request
def parse_local_page(self, response):
    """Scrape one localized version of a place page into suffixed item fields.

    ``response.meta['remain']`` holds the languages still to be processed;
    its first entry is the language of ``response``. Each scraped field is
    stored on the item under ``<field>_<lang>``.

    Returns a request for the next queued language (carrying ``remain``,
    ``links`` and ``item`` in its meta), or the item itself once no
    language is left.
    """
    queue = response.meta['remain']
    current, remain = queue[0], queue[1:]
    sel = Selector(response)
    item = response.meta['item']

    # (item key template, xpath expression) for every list-valued field.
    localized_fields = (
        ('name_%s', helper.SEL_HOTEL_NAME),
        ('address_area_name_%s', helper.SEL_AREA_NAME),
        ('address_street_%s', helper.SEL_AREA_STREET),
        ('address_locality_%s', helper.SEL_AREA_LOCALITY),
        ('address_region_%s', helper.SEL_AREA_REGION),
        ('address_zip_%s', helper.SEL_AREA_ZIP),
        ('amenity_%s', helper.SEL_AMENITIES),
    )
    for key_template, query in localized_fields:
        item[key_template % current] = sel.xpath(query).extract()
    item['page_body_%s' % current] = helper.get_body(sel)

    # Last language done: hand the finished item back.
    if not remain:
        return item

    from ghost_spider.settings import REQUEST_HEADERS
    links = response.meta['links']
    next_lang = remain[0]
    request = Request(
        links[next_lang],
        headers=REQUEST_HEADERS[next_lang],
        callback=self.parse_local_page,
        errback=self.parse_err
    )
    request.meta.update({'remain': remain, 'links': links, 'item': item})
    return request