Пример #1
0
 def parse_local_page(self, response):
   """Record one localized version of a place and move on to the next language.

   The language handled by this call is the first entry of
   ``response.meta['remain']``. The fields extracted from the page are stored
   as a dict and appended to ``item['place']``.

   Returns a new ``Request`` for the next pending language (carrying the
   remaining languages, the link table and the item in its meta), or the
   item itself once no language is left.
   """
   meta = response.meta
   pending = meta['remain']
   lang, remain = pending[0], pending[1:]
   page = Selector(response)
   item = meta['item']

   # (output key, xpath expression) pairs, extracted in this order.
   xpath_fields = (
     ('name', helper.SEL_HOTEL_NAME),
     ('address_area_name', helper.SEL_AREA_NAME),
     ('address_street', helper.SEL_AREA_STREET),
     ('address_locality', helper.SEL_AREA_LOCALITY),
     ('address_region', helper.SEL_AREA_REGION),
     ('address_zip', helper.SEL_AREA_ZIP),
     ('amenity', helper.SEL_AMENITIES),
   )
   localized = {'lang': lang}
   for key, xpath in xpath_fields:
     localized[key] = page.xpath(xpath).extract()
   localized['page_body'] = helper.get_body(page)
   item['place'].append(localized)

   # Nothing left to fetch: the item is complete.
   if not remain:
     return item

   from ghost_spider.settings import REQUEST_HEADERS
   upcoming = remain[0]
   follow_up = Request(
     meta['links'][upcoming],
     headers=REQUEST_HEADERS[upcoming],
     callback=self.parse_local_page,
     errback=self.parse_err
   )
   follow_up.meta['remain'] = remain
   follow_up.meta['links'] = meta['links']
   follow_up.meta['item'] = item
   return follow_up
Пример #2
0
  def parse_place(self, response):
    """Parse a place page and start fetching its localized versions.

    Fills a ``HotelItem`` with the fields extracted from ``response`` and
    collects the links to the Spanish, Japanese and Chinese versions of the
    page.

    Returns a ``Request`` for the Japanese page whose meta carries the
    languages still to visit (``['ja', 'es', 'zh']``), the link table and the
    item. Returns ``None`` when any of the language links is missing, since
    the chain of localized requests could not be completed.
    """
    area_name = response.meta.get('area_name')
    if area_name:
      # The fallback must be applied to the level, not to the product:
      # "*" binds tighter than "or", so the unparenthesized form raised a
      # TypeError whenever 'area_level' was absent.
      depth = response.meta.get('area_level') or 1
      # Single-argument print(...) also works as a Python 2 print statement.
      print("%s> %s" % ("-----" * depth, area_name))
    sel = Selector(response)
    item = HotelItem()
    item['page_url'] = response.url
    item['page_breadcrumbs'] = sel.xpath(helper.SEL_BREADCRUMBS).extract()
    item['name'] = sel.xpath(helper.SEL_HOTEL_NAME).extract()
    item['phone'] = sel.xpath(helper.SEL_PHONE_NUMBER).extract()
    item['address_area_name'] = sel.xpath(helper.SEL_AREA_NAME).extract()
    item['address_street'] = sel.xpath(helper.SEL_AREA_STREET).extract()
    item['address_locality'] = sel.xpath(helper.SEL_AREA_LOCALITY).extract()
    item['address_region'] = sel.xpath(helper.SEL_AREA_REGION).extract()
    item['address_zip'] = sel.xpath(helper.SEL_AREA_ZIP).extract()
    item['amenity'] = sel.xpath(helper.SEL_AMENITIES).extract()
    item['rating'] = sel.xpath(helper.SEL_RATING).re(r'(.*)\s*of 5')
    item['popularity'] = sel.xpath(helper.SEL_PERCENT).re(r'(.*)\s*%')
    item['page_body'] = helper.get_body(sel)
    links = {
      'es': sel.xpath(helper.SEL_SPANISH_PAGE).extract(),
      'ja': sel.xpath(helper.SEL_JAPANESE_PAGE).extract(),
      'zh': sel.xpath(helper.SEL_CHINESE_PAGE).extract()
    }

    # Keep only the first match per language. Iterate over a snapshot so the
    # dict can be updated inside the loop.
    for name, candidates in list(links.items()):
      if not candidates:
        # An empty extraction used to raise IndexError on candidates[0];
        # report the page and skip it instead.
        print("couldn't index this page | %s" % response.url)
        return None
      links[name] = candidates[0]
    request = Request(links['ja'], callback=self.parse_local_page)
    request.meta['remain'] = ['ja', 'es', 'zh']
    request.meta['links'] = links
    request.meta['item'] = item
    return request
Пример #3
0
  def parse_place(self, response):
    """Parse hotel/restaurant/spot page.

    Builds a ``HotelItem`` from the page, stores the English place data as
    the first entry of ``item['place']`` and collects links to the localized
    versions of the page: Japanese always, plus French or Spanish depending
    on ``need_french_page`` / ``need_spanish_page`` applied to the
    breadcrumbs.

    Returns a ``Request`` for the Japanese page whose meta carries the
    languages still to visit, the link table and the item. Returns ``None``
    when one of the required language links is missing.
    """
    # NOTE(review): self.log is used here as an object exposing msg(), error()
    # and INFO -- confirm what it is bound to on the spider.
    area_name = response.meta.get('area_name')
    if area_name and self.log:
      # The fallback must be applied to the level, not to the product:
      # "*" binds tighter than "or", so the unparenthesized form raised a
      # TypeError whenever 'area_level' was absent.
      depth = response.meta.get('area_level') or 1
      self.log.msg(u'%s> %s' % ("-----" * depth, area_name), level=self.log.INFO)
    sel = Selector(response)
    item = HotelItem()

    item['page_url'] = response.url
    item['page_breadcrumbs'] = sel.xpath(helper.SEL_BREADCRUMBS).extract()
    item['name'] = sel.xpath(helper.SEL_HOTEL_NAME).extract()
    item['phone'] = sel.xpath(helper.SEL_PHONE_NUMBER).extract()
    item['rating'] = sel.xpath(helper.SEL_RATING).re(r'(.*)\s*of 5')
    item['popularity'] = sel.xpath(helper.SEL_PERCENT).re(r'(.*)\s*%')
    item['region'] = sel.xpath(helper.SEL_AREA_REGION).extract()
    place = {
      'lang': 'en',
      'name': item['name'],
      'address_area_name': sel.xpath(helper.SEL_AREA_NAME).extract(),
      'address_street': sel.xpath(helper.SEL_AREA_STREET).extract(),
      'address_locality': sel.xpath(helper.SEL_AREA_LOCALITY).extract(),
      'address_region': sel.xpath(helper.SEL_AREA_REGION).extract(),
      'address_zip': sel.xpath(helper.SEL_AREA_ZIP).extract(),
      'amenity': sel.xpath(helper.SEL_AMENITIES).extract(),
      'page_body': helper.get_body(sel)
    }
    # save list of places by language
    item['place'] = [place]

    links = {
      'ja': sel.xpath(helper.SEL_JAPANESE_PAGE).extract(),
    }
    remain = ['ja']
    if self.need_french_page(item['page_breadcrumbs']):
      links['fr'] = sel.xpath(helper.SEL_FRENCH_PAGE).extract()
      remain.append('fr')
    elif self.need_spanish_page(item['page_breadcrumbs']):
      links['es'] = sel.xpath(helper.SEL_SPANISH_PAGE).extract()
      remain.append('es')

    # Keep only the first match per language. Iterate over a snapshot so the
    # dict can be updated inside the loop.
    for name, candidates in list(links.items()):
      if not candidates:
        # Guard the logger the same way as the info message above, so a
        # missing logger cannot turn a skipped page into a crash.
        if self.log:
          self.log.error("couldn't index this page | %s" % response.url)
        return None
      links[name] = candidates[0]
    request = Request(links['ja'], callback=self.parse_local_page)
    request.meta['remain'] = remain
    request.meta['links'] = links
    request.meta['item'] = item
    return request
Пример #4
0
 def parse_local_page(self, response):
   """Store one localized version of a place on the item, then chain onward.

   The language handled by this call is the first entry of
   ``response.meta['remain']``. Each extracted field is written to the item
   under a key suffixed with that language code (for example ``name_ja``).

   Returns a new ``Request`` for the next pending language (carrying the
   remaining languages, the link table and the item in its meta), or the
   item itself once no language is left.
   """
   meta = response.meta
   pending = meta['remain']
   current, remain = pending[0], pending[1:]
   page = Selector(response)
   item = meta['item']

   # (item key template, xpath expression) pairs, extracted in this order.
   field_xpaths = (
     ('name_%s', helper.SEL_HOTEL_NAME),
     ('address_area_name_%s', helper.SEL_AREA_NAME),
     ('address_street_%s', helper.SEL_AREA_STREET),
     ('address_locality_%s', helper.SEL_AREA_LOCALITY),
     ('address_region_%s', helper.SEL_AREA_REGION),
     ('address_zip_%s', helper.SEL_AREA_ZIP),
     ('amenity_%s', helper.SEL_AMENITIES),
   )
   for key_template, xpath in field_xpaths:
     item[key_template % current] = page.xpath(xpath).extract()
   item['page_body_%s' % current] = helper.get_body(page)

   # Nothing left to fetch: the item is complete.
   if not remain:
     return item

   from ghost_spider.settings import REQUEST_HEADERS
   upcoming = remain[0]
   follow_up = Request(
     meta['links'][upcoming],
     headers=REQUEST_HEADERS[upcoming],
     callback=self.parse_local_page,
     errback=self.parse_err
   )
   follow_up.meta['remain'] = remain
   follow_up.meta['links'] = meta['links']
   follow_up.meta['item'] = item
   return follow_up