예제 #1
0
  def parse(self, response):
    """Handle one listing page of salons.

    Yields a Request (callback ``parse_salon``) for every extracted salon
    link whose query-stripped URL is not reported by
    ``SalonEs.check_by_url``; links that are reported are counted in
    ``self.count_skip`` and printed as skipped. Afterwards yields a Request
    back into ``parse`` for the next listing page when a next-page URL was
    found. Every request carries a ``page_kind`` meta entry and uses
    ``self.parse_err`` as errback.
    """
    sel = Selector(response)
    salon_urls = sel.xpath(SalonSelectors.LIST_SALONS).extract()
    following_page = self.get_property(sel, SalonSelectors.NEXT_URL)
    print(u'links: %s, %s' % (len(salon_urls), response.url))

    # yahoo search can not paginate beyond 1000 items, so an oversized
    # result set is only reported here; the crawler then needs to be run
    # for smaller areas. The total is looked up on the first page only.
    if SalonSelectors.is_first_page(sel) and SalonSelectors.get_list_total(sel) > 999:
      self.log_message(u'Pagination overflow: %s' % response.url)

    for salon_url in salon_urls:
      # The lookup key is the link without its query string.
      lookup_url = salon_url.partition('?')[0]
      if not SalonEs.check_by_url(lookup_url):
        yield Request(
            salon_url,
            callback=self.parse_salon,
            errback=self.parse_err,
            meta={'page_kind': 'salon'})
      else:
        self.count_skip += 1
        print(u'%s: skipped: %s' % (self.count_skip, salon_url))

    if following_page:
      yield Request(
          following_page,
          callback=self.parse,
          errback=self.parse_err,
          meta={'page_kind': 'list'})
예제 #2
0
  def parse_salon(self, response):
    """Populate and return a SalonItem from a salon detail page.

    Plain properties are read with ``self.get_property``; the remaining
    fields come from the ``SalonSelectors.get_*`` helpers, called in a fixed
    order. Increments ``self.count`` and prints a progress line before
    returning the item.
    """
    sel = Selector(response)
    salon = SalonItem()

    canonical_url = self.get_property(sel, SalonSelectors.CANONICAL_URL)
    # Fall back to the fetched URL when no canonical URL was extracted.
    salon['page_url'] = canonical_url if canonical_url else response.url
    for field, selector in (
        ('name', SalonSelectors.NAME),
        ('name_kata', SalonSelectors.NAME_KATA),
    ):
      salon[field] = self.get_property(sel, selector)
    salon['address'] = self.get_property(sel, SalonSelectors.ADDRESS, clean=True)

    # Fields filled by a single-value helper that takes only the selector.
    # NOTE: 'holydays' (sic) is the item key used at runtime; keep it as is.
    for field, extract in (
        ('routes', SalonSelectors.get_routes),
        ('phone', SalonSelectors.get_phone),
        ('working_hours', SalonSelectors.get_working_hours),
        ('holydays', SalonSelectors.get_holidays),
        ('shop_url', SalonSelectors.get_shop_url),
    ):
      salon[field] = extract(sel)

    # get_credit_cards returns a (comment, cards) pair.
    salon['credit_cards_comment'], salon['credit_cards'] = SalonSelectors.get_credit_cards(sel)

    for field, extract in (
        ('seats', SalonSelectors.get_seats),
        ('stylist', SalonSelectors.get_stylist),
        ('parking', SalonSelectors.get_parking),
        ('cut_price', SalonSelectors.get_cut_price),
    ):
      salon[field] = extract(sel)

    # get_prefecture_area returns a (prefecture, area) pair.
    salon['prefecture'], salon['area'] = SalonSelectors.get_prefecture_area(sel)
    salon['page_body'] = SalonSelectors.get_body(sel)

    self.count += 1
    print(u'%s: %s > %s -> %s' % (self.count, salon['prefecture'], salon['area'], salon['name']))
    return salon