예제 #1
0
  def dump_hotel(cls, name, action='normal'):
    """Dump the hotels of one prefecture from Elasticsearch into a file.

    Pages through ``LocationHotelEs`` (100 documents per page, sorted by
    ``area.untouched``) for documents whose ``prefecture_ascii`` term equals
    ``name``, skipping the genre listed in ``must_not``.  For every document
    a ``latte_url`` is attached when ``LatteHotelEs.get_place_by_name``
    reports at least one hit, and the record is passed to the writer chosen
    by ``action``.  An existing output file is removed first.

    Args:
      name: value matched against ``prefecture_ascii``; also used to build
        the output file name through ``cls.get_filename_by_name``.
      action: one of
        'normal' (default) -- no extra filter; ``place['kind']`` is
          normalised using ``LocationHotelSelectors.REPLACE_HOTEL``.
        'recover' -- adds the term ``recovered == "1"`` and writes to the
          ``<name>_recover`` file.
        'production' -- adds the term ``version == 10``, writes to the
          ``<name>_production`` file and uses ``cls.save_for_production``
          instead of ``cls.save_to_csv``.
    """
    from ghost_spider.elastic import LocationHotelEs, LatteHotelEs
    from ghost_spider import progressbar

    filename = cls.get_filename_by_name(name)
    query = {"query": {"bool": {"must": [{"term": {"prefecture_ascii": name}}], "must_not": []}}}

    if action == 'recover':
      query["query"]["bool"]["must"].append({"term": {"recovered": "1"}})
      filename = cls.get_filename_by_name(u'%s_recover' % name)
    elif action == 'production':
      filename = cls.get_filename_by_name(u'%s_production' % name)
      query["query"]["bool"]["must"].append({"term": {"version": 10}})

    query["query"]["bool"]["must_not"].append({"term": {"genre": u'ラブホテル'}})

    # Always start from a fresh file.
    if os.path.exists(filename):
      os.remove(filename)

    progress = None
    page = 1
    limit = 100
    sort = [{"area.untouched": "asc"}]

    save_data_to_file = cls.save_for_production if action == u'production' else cls.save_to_csv
    replace_hotel = LocationHotelSelectors.REPLACE_HOTEL

    # Single-argument call form prints identically on Python 2 and 3.
    print("=" * 100)
    while True:
      places, total = LocationHotelEs.pager(query=query, page=page, size=limit, sort=sort)
      page += 1
      if not places:
        break
      # Create the bar lazily: the total is only known after the first page.
      if progress is None:
        print("Dumping data for %s (%s)" % (name, total))
        progress = progressbar.AnimatedProgressBar(end=total, width=100)
      # NOTE(review): the result is discarded, so this presumably relies on
      # the bar's ``__add__`` advancing it in place -- confirm in progressbar.
      progress + limit
      progress.show_progress()
      for place in places:
        result = LatteHotelEs.get_place_by_name(place.get('name'))
        if result["hits"]["total"] > 0:
          place["latte_url"] = result["hits"]["hits"][0]["_source"]["url"]

        if action == 'normal':
          kind = place.get('kind')
          if kind and kind in replace_hotel:
            hotel_kind = kind
          else:
            hotel_kind = u'ホテル'
            # A missing or null genre must not abort the whole dump.
            for genre in place.get('genre') or ():
              if genre in replace_hotel:
                hotel_kind = replace_hotel[genre]
                break
          place['kind'] = hotel_kind
        save_data_to_file(filename, place)
    print(" ")
  def parse(self, response):
    """Handle a list page: report its links and schedule follow-up requests.

    Prints the number of extracted links together with the page URL, and
    also sends that message to ``log_message`` when no link was found.  On
    a first page (per ``LocationHotelSelectors.is_first_page``) the listing
    total is added to ``self.total``; an overflow message is logged when it
    exceeds 999, and the running total is logged when the page is the last
    entry of ``self.start_urls``.

    When ``self.scan_mode`` is set nothing is yielded.  Otherwise yields one
    ``Request`` (callback ``parse_salon``) per link for which
    ``LocationHotelEs.check_by_url`` is falsy on the query-less URL, then a
    ``Request`` (callback ``parse``) for the next list page when one exists.
    """
    selector = Selector(response)
    salon_links = selector.xpath(LocationHotelSelectors.LIST_SALONS).extract()
    next_url = self.get_property(selector, LocationHotelSelectors.NEXT_URL)

    summary = u'links: %s, %s' % (len(salon_links), response.url)
    print(summary)
    if not salon_links:
      self.log_message(summary)

    if LocationHotelSelectors.is_first_page(selector):
      listed = LocationHotelSelectors.get_list_total(selector)
      self.total += listed
      if listed > 999:
        # Yahoo search cannot paginate beyond 1000 items, so the crawler
        # has to be run again on smaller areas.
        self.log_message(u'Pagination overflow: %s' % response.url)
      if self.start_urls[-1] == response.url:
        self.log_message(u'Counted this many places: %s' % self.total)

    # Scan mode only counts; it never schedules further requests.
    if self.scan_mode:
      return

    for url in salon_links:
      # The lookup is done on the URL without its query string.
      if LocationHotelEs.check_by_url(url.split('?')[0]):
        continue
      salon_request = Request(url, callback=self.parse_salon, errback=self.parse_err)
      salon_request.meta['page_kind'] = 'salon'
      yield salon_request

    if next_url:
      list_request = Request(next_url, callback=self.parse, errback=self.parse_err)
      list_request.meta['page_kind'] = 'list'
      yield list_request