def dump_hotel(cls, name, action='normal'):
    """Dump hotel documents for one prefecture from Elasticsearch to a file.

    Pages through LocationHotelEs (100 docs at a time, sorted by area) for the
    given prefecture and appends each place to a file via a cls.save_* helper.

    :param name: prefecture name (ascii) used both in the ES term filter and
        to derive the output filename.
    :param action: one of 'normal' (default), 'recover', or 'production';
        selects extra query filters, the output filename suffix, and the
        save routine.
    """
    # Local imports — presumably deferred to avoid import cost/cycles at
    # module load time (NOTE(review): confirm).
    from ghost_spider.elastic import LocationHotelEs, LatteHotelEs
    from ghost_spider import progressbar
    filename = cls.get_filename_by_name(name)
    # Base query: match the prefecture; must_not is filled in only for
    # the 'production' action below.
    query = {"query": {"bool": {"must": [{"term": {"prefecture_ascii": name}}], "must_not": []}}}
    if action == 'recover':
        # Only docs flagged recovered="1"; write to a "<name>_recover" file.
        query["query"]["bool"]["must"].append({"term": {"recovered": "1"}})
        filename = cls.get_filename_by_name(u'%s_recover' % name)
    elif action == 'production':
        # Production export: only version 10 docs, excluding love hotels.
        filename = cls.get_filename_by_name(u'%s_production' % name)
        query["query"]["bool"]["must"].append({"term": {"version": 10}})
        query["query"]["bool"]["must_not"].append({"term": {"genre": u'ラブホテル'}})
    # Start from a clean file — the save helpers append per place.
    if os.path.exists(filename):
        os.remove(filename)
    progress = None
    total = 0
    page = 1
    limit = 100  # ES page size per pager() call
    sort = [{"area.untouched": "asc"}]
    # Production rows get a dedicated serializer; everything else goes to CSV.
    save_data_to_file = cls.save_for_production if action == u'production' else cls.save_to_csv
    print "=" * 100
    while True:
        places, total = LocationHotelEs.pager(query=query, page=page, size=limit, sort=sort)
        page += 1
        if not places or not len(places):
            break
        if not progress:
            # First page: now that the total hit count is known, build the bar.
            print "Dumping data for %s (%s)" % (name, total)
            progress = progressbar.AnimatedProgressBar(end=total, width=100)
        # NOTE(review): AnimatedProgressBar appears to overload __add__ to
        # advance the bar by `limit`; the bare expression is intentional —
        # confirm against ghost_spider.progressbar.
        progress + limit
        progress.show_progress()
        for place in places:
            # Cross-reference the latte index by name to attach its URL.
            result = LatteHotelEs.get_place_by_name(place.get('name'))
            if result["hits"]["total"] > 0:
                place["latte_url"] = result["hits"]["hits"][0]["_source"]["url"]
            if action == 'normal':
                # Normalize the hotel kind: keep an already-known kind,
                # otherwise map the first recognizable genre through
                # REPLACE_HOTEL; fall back to the generic "hotel" label.
                hotel_kind = u'ホテル'
                if place.get('kind') and place.get('kind') in LocationHotelSelectors.REPLACE_HOTEL:
                    # NOTE(review): this keeps the raw kind rather than
                    # REPLACE_HOTEL[kind] (unlike the genre branch below) —
                    # looks deliberate, but worth confirming.
                    hotel_kind = place.get('kind')
                else:
                    for genre in place['genre']:
                        if genre in LocationHotelSelectors.REPLACE_HOTEL:
                            hotel_kind = LocationHotelSelectors.REPLACE_HOTEL[genre]
                            break
                place['kind'] = hotel_kind
            save_data_to_file(filename, place)
    print " "
def parse(self, response):
    """Scrapy callback for a hotel list page.

    Extracts salon detail links and the next-page URL, tracks the total
    result count on first pages, and yields follow-up Requests — one per
    unseen salon page plus one for the next list page. In scan mode it
    only counts and yields nothing.
    """
    sel = Selector(response)
    links = sel.xpath(LocationHotelSelectors.LIST_SALONS).extract()
    next_page = self.get_property(sel, LocationHotelSelectors.NEXT_URL)
    print u'links: %s, %s' % (len(links), response.url)
    if len(links) <= 0:
        # An empty list page is unexpected — record it for later inspection.
        self.log_message(u'links: %s, %s' % (len(links), response.url))
    if LocationHotelSelectors.is_first_page(sel):
        # Only first pages carry the area's total hit count; accumulate it.
        total = LocationHotelSelectors.get_list_total(sel)
        self.total += total
        if total > 999:
            # Yahoo search cannot paginate beyond 1000 items, so this area
            # must be re-crawled split into smaller sub-areas.
            self.log_message(u'Pagination overflow: %s' % response.url)
    # NOTE(review): source was recovered from a flattened line; the two
    # checks below are assumed to be at function level (not nested inside
    # the first-page branch) — confirm against history.
    if self.start_urls[-1] == response.url:
        # Last seed URL reached: report the grand total counted so far.
        self.log_message(u'Counted this many places: %s' % self.total)
    if self.scan_mode:
        # Scan mode: count results only, crawl nothing.
        return
    if links:
        for link in links:
            # Strip the query string before the dedup lookup.
            canonical = link.split('?')[0]
            if LocationHotelEs.check_by_url(canonical):
                # Already indexed — skip re-crawling this salon.
                continue
            request = Request(link, callback=self.parse_salon, errback=self.parse_err)
            request.meta['page_kind'] = 'salon'
            yield request
    if next_page:
        # Continue through the paginated list with this same callback.
        request = Request(next_page, callback=self.parse, errback=self.parse_err)
        request.meta['page_kind'] = 'list'
        yield request