Example No. 1
def replicate_type():
  """Duplicate a type from type_from to type_to."""
  # build bulk
  from ghost_spider.elastic import LocationRestaurantEs
  from ghost_spider import progressbar
  size = 100
  page = 0
  progress = None
  total = 0
  while True:
    start_from = page * size
    results = LocationRestaurantEs.search({"query": {"match_all": {}}, "size": size, "from": start_from})
    if not progress:
      total = results["hits"]["total"]
      progress = progressbar.AnimatedProgressBar(end=total, width=100)

    if not results["hits"]["hits"]:
      break

    page += 1
    bulk = ""
    for result in results["hits"]["hits"]:
      data = result["_source"]
      bulk += LocationRestaurantEs.bulk_data(data, type_name="restaurants_back")
    progress + size  # AnimatedProgressBar overloads __add__, so this statement advances the bar
    progress.show_progress()
    LocationRestaurantEs.send(bulk)
  if progress:
    progress + total
    progress.show_progress()
  print "total %s" % total
Example No. 2
  def dump_restaurant(cls, name, action=None):
    """Dump every restaurant for the given prefecture to chunked CSV or production files."""
    from ghost_spider.elastic import LocationRestaurantEs, LatteRestaurantEs
    from ghost_spider.data import URL_TARGET_URLS, RST_KINDS_LATE_NOT_ALLOWED
    from ghost_spider import progressbar

    query = {"query": {"bool": {"must": [{"term": {"prefecture_ascii": name}}], "must_not": []}}}
    if action == 'production':
      query["query"]["bool"]["must"].append({"term": {"version": 10}})

    query["query"]["bool"]["must_not"].append({"terms": {"genre.untouched": RST_KINDS_LATE_NOT_ALLOWED.keys()}})

    progress = None
    total = 0
    page = 1
    limit = 100
    sort = [{"area.untouched": "asc"}]

    save_data_to_file = cls.save_for_production if action == u'production' else cls.save_to_csv

    print "=" * 100
    count_lines = 0

    while True:
      places, total = LocationRestaurantEs.pager(query=query, page=page, size=limit, sort=sort)
      page += 1
      if not places:
        break
      if not progress:
        print "Dumping data for %s (%s)" % (name, total)
        progress = progressbar.AnimatedProgressBar(end=total, width=100)
      progress + limit
      progress.show_progress()
      for place in places:
        result = LatteRestaurantEs.get_place_by_name(place.get('name'))
        if result["hits"]["total"] > 0:
          place["latte_url"] = result["hits"]["hits"][0]["_source"]["url"]
          place["latte_url"] = place["latte_url"].replace(URL_TARGET_URLS[0], URL_TARGET_URLS[1])

        place['kind'] = u'|'.join(place['kind'])
        # roll over to a new output file every 10000 rows
        if count_lines % 10000 == 0:
          count = (count_lines / 10000) + 1
          filename = cls.get_filename_by_name(name, count=count, remove_file=True)

        count_lines += 1
        save_data_to_file(filename, place)
    print " "
Example No. 3
  def parse(self, response):
    """Parse a listing page: queue detail-page requests, fan out per genre when the pager overflows, and follow the next listing page."""
    sel = Selector(response)
    links = sel.xpath(LocationHotelSelectors.LIST_SALONS).extract()
    next_page = self.get_property(sel, LocationHotelSelectors.NEXT_URL)
    print u'links: %s, %s' % (len(links), response.url)

    if not links:
      self.log_message(u'links: %s, %s' % (len(links), response.url))

    if LocationHotelSelectors.is_first_page(sel):
      total = LocationHotelSelectors.get_list_total(sel)
      self.total += total
      if total > 999:
        # Yahoo search cannot paginate beyond 1000 items,
        # so the crawler has to be re-run over smaller areas or categories
        page_cat = LocationHotelSelectors.get_category(sel)
        if page_cat and page_cat != "01":
          self.log_message(u'Pagination overflow: %s' % response.url)
        else:
          for category in GOURMET_CATEGORY:
            # keep next_page intact; build a separate URL per category
            category_url = response.url.replace('genrecd=01', 'genrecd=%s' % category)
            print u'new links --> %s' % category_url
            request = Request(category_url, callback=self.parse, errback=self.parse_err)
            request.meta['page_kind'] = 'list'
            yield request

      if self.start_urls[-1] == response.url:
        self.log_message(u'Counted this many places: %s' % self.total)

    if self.scan_mode:
      return

    if links:
      for link in links:
        canonical = link.split('?')[0]
        if LocationRestaurantEs.check_by_url(canonical):
          # print u'skipped: %s' % link
          continue
        request = Request(link, callback=self.parse_salon, errback=self.parse_err)
        request.meta['page_kind'] = 'salon'
        yield request

    if next_page:
      request = Request(next_page, callback=self.parse, errback=self.parse_err)
      request.meta['page_kind'] = 'list'
      yield request
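
When a listing reports more than 999 hits, parse works around Yahoo's 1000-item pagination cap by re-requesting the same URL once per genre code. The same fan-out, isolated into a small helper for clarity; the GOURMET_CATEGORY values below are stand-ins for the real mapping the spider imports.

# Isolated sketch of the pagination-overflow workaround used in parse():
# re-issue the listing URL once per genre code instead of paging past 1000.
# The category codes here are illustrative stand-ins.
from scrapy import Request

GOURMET_CATEGORY = ["02", "03", "04", "05"]

def split_overflowing_listing(response, callback, errback=None):
  """Yield one listing request per genre for a result set the pager cannot cover."""
  for category in GOURMET_CATEGORY:
    category_url = response.url.replace('genrecd=01', 'genrecd=%s' % category)
    request = Request(category_url, callback=callback, errback=errback)
    request.meta['page_kind'] = 'list'  # same marker parse() sets on listing pages
    yield request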
Example No. 4
def update_location():
  """Duplicate a type from type_from to type_to."""
  # build bulk
  from ghost_spider.elastic import LocationRestaurantEs
  from ghost_spider.helper import LocationHotelSelectors
  from ghost_spider import progressbar
  import re
  size = 100
  page = 0
  progress = None
  total = 0
  query = {"query": {"match_all": {}}, "size": size, "from": 0}
  query["sort"] = [{"name.untouched": "asc"}]
  while True:
    query["from"] = page * size
    results = LocationRestaurantEs.search(query)
    if not progress:
      total = results["hits"]["total"]
      print "total %s" % total
      progress = progressbar.AnimatedProgressBar(end=total, width=100)
    if not results["hits"]["hits"]:
      break

    page += 1
    bulk = ""
    for result in results["hits"]["hits"]:
      location = result["_source"]
      data_id = result["_id"]
      genre = []
      area_found = False
      for a in location["page_body"]["genre"]:
        codes = re.findall(r'genrecd=\d+', a)  # new name: "result" still refers to the outer search hit
        if u'genrecd=01' in codes:
          if not area_found:
            text = re.findall(r'>(.*)</', a)
            location['area'] = text[0]
            if location['area']:
              location['area_ascii'] = LocationRestaurantEs.analyze(location['area'], 'romaji_ascii_normal_analyzer')
            area_found = True
        else:
          text = re.findall(r'>(.*)</', a)
          if text and text[0]:
            genre.append(text[0])
      if not area_found and len(location["page_body"]["breadcrumbs"]) > 1:
        location['area'] = location["page_body"]["breadcrumbs"][-1]
        if location['area']:
          location['area_ascii'] = LocationRestaurantEs.analyze(location['area'], 'romaji_ascii_normal_analyzer')
      elif not area_found:
        location['area'] = ''
        location['area_ascii'] = ''
      kind = LocationHotelSelectors.convert_latte_kind(genre)
      location['genre'] = genre
      location['kind'] = kind
      bulk += LocationRestaurantEs.bulk_data(location, action="update", data_id=data_id)
    progress + size
    progress.show_progress()
    LocationRestaurantEs.send(bulk)
  if progress:
    progress + total
    progress.show_progress()
  print " "