def test_fetch(self, mock_urllib2):
    """Push 100 fetch tasks through a 3-worker pool, drain it, stop it."""
    cfg = crawl_helper.FetcherConfig(base_url="testurl")
    # Single throttler: at most 5 requests per 1-second window.
    pool = crawl_helper.FetcherPool(
        size=3, throttlers=[crawl_helper.Throttler(5, 1)])
    for _ in range(100):
        pool.queue.put(crawl_helper.FetchTask(
            config=cfg, process_response=self.process_response))
    # Poll until the queue drains, then allow in-flight workers to finish.
    while not pool.queue.empty():
        time.sleep(0.1)
    time.sleep(2)
    pool.stop()
def enqueue_followon_request(response, context):
    """Queue a follow-on request when "moreResultsAvailable" is in response.

    Carries forward the cache key/location and customer session id from the
    previous HotelListResponse so the API serves the next page of results.
    """
    hotel_list = response.get("HotelListResponse", {})
    params = {
        "cachekey": hotel_list.get("cacheKey", ""),
        "cachelocation": hotel_list.get("cacheLocation", ""),
        "customersessionid": hotel_list.get("customerSessionId", ""),
    }
    fetcher_pool.queue.put(crawl_helper.FetchTask(
        config=config_list,
        context=context,
        params=params,
        process_response=handle_hotel_list,
    ))
def enqueue_request(hotel_ids, date_from, date_to):
    """Queue a single hotel-list request; dates are passed as strings."""
    id_list = ",".join(str(hid) for hid in hotel_ids)
    # Key order preserved in case params are URL-encoded in insertion order.
    params = {
        "hotelIdList": id_list,
        "arrivalDate": date_from,
        "departureDate": date_to,
    }
    # The date window rides along as context for the response handler.
    context = {
        "arrivalDate": date_from,
        "departureDate": date_to,
    }
    fetcher_pool.queue.put(crawl_helper.FetchTask(
        config=config_list,
        context=context,
        params=params,
        process_response=handle_hotel_list,
    ))
def handle_places_response(response, context):
    """Handle a places-search response for one hotel.

    Upserts the returned places into ``db.hotel_pois`` keyed by
    ``hotelId`` (appending to any existing record's place list), and
    queues a follow-up fetch when the response carries a
    ``next_page_token`` (paged results).

    :param response: decoded places-search response (dict with "results",
        optionally "next_page_token")
    :param context: per-task dict; must carry "hotelId", "latitude",
        "longitude"
    """
    next_page_token = response.get("next_page_token")
    places = response["results"]
    doc = {
        "hotelId": context["hotelId"],
        "places": places,
    }
    try:
        rec = db.hotel_pois.find_one({"hotelId": doc["hotelId"]})
        if not rec:
            logger.debug("inserting hotel id %s" % (doc["hotelId"]))
            db.hotel_pois.insert(doc)
        else:
            logger.debug("updating hotel id %s" % (doc["hotelId"]))
            # Merge: append the new page of places to the stored ones.
            places = rec["places"] + places
            db.hotel_pois.update(
                {"hotelId": rec["hotelId"]},
                {"$set": {"places": places}},
            )
    except pymongo.errors.PyMongoError as e:
        # BUG FIX: PyMongoError has no ``reason`` attribute, so the old
        # ``e.reason`` raised AttributeError inside the handler and masked
        # the original DB error. Logging ``e`` renders the real message.
        logger.error("Failure %s" % (e))
    if next_page_token:
        # More pages available: re-queue with the continuation token.
        params = {
            "location": "%s,%s" % (
                str(context["latitude"]), str(context["longitude"])),
            "pagetoken": next_page_token,
        }
        task = crawl_helper.FetchTask(
            config=config_places,
            context=context,
            params=params,
            process_response=handle_places_response,
        )
        fetcher_pool.queue.put(task)
def handle_places_response(response, context):
    """Handle a Nokia/HERE places-search response for one hotel.

    Upserts places into ``db.hotel_pois_nokia`` keyed by ``hotelId``
    (appending to an existing record's place list) and queues a
    follow-up fetch when the response carries a "next" page link.

    NOTE(review): this shadows the other ``handle_places_response``
    handler if both live in the same module — confirm only one is active.

    :param response: decoded response; ``response["results"]`` holds
        "items" and optionally a "next" page link
    :param context: per-task dict; must carry "hotelId"
    """
    results = response["results"]
    next_page = results.get("next")
    places = results["items"]
    doc = {
        "hotelId": context["hotelId"],
        "places": places,
    }
    try:
        rec = db.hotel_pois_nokia.find_one({"hotelId": doc["hotelId"]})
        if not rec:
            logger.debug("inserting hotel id %s" % (doc["hotelId"]))
            db.hotel_pois_nokia.insert(doc)
        else:
            logger.debug("updating hotel id %s" % (doc["hotelId"]))
            # Merge: append the new page of places to the stored ones.
            places = rec["places"] + places
            db.hotel_pois_nokia.update(
                {"hotelId": rec["hotelId"]},
                {"$set": {"places": places}},
            )
    except pymongo.errors.PyMongoError as e:
        # BUG FIX: PyMongoError has no ``reason`` attribute, so the old
        # ``e.reason`` raised AttributeError inside the handler and masked
        # the original DB error. Logging ``e`` renders the real message.
        logger.error("Failure %s" % (e))
    if next_page:
        # More pages: helper translates the "next" link into task params.
        params, params_no_encode = get_next_page_params(next_page, context)
        fetcher_pool.queue.put(crawl_helper.FetchTask(
            config=config_places,
            context=context,
            params=params,
            params_no_encode=params_no_encode,
            process_response=handle_places_response,
        ))
rows = cursor.fetchall()
logger.info("[X] Searching %d hotels" % len(rows))
# 1. Queue one search task per hotel: query is "<name> <city> <country>".
for hotel_id, name, city, country in rows:
    name = remove_non_ascii(name)
    task = crawl_helper.FetchTask(
        config=config_search,
        context={
            'hotel_id': hotel_id,
            'hotel_name': name,
        },
        params={"q": " ".join((name, city, country))},
        process_response=handle_hotel_search,
    )
    fetcher_pool.queue.put(task)
cursor.close()
# 2. Wait till queue is drained
step_wait()
# 3. Stop the pool
logger.info("[X] Stopping fetcher pool")
fetcher_pool.stop()
]
# Queue a places search for every hotel with coordinates that does not
# already have a cached POI record in hotel_pois_nokia.
for h in hotels:
    # Skip hotels whose POIs were already fetched.
    if db.hotel_pois_nokia.find_one({"hotelId": h["hotelId"]}):
        continue
    if all(k in h for k in ("latitude", "longitude")):
        params = dict(size=200)
        # Raw (un-encoded) spatial filter: 5000 m radius around the hotel.
        params_no_encode = "in=%s,%s;r=5000" % (str(
            h["latitude"]), str(h["longitude"]))
        context = {
            "hotelId": h["hotelId"],
            "latitude": h["latitude"],
            "longitude": h["longitude"],
        }
        task = crawl_helper.FetchTask(
            config=config_places,
            context=context,
            params=params,
            params_no_encode=params_no_encode,
            process_response=handle_places_response,
        )
        fetcher_pool.queue.put(task)
# 2. Wait till queue is drained
step_wait()
# 3. Stop the pool
logger.info("[X] Stopping fetcher pool")
fetcher_pool.stop()
"Phuket, Thailand", "San Diego, California", "Goa, India", ] # 1. Fetch hotels and then descriptions for the hotels logger.info("[X] Starting fetch hotels...") for d in dests: params = { "destinationString": d, "supplierCacheTolerance": "MED_ENHANCED", } task = crawl_helper.FetchTask( config=config_list, params=params, process_response=handle_hotel_list, ) fetcher_pool.queue.put(task) # 2. Wait till queue is drained step_wait() # 3. Fetch descriptions logger.info("[X] Starting fetch descriptions...") hotels = db.hotels.find({}, {"hotelId": 1}) for h in hotels: try: if db.hotel_descs.find_one({"@hotelId": str(h["hotelId"])}): continue