示例#1
0
 def test_fetch(self, mock_urllib2):
     """Queue 100 fetch tasks into a 3-worker pool, wait for drain, stop it."""
     config = crawl_helper.FetcherConfig(base_url="testurl")
     throttler = crawl_helper.Throttler(5, 1)
     pool = crawl_helper.FetcherPool(size=3, throttlers=[throttler])
     tasks = [
         crawl_helper.FetchTask(
             config=config, process_response=self.process_response)
         for _ in range(100)
     ]
     for task in tasks:
         pool.queue.put(task)
     # Poll until the queue drains, then give in-flight workers a moment
     # to finish before shutting the pool down.
     while not pool.queue.empty():
         time.sleep(0.1)
     time.sleep(2)
     pool.stop()
示例#2
0
def enqueue_followon_request(response, context):
    """
    Enqueue a follow-on request when "moreResultsAvailable" is in response.

    Extracts the pagination keys from the HotelListResponse section of
    the response and puts a new FetchTask on the shared fetcher pool.
    """
    # Hoist the nested section lookup so each key reads from one place.
    hotel_list = response.get("HotelListResponse", {})
    params = {
        "cachekey": hotel_list.get("cacheKey", ""),
        "cachelocation": hotel_list.get("cacheLocation", ""),
        "customersessionid": hotel_list.get("customerSessionId", ""),
    }
    fetcher_pool.queue.put(
        crawl_helper.FetchTask(
            config=config_list,
            context=context,
            params=params,
            process_response=handle_hotel_list,
        )
    )
示例#3
0
def enqueue_request(hotel_ids, date_from, date_to):
    """
    Enqueue a single request; dates passed as strings.

    hotel_ids is an iterable of ids joined into a comma-separated list;
    date_from/date_to are forwarded unchanged in both params and context.
    """
    date_window = {
        "arrivalDate": date_from,
        "departureDate": date_to,
    }
    params = dict(date_window)
    params["hotelIdList"] = ",".join(str(hotel_id) for hotel_id in hotel_ids)
    fetcher_pool.queue.put(
        crawl_helper.FetchTask(
            config=config_list,
            context=date_window,
            params=params,
            process_response=handle_hotel_list,
        )
    )
示例#4
0
def handle_places_response(response, context):
    """
    Handler - response to places search.

    Stores the returned places in db.hotel_pois keyed by hotelId, merging
    with any existing record, and — when the response carries a
    next_page_token — enqueues a follow-up FetchTask for the next page.

    :param response: parsed places-search response dict; must contain
        "results", may contain "next_page_token".
    :param context: dict with "hotelId", "latitude", "longitude".
    """
    next_page_token = response.get("next_page_token")
    places = response["results"]
    # Build the document to persist; do not rebind the `response`
    # parameter (the original shadowed it, which hurt readability).
    doc = {
        "hotelId": context["hotelId"],
        "places": places,
    }
    try:
        rec = db.hotel_pois.find_one({"hotelId": doc["hotelId"]})
        if not rec:
            logger.debug("inserting hotel id %s" % (doc["hotelId"]))
            db.hotel_pois.insert(doc)
        else:
            logger.debug("updating hotel id %s" % (doc["hotelId"]))
            # Append the new page of places to the stored ones.
            places = rec["places"] + places
            db.hotel_pois.update(
                    {"hotelId": rec["hotelId"]},
                    {"$set": {"places": places}},
                    )
    except pymongo.errors.PyMongoError as e:
        # BUG FIX: PyMongoError has no `.reason` attribute, so the old
        # `e.reason` raised AttributeError inside the handler and masked
        # the real database error. Format the exception itself instead.
        logger.error("Failure %s" % (e,))
    if next_page_token:
        params = {
                "location": "%s,%s" % (
                    str(context["latitude"]), str(context["longitude"])),
                "pagetoken": next_page_token,
                }
        task = crawl_helper.FetchTask(
                config=config_places,
                context=context,
                params=params,
                process_response=handle_places_response,
                )
        fetcher_pool.queue.put(task)
示例#5
0
def handle_places_response(response, context):
    """
    Handler - response to places search.

    Stores the returned items in db.hotel_pois_nokia keyed by hotelId,
    merging with any existing record, and — when the response carries a
    "next" link — enqueues a follow-up FetchTask for the next page.

    :param response: parsed places-search response dict; "results" must
        contain "items" and may contain "next".
    :param context: dict with at least "hotelId".
    """
    results = response["results"]
    next_page = results.get("next")
    places = results["items"]
    # Build the document to persist; do not rebind the `response`
    # parameter (the original shadowed it, which hurt readability).
    doc = {
        "hotelId": context["hotelId"],
        "places": places,
    }
    try:
        rec = db.hotel_pois_nokia.find_one({"hotelId": doc["hotelId"]})
        if not rec:
            logger.debug("inserting hotel id %s" % (doc["hotelId"]))
            db.hotel_pois_nokia.insert(doc)
        else:
            logger.debug("updating hotel id %s" % (doc["hotelId"]))
            # Append the new page of places to the stored ones.
            places = rec["places"] + places
            db.hotel_pois_nokia.update(
                {"hotelId": rec["hotelId"]},
                {"$set": {
                    "places": places
                }},
            )
    except pymongo.errors.PyMongoError as e:
        # BUG FIX: PyMongoError has no `.reason` attribute, so the old
        # `e.reason` raised AttributeError inside the handler and masked
        # the real database error. Format the exception itself instead.
        logger.error("Failure %s" % (e,))
    if next_page:
        params, params_no_encode = get_next_page_params(next_page, context)
        task = crawl_helper.FetchTask(
            config=config_places,
            context=context,
            params=params,
            params_no_encode=params_no_encode,
            process_response=handle_places_response,
        )
        fetcher_pool.queue.put(task)
示例#6
0
    # Interior of a larger function (its `def` lies above this view):
    # 1. Queue one text-search FetchTask per hotel row from the DB cursor.
    rows = cursor.fetchall()
    logger.info("[X] Searching %d hotels" % len(rows))
    for r in rows:
        hotel_id, name, city, country = r
        # Strip non-ASCII characters from the hotel name — presumably the
        # search API requires ASCII-safe queries; TODO confirm.
        name = remove_non_ascii(name)
        context = {
            'hotel_id': hotel_id,
            'hotel_name': name,
        }
        # Free-text query: "<name> <city> <country>".
        search_param = name + " " + city + " " + country
        params = {
            "q": search_param,
        }
        task = crawl_helper.FetchTask(
            config=config_search,
            context=context,
            params=params,
            process_response=handle_hotel_search,
        )
        fetcher_pool.queue.put(task)
    cursor.close()

    # 2. Wait till queue is drained

    step_wait()

    # 3. Stop the pool

    logger.info("[X] Stopping fetcher pool")
    fetcher_pool.stop()
示例#7
0
    ]
    # Interior of a larger function (its `def` and the opening of the
    # `hotels` list lie above this view):
    # 1. Queue one places search per hotel that has coordinates and no
    #    existing record in hotel_pois_nokia.
    for h in hotels:
        # Skip hotels already crawled.
        if db.hotel_pois_nokia.find_one({"hotelId": h["hotelId"]}):
            continue
        if all(k in h for k in ("latitude", "longitude")):
            params = dict(size=200)
            # Raw (non-URL-encoded) param: 5 km radius around the hotel.
            params_no_encode = "in=%s,%s;r=5000" % (str(
                h["latitude"]), str(h["longitude"]))
            context = {
                "hotelId": h["hotelId"],
                "latitude": h["latitude"],
                "longitude": h["longitude"],
            }
            task = crawl_helper.FetchTask(
                config=config_places,
                context=context,
                params=params,
                params_no_encode=params_no_encode,
                process_response=handle_places_response,
            )
            fetcher_pool.queue.put(task)

# 2. Wait till queue is drained

    step_wait()

    # 3. Stop the pool

    logger.info("[X] Stopping fetcher pool")
    fetcher_pool.stop()
示例#8
0
        "Phuket, Thailand",
        "San Diego, California",
        "Goa, India",
    ]

    # 1. Fetch hotels and then descriptions for the hotels

    logger.info("[X] Starting fetch hotels...")
    for d in dests:
        params = {
            "destinationString": d,
            "supplierCacheTolerance": "MED_ENHANCED",
        }
        task = crawl_helper.FetchTask(
            config=config_list,
            params=params,
            process_response=handle_hotel_list,
        )
        fetcher_pool.queue.put(task)

# 2. Wait till queue is drained

    step_wait()

    # 3. Fetch descriptions
    logger.info("[X] Starting fetch descriptions...")
    hotels = db.hotels.find({}, {"hotelId": 1})
    for h in hotels:
        try:
            if db.hotel_descs.find_one({"@hotelId": str(h["hotelId"])}):
                continue