def test_throttle2(self, mock_urllib2):
    """Send 100 ``get()`` calls through a throttled requester with pauses.

    The test makes no assertions; it only exercises the request path.
    ``mock_urllib2`` is accepted but not referenced in the body
    (presumably supplied by a patch decorator outside this view -- confirm).
    """
    # Throttler(5, 1): presumably "5 requests per 1 second" -- confirm
    # against crawl_helper.Throttler.
    rate_limiter = crawl_helper.Throttler(5, 1)
    http = crawl_helper.HTTPRequester(base_url="testurl", throttler=rate_limiter)

    for _ in range(100):
        http.get()
        # NOTE(review): the pause is taken to be per-request; confirm
        # against the intended layout of this test.
        time.sleep(0.5)
def test_fetch(self, mock_urllib2):
    """Queue 100 fetch tasks on a pool created with ``size=3``, then stop it.

    Each task carries the same ``FetcherConfig`` and uses
    ``self.process_response`` as its response callback. The test makes no
    assertions of its own. ``mock_urllib2`` is accepted but not referenced
    in the body (presumably supplied by a patch decorator outside this
    view -- confirm).
    """
    fetcher_config = crawl_helper.FetcherConfig(base_url="testurl")
    # Throttler(5, 1): presumably "5 requests per 1 second" -- confirm.
    rate_limiter = crawl_helper.Throttler(5, 1)
    pool = crawl_helper.FetcherPool(size=3, throttlers=[rate_limiter])

    # Enqueue phase: one fresh FetchTask per iteration.
    for _ in range(100):
        task = crawl_helper.FetchTask(
            config=fetcher_config,
            process_response=self.process_response,
        )
        pool.queue.put(task)

    # Drain phase: poll every 100 ms until the queue reports empty.
    while not pool.queue.empty():
        time.sleep(0.1)

    # Fixed 2 s wait before shutdown -- presumably to let tasks already
    # taken off the queue finish; confirm against FetcherPool.
    time.sleep(2)
    pool.stop()
import logging logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s') import pymongo import time import crawl_helper logger = logging.getLogger('tripadvisor_reviews') # -- globals conn = pymongo.Connection("localhost", 27017) db = conn.hotelgenome throttler1 = crawl_helper.Throttler(5, 1) base_url = "http://www.tripadvisor.com/" base_params = {} config_search = crawl_helper.FetcherConfig( base_url=base_url + "Search", base_params=base_params, response_format=crawl_helper.ResponseFormat.SOUP) fetcher_pool = crawl_helper.FetcherPool(size=5, throttlers=[throttler1]) def handle_hotel_search(response, context): """
import os
import time
import urlparse

import crawl_helper

logger = logging.getLogger('foursquare_places')

# -- globals

# SECURITY: the MySQL password and the Foursquare client credentials used to
# be hard-coded only. Each can now be overridden through an environment
# variable:
#   EAN_MYSQL_PASSWORD, FOURSQUARE_CLIENT_ID, FOURSQUARE_CLIENT_SECRET
# The original literals remain as fallbacks so existing deployments keep
# working unchanged.
# TODO(review): rotate these credentials and drop the in-source fallbacks.

# MongoDB connection to the local server and the "hotelgenome" database.
# NOTE(review): pymongo.Connection was removed in PyMongo 3.0 in favour of
# MongoClient; left untouched because the installed version is not visible.
conn = pymongo.Connection("localhost", 27017)

# MySQL connection to the local "eanprod" database.
dbconn = MySQLdb.Connection(
    host="localhost",
    user="******",
    passwd=os.environ.get("EAN_MYSQL_PASSWORD", "ean!BogolTola"),
    db="eanprod",
)
db = conn.hotelgenome

# Throttler(5, 1) / Throttler(5000, 3600): presumably "5 per second" and
# "5000 per hour" -- confirm against crawl_helper.Throttler.
throttler1 = crawl_helper.Throttler(5, 1)
throttler2 = crawl_helper.Throttler(5000, 3600)

base_url = "https://api.foursquare.com/v2/venues/explore"
base_params = {
    "client_id": os.environ.get(
        "FOURSQUARE_CLIENT_ID",
        "Z2Q5CXUGN0BHXON2EC4PFT2ZF3DHY4ZHXT0XOP3FZTDZMYOC",
    ),
    "client_secret": os.environ.get(
        "FOURSQUARE_CLIENT_SECRET",
        "QGB4LUGNOZATFOBLQXX5CL3RFPKOGV4T3JLTDBHKSGHGWQ3B",
    ),
}

# Fetcher configuration for the venues/explore endpoint; JSON responses are
# requested via both the response format and the Accept header.
config_places = crawl_helper.FetcherConfig(
    base_url=base_url,
    base_params=base_params,
    response_format=crawl_helper.ResponseFormat.JSON,
    headers=dict(Accept="application/json"),
)
import logging
import os
# Logging is configured before the remaining imports, as in the original
# statement order.
logging.basicConfig(level=logging.DEBUG, format='%(levelname)s: %(message)s')
import pymongo
import time

import crawl_helper

logger = logging.getLogger('ean_hotel_desc')

# -- globals

# MongoDB connection to the local server and the "hotelgenome" database.
# NOTE(review): pymongo.Connection was removed in PyMongo 3.0 in favour of
# MongoClient; left untouched because the installed version is not visible.
conn = pymongo.Connection("localhost", 27017)
db = conn.hotelgenome

# Throttler(95000, 86400): presumably "95000 requests per 24 hours" --
# confirm against crawl_helper.Throttler.
throttler1 = crawl_helper.Throttler(95000, 86400)

base_url = "https://maps.googleapis.com/maps/api/place/search/json"

# SECURITY: the API key used to be hard-coded only. It can now be overridden
# through the GOOGLE_PLACES_API_KEY environment variable; the original
# literal remains as the fallback so existing deployments keep working.
# TODO(review): rotate this key and drop the in-source fallback.
base_params = {
    "key": os.environ.get(
        "GOOGLE_PLACES_API_KEY",
        "AIzaSyBxpxs48G5HgdS-yaAAYyLP1LL86pmslMQ",
    ),
    "radius": "2000",
    "sensor": "false",
}

# Fetcher configuration for the Places search endpoint with JSON responses.
config_places = crawl_helper.FetcherConfig(
    base_url=base_url,
    base_params=base_params,
    response_format=crawl_helper.ResponseFormat.JSON,
)