예제 #1
0
 def test_throttle2(self, mock_urllib2):
     """Drive 100 GET calls through a throttled requester, pausing between calls.

     A ``Throttler(5, 1)`` is attached to an ``HTTPRequester`` for
     ``"testurl"``; each iteration calls ``get()`` and then sleeps 0.5 s.
     The test makes no assertions, so it only checks that nothing raises.

     ``mock_urllib2`` is not used in the body; presumably it is injected by
     a patch decorator above this method -- confirm against the full file.
     """
     limiter = crawl_helper.Throttler(5, 1)
     client = crawl_helper.HTTPRequester(throttler=limiter,
                                         base_url="testurl")
     calls_left = 100
     while calls_left > 0:
         client.get()
         # Fixed pause after every request, independent of the throttler.
         time.sleep(0.5)
         calls_left -= 1
예제 #2
0
 def test_fetch(self, mock_urllib2):
     """Queue 100 fetch tasks on a 3-worker pool and wait for the queue to drain.

     Each task shares one ``FetcherConfig`` for ``"testurl"`` and reports
     through ``self.process_response``. The test makes no assertions; it
     only checks that the pool processes the queue without raising.

     The pool is stopped in a ``finally`` clause so that it is shut down
     even when enqueueing or polling raises.

     ``mock_urllib2`` is not used in the body; presumably it is injected by
     a patch decorator above this method -- confirm against the full file.
     """
     config = crawl_helper.FetcherConfig(base_url="testurl")
     throttler1 = crawl_helper.Throttler(5, 1)
     fetcher_pool = crawl_helper.FetcherPool(size=3,
                                             throttlers=[throttler1])
     try:
         for _ in range(100):
             fetch_task = crawl_helper.FetchTask(
                 config=config, process_response=self.process_response)
             fetcher_pool.queue.put(fetch_task)
         # Poll until every task has been taken off the queue.
         while not fetcher_pool.queue.empty():
             time.sleep(0.1)
         # An empty queue does not mean the last tasks have finished, so
         # keep the original 2 s grace period before stopping the pool.
         time.sleep(2)
     finally:
         fetcher_pool.stop()
예제 #3
0
import logging
# Root logging is configured before the remaining imports are executed.
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

import pymongo
import time

import crawl_helper

# Module-level logger for this crawler script.
logger = logging.getLogger('tripadvisor_reviews')

# -- globals
# MongoDB handle, created at import time against localhost:27017.
# NOTE(review): pymongo.Connection was deprecated in PyMongo 2.4 and removed
# in 3.0 in favour of MongoClient -- confirm which PyMongo version is pinned.
conn = pymongo.Connection("localhost", 27017)
# The "hotelgenome" database on that connection.
db = conn.hotelgenome

# Presumably (max requests, period in seconds), i.e. 5 requests per second --
# confirm against crawl_helper.Throttler.
throttler1 = crawl_helper.Throttler(5, 1)

# Root URL; endpoint paths are appended to it below.
base_url = "http://www.tripadvisor.com/"

# No query parameters are shared across requests.
base_params = {}

# Fetcher configuration for the "Search" endpoint; the response format is
# set to ResponseFormat.SOUP.
config_search = crawl_helper.FetcherConfig(
    base_url=base_url + "Search",
    base_params=base_params,
    response_format=crawl_helper.ResponseFormat.SOUP)

# Pool of size 5 that is handed the single throttler above.
fetcher_pool = crawl_helper.FetcherPool(size=5, throttlers=[throttler1])


def handle_hotel_search(response, context):
    """
예제 #4
0
import os
import time
import urlparse

import crawl_helper

# Module-level logger for this crawler script.
logger = logging.getLogger('foursquare_places')

# -- globals
# NOTE(review): credentials used to be hard-coded only. They can now be
# overridden through environment variables; the original literals remain as
# defaults so existing deployments behave the same. Because those literals
# are committed to source control they should be rotated and the defaults
# removed once the environment variables are in place.

# MongoDB handle, created at import time against localhost:27017.
# NOTE(review): pymongo.Connection was deprecated in PyMongo 2.4 and removed
# in 3.0 in favour of MongoClient -- confirm which PyMongo version is pinned.
conn = pymongo.Connection("localhost", 27017)
# MySQL handle for the "eanprod" database.
dbconn = MySQLdb.Connection(
    host="localhost",
    user=os.environ.get("EAN_DB_USER", "******"),
    passwd=os.environ.get("EAN_DB_PASSWORD", "ean!BogolTola"),
    db="eanprod")
# The "hotelgenome" database on the MongoDB connection.
db = conn.hotelgenome

# Presumably (max requests, period in seconds): 5 per second and 5000 per
# hour -- confirm against crawl_helper.Throttler.
throttler1 = crawl_helper.Throttler(5, 1)
throttler2 = crawl_helper.Throttler(5000, 3600)

base_url = "https://api.foursquare.com/v2/venues/explore"

# Query parameters sent with every request: the Foursquare API credentials.
base_params = {
    "client_id": os.environ.get(
        "FOURSQUARE_CLIENT_ID",
        "Z2Q5CXUGN0BHXON2EC4PFT2ZF3DHY4ZHXT0XOP3FZTDZMYOC"),
    "client_secret": os.environ.get(
        "FOURSQUARE_CLIENT_SECRET",
        "QGB4LUGNOZATFOBLQXX5CL3RFPKOGV4T3JLTDBHKSGHGWQ3B"),
}

# Fetcher configuration for the venues/explore endpoint; responses are
# requested as JSON via both the response format and the Accept header.
config_places = crawl_helper.FetcherConfig(
    base_url=base_url,
    base_params=base_params,
    response_format=crawl_helper.ResponseFormat.JSON,
    headers=dict(Accept="application/json"),
)
예제 #5
0
import logging
# Root logging is configured before the remaining imports are executed,
# matching the original statement order.
logging.basicConfig(level=logging.DEBUG, format='%(levelname)s: %(message)s')

import os
import pymongo
import time

import crawl_helper


# NOTE(review): the logger name says "ean_hotel_desc" while this module talks
# to the Google Places search endpoint -- possibly a copy-paste leftover;
# confirm before renaming, since log filters may depend on it.
logger = logging.getLogger('ean_hotel_desc')

# -- globals
# MongoDB handle, created at import time against localhost:27017.
# NOTE(review): pymongo.Connection was deprecated in PyMongo 2.4 and removed
# in 3.0 in favour of MongoClient -- confirm which PyMongo version is pinned.
conn = pymongo.Connection("localhost", 27017)
# The "hotelgenome" database on that connection.
db = conn.hotelgenome

# Presumably (max requests, period in seconds), i.e. 95000 requests per day --
# confirm against crawl_helper.Throttler.
throttler1 = crawl_helper.Throttler(95000, 86400)

base_url = "https://maps.googleapis.com/maps/api/place/search/json"

# Query parameters sent with every request.
# NOTE(review): the API key used to be hard-coded only. It can now be
# overridden with GOOGLE_PLACES_API_KEY; the original literal remains as the
# default so existing deployments behave the same. Because it is committed
# to source control it should be rotated and the default removed.
base_params = {
    "key": os.environ.get("GOOGLE_PLACES_API_KEY",
                          "AIzaSyBxpxs48G5HgdS-yaAAYyLP1LL86pmslMQ"),
    "radius": "2000",
    "sensor": "false",
}

# Fetcher configuration for the Places search endpoint, with the response
# format set to ResponseFormat.JSON.
config_places = crawl_helper.FetcherConfig(
    base_url=base_url,
    base_params=base_params,
    response_format=crawl_helper.ResponseFormat.JSON
)