def __init__(self, concurrent_requests=128, download_delay=0, download_timeout=5,
             retry_on_timeout=False, queue_size=1024):
    """
    Crawler engine, the brain of this crawler.
    :param concurrent_requests: how many requests to handle simultaneously
    :param download_delay: delay between two batches of downloads, default is 0
    :param download_timeout: download timeout in seconds
    :param retry_on_timeout: when True, requests that fail on timeout are retried
    :param queue_size: the size of the requests and responses queues
    """
    self.logger = logging.getLogger(__name__)
    self.status = False
    self.concurrent_requests = concurrent_requests
    self.download_delay = download_delay
    self.engine_idle_timeout = 1.5 * download_timeout
    self.download_timeout = download_timeout
    self.retry_on_download_timeout = retry_on_timeout
    self._requests_queue = Queue(queue_size)
    self._responses_queue = Queue(queue_size)
    self._spiders = {}
    # Filter duplicate requests in the queue; a ScalableBloomFilter is used instead
    # of a set container so memory stays bounded as the crawl grows.
    self._seen = pybloom.ScalableBloomFilter()
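# A minimal sketch (not part of the original class) of how the `_seen` filter is
# typically consulted before enqueueing work; the method name `_schedule_request`
# and the string-URL request are assumptions for illustration.
def _schedule_request(self, url):
    # ScalableBloomFilter.add() records the element and returns True if it was
    # (probably) already present, so one call both tests and marks the URL.
    if self._seen.add(url):
        self.logger.debug("skipping duplicate request: %s", url)
        return False
    self._requests_queue.put(url)
    return True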
def domain_grab(urls, http_obj=None, pool_size=10, retries=5, proxy=None,
                delay=10, debug=True, queue_links=None):
    # Avoid a shared mutable default argument for the link queue.
    if queue_links is None:
        queue_links = UberIterator()
    if isinstance(urls, basestring):
        if '\n' in urls:
            urls = [url.strip() for url in urls.split('\n') if len(url.strip())]
        else:
            urls = [urls]
    domains = {urlparse.urlparse(url).netloc for url in urls}
    queue_links += urls
    seen_links = pybloom.ScalableBloomFilter(initial_capacity=100, error_rate=0.001,
                                             mode=pybloom.ScalableBloomFilter.SMALL_SET_GROWTH)
    # Add each seed URL individually, not the list as a single element.
    for url in urls:
        seen_links.add(url)
    while queue_links:
        if debug:
            progress_counter = 0
            progress_total = len(queue_links)
        for page in multi_grab(queue_links, http_obj=http_obj, pool_size=pool_size,
                               retries=retries, proxy=proxy, delay=delay):
            if debug:
                progress_counter += 1
                print 'Got %s, Link %s/%s (%s%%)' % (page.final_url, progress_counter, progress_total,
                                                     int((float(progress_counter) / progress_total) * 100))
            if urlparse.urlparse(page.final_url).netloc in domains:
                # Queue unseen internal links, skipping common binary/document extensions.
                new_links = {link for link in page.internal_links()
                             if link not in seen_links
                             and link.lower().split('.')[-1] not in ('jpg', 'gif', 'jpeg', 'pdf', 'doc', 'docx', 'ppt', 'txt')}
                queue_links += list(new_links)
                for link in new_links:
                    seen_links.add(link)
            yield page
        if debug:
            print 'Seen Links: %s' % len(seen_links)
            print 'Bloom Capacity: %s' % seen_links.capacity
            print 'Links in Queue: %s' % len(queue_links)
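# A usage sketch for domain_grab (not from the original source): it is a generator,
# so pages stream back as they are fetched while the bloom filter keeps already-seen
# internal links out of the queue. The URL is purely illustrative, and multi_grab /
# UberIterator are assumed to come from the same module.
for page in domain_grab('http://example.com', pool_size=5, debug=False):
    print page.final_url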
def __init__(self, name=None):
    if name and not name.endswith('.bloom'):
        name += '.bloom'
    self.name = name
    self.add_counter = 0
    try:
        # Load a previously persisted filter from disk if one exists.
        with open(self.name, 'rb') as f:
            self.bloom = pybloom.ScalableBloomFilter.fromfile(f)
    except (TypeError, IOError, OSError):
        # No usable file (or name is None): start with a fresh filter.
        self.bloom = pybloom.ScalableBloomFilter(
            initial_capacity=100,
            error_rate=0.001,
            mode=pybloom.ScalableBloomFilter.SMALL_SET_GROWTH)
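# A companion sketch (not in the original snippet) showing how such a file-backed
# filter would typically be written back to disk; the method name `save` is an
# assumption, but tofile()/fromfile() are pybloom's serialization calls.
def save(self):
    if self.name:
        with open(self.name, 'wb') as f:
            self.bloom.tofile(f)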
def __init__(self, black_patterns=(CONFIG_URLPATTERN_ALL,), white_patterns=("^http",), capacity=None):
    """
    constructor
    """
    self.re_black_list = [re.compile(_pattern, flags=re.IGNORECASE) for _pattern in black_patterns]
    self.re_white_list = [re.compile(_pattern, flags=re.IGNORECASE) for _pattern in white_patterns]

    # With a capacity, de-duplicate with a bloom filter; otherwise fall back to a plain set.
    self.url_set = set() if not capacity else None
    self.bloom_filter = pybloom.ScalableBloomFilter(initial_capacity=capacity, error_rate=0.001) if capacity else None
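# A hedged sketch of how the white/black pattern lists and the set-or-bloom-filter
# pair above might be combined; the method name `check_and_add` is hypothetical.
def check_and_add(self, url):
    # Reject URLs that miss every white-list pattern or hit any black-list pattern.
    if not any(p.search(url) for p in self.re_white_list):
        return False
    if any(p.search(url) for p in self.re_black_list):
        return False
    # De-duplicate with whichever container was configured in __init__.
    if self.bloom_filter is not None:
        return not self.bloom_filter.add(url)  # add() returns True if already present
    if url in self.url_set:
        return False
    self.url_set.add(url)
    return True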
def __init__(self, is_link_interesting, gui=False, timeout=5, **browser_kwargs):
    '''
    is_link_interesting(a_href, a_text): a function that looks at a link's target
        url and text, and returns True if the crawler should follow the link
    gui: True if you want to see the crawler
    timeout: how long to wait for the url to load and its JS to execute
    browser_kwargs: passed directly to the spynner module
    '''
    self.timeout = timeout
    self.is_link_interesting = is_link_interesting

    # Setup the browser
    self.download_dir_tmp = tempfile.mkdtemp(prefix='crawler_')
    browser_config = {
        'debug_level': spynner.WARNING,
        'download_directory': self.download_dir_tmp,
        'user_agent': 'Mozilla/5.0 (compatible; MSIE 9.0;'
                      ' Windows NT 6.1; Trident/5.0)',
    }
    browser_config.update(browser_kwargs)
    # Pass the merged config (defaults plus caller overrides), not just browser_kwargs.
    self.browser = spynner.browser.Browser(**browser_config)
    self.browser.set_html_parser(pyquery.PyQuery)
    if gui:
        self.browser.create_webview()
        self.browser.show()

    # Create the bloom filter used to remember visited URLs
    self.bloom_filter = pybloom.ScalableBloomFilter()

    # Create the queue of links still to crawl
    self.queue = Queue.Queue()
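# A usage sketch (the class name `Crawler` is a placeholder for whatever class owns
# the __init__ above): the callback receives the link target and its anchor text,
# per the docstring, and decides whether the crawler should follow it.
def interesting(a_href, a_text):
    return 'download' in (a_text or '').lower()

crawler = Crawler(interesting, gui=False, timeout=10)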
def __init__(self, start_items=10000, err_rate=0.0001):
    # Size the scalable filter from the constructor arguments; LARGE_SET_GROWTH
    # (mode 4) grows new internal filters faster, suiting large collections.
    self.bloom = pybloom.ScalableBloomFilter(
        initial_capacity=start_items,
        error_rate=err_rate,
        mode=pybloom.ScalableBloomFilter.LARGE_SET_GROWTH)
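# A small standalone sketch of the pybloom API the snippets above rely on:
# add() records an element and returns True if it was (probably) already present,
# `in` tests membership, and the filter grows automatically past initial_capacity.
import pybloom

seen = pybloom.ScalableBloomFilter(initial_capacity=100, error_rate=0.001)
for url in ('http://example.com/a', 'http://example.com/b', 'http://example.com/a'):
    if seen.add(url):
        print 'duplicate:', url
print 'http://example.com/b' in seen  # True
print len(seen)                       # roughly 2; false positives are possible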