def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None, user_agent='wu_being', proxies=None, num_retries=1, max_threads=10, timeout=60):
    """Crawl using multiple threads
    """
    # the queue of URLs that still need to be crawled
    crawl_queue = MongoQueue()      ######################
    crawl_queue.clear()             ######################
    crawl_queue.push(seed_url)      ######################
    D = Downloader(cache=cache, delay=delay, user_agent=user_agent, proxies=proxies, num_retries=num_retries, timeout=timeout)

    def process_queue():
        while True:
            # keep track of the url currently being processed
            try:
                url = crawl_queue.pop()     ######################
            except KeyError:
                # currently no urls to process
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print('Error in callback for: {}: {}'.format(url, e))
                    else:
                        for link in links:                           #############
                            # add this new link to the queue         ######################
                            crawl_queue.push(normalize(seed_url, link))  ######################
                crawl_queue.complete(url)   ######################

    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:           ######################
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():    #######################
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)  # set daemon so main thread can exit when it receives ctrl-c
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
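Every variant in this section relies on a MongoQueue that behaves like an atomic work queue: push() adds a URL in an outstanding state, pop() atomically claims one for processing, complete() marks it done, peek() reports whether any outstanding URL remains, and the queue object itself stays truthy while unfinished work exists. The class below is a minimal sketch of that interface, assuming a local MongoDB instance and pymongo; the collection name, field names, and state constants are illustrative, not the original implementation. Note that this sketch raises KeyError from pop(), matching the variants above and below; the versions that catch IndexError instead would need that exception type swapped.

from datetime import datetime, timedelta
from pymongo import MongoClient, errors


class MongoQueue:
    """Sketch of the queue interface assumed by threaded_crawler (not the original code)."""
    OUTSTANDING, PROCESSING, COMPLETE = range(3)

    def __init__(self, client=None, timeout=300):
        self.client = client or MongoClient()
        self.db = self.client.cache     # hypothetical database name
        self.timeout = timeout          # seconds before a PROCESSING url is considered stalled

    def __bool__(self):
        # truthy while any url is not yet COMPLETE, so `while threads or crawl_queue:` keeps looping
        record = self.db.crawl_queue.find_one({'status': {'$ne': self.COMPLETE}})
        return record is not None

    def push(self, url):
        # insert the url as OUTSTANDING; ignore duplicates so each url is queued only once
        try:
            self.db.crawl_queue.insert_one({'_id': url, 'status': self.OUTSTANDING})
        except errors.DuplicateKeyError:
            pass

    def pop(self):
        # atomically claim an OUTSTANDING url; raise KeyError when none is available
        record = self.db.crawl_queue.find_one_and_update(
            {'status': self.OUTSTANDING},
            {'$set': {'status': self.PROCESSING, 'timestamp': datetime.now()}})
        if record is None:
            self.repair()
            raise KeyError()
        return record['_id']

    def peek(self):
        record = self.db.crawl_queue.find_one({'status': self.OUTSTANDING})
        return record['_id'] if record else None

    def complete(self, url):
        self.db.crawl_queue.update_one({'_id': url}, {'$set': {'status': self.COMPLETE}})

    def repair(self):
        # release urls whose worker died while they were still PROCESSING
        stalled = datetime.now() - timedelta(seconds=self.timeout)
        self.db.crawl_queue.update_many(
            {'timestamp': {'$lt': stalled}, 'status': self.PROCESSING},
            {'$set': {'status': self.OUTSTANDING}})

    def clear(self):
        self.db.crawl_queue.drop()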
def threaded_crawler(delay=5, cache=None, scrape_callback=None, user_agent='wswp', proxies=None, num_retries=1, max_threads=10, timeout=60):
    """Crawl using multiple threads
    """
    # the queue of URLs that still need to be crawled
    urllist = MongoQueue()  # peek() checks whether any record still has status 0 and returns True or False

    def process_queue():
        while True:
            # keep track of the url currently being processed
            try:
                url = urllist.pop()
                print('url', url)
                D = Download()
                D.Downloader(url)
            except KeyError:
                # currently no urls to process
                break

    # wait for all download threads to finish
    threads = []
    while threads or urllist:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        print(urllist.peek() is True)
        if urllist.peek():
            while len(threads) < max_threads:
                # can start some more threads
                thread = threading.Thread(target=process_queue)
                thread.setDaemon(True)  # set daemon so main thread can exit when it receives ctrl-c
                thread.start()
                threads.append(thread)
        else:
            break
        time.sleep(SLEEP_TIME)
def thread_crawl(seed_url, max_threads=10, delay=5, user_agent='Aurora-Twinkle', proxies=None, max_retries=1, scrape_callback=None, cache=None):
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url)
    D = Downloader(delay=delay, user_agent=user_agent, proxies=proxies, max_retries=max_retries, cache=cache)
    rp = get_robots(seed_url)

    def process_queue():
        while True:
            try:
                url = crawl_queue.pop()
            except IndexError:
                break
            else:
                if rp.can_fetch(user_agent, url):
                    html = D(url)
                    if scrape_callback:
                        try:
                            links = scrape_callback(url, html) or []
                        except Exception as e:
                            print("Error in callback for: {}: {}".format(url, e))
                        else:
                            for link in links:
                                link = format_link(seed_url, link)
                                crawl_queue.push(link)
                    crawl_queue.complete(url)
                else:
                    print('user_agent: "' + user_agent + '" blocked by robots.txt:', url)

    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
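This variant additionally consults robots.txt before fetching and normalizes each link before queuing it. The helpers get_robots and format_link are not shown in the snippet; the sketch below is a plausible standard-library implementation, assuming format_link simply resolves a relative link against the seed URL and strips any fragment.

from urllib import robotparser
from urllib.parse import urljoin, urldefrag


def get_robots(seed_url):
    """Parse the robots.txt for the seed url's site (sketch of the assumed helper)."""
    rp = robotparser.RobotFileParser()
    rp.set_url(urljoin(seed_url, '/robots.txt'))
    rp.read()
    return rp


def format_link(seed_url, link):
    """Resolve a possibly relative link against seed_url and drop any #fragment."""
    link, _fragment = urldefrag(link)   # remove anything after '#'
    return urljoin(seed_url, link)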
def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None,
                     user_agent='wswp', proxies=None, num_retries=1, max_threads=10, timeout=60):
    """Crawl using multiple threads
    """
    # the queue of URLs that still need to be crawled
    crawl_queue = MongoQueue()
    webpage_cache = MongoCache()
    # crawl_queue.clear()
    crawl_queue.push(seed_url)
    D = Downloader(delay=DEFAULT_DELAY, user_agent=DEFAULT_AGENT, proxies=DEFAULT_PROXY_LIST,
                   cookies=DEFAULT_COOKIE, num_retries=DEFAULT_RETRIES, timeout=DEFAULT_TIMEOUT,
                   opener=None, cache=MongoCache())

    def process_queue():
        while True:
            # keep track of the url currently being processed
            try:
                url = crawl_queue.pop()
            except KeyError:
                # currently no urls to process
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print('Error in callback for: {}: {}'.format(url, e))
                    else:
                        for link in links:
                            # add this new link to the queue
                            crawl_queue.push(normalize(seed_url, link))
                # re-queue the url if the cached response was a server error (5xx) or the -999 sentinel
                if (500 <= webpage_cache[url]['code'] < 600) or (webpage_cache[url]['code'] == -999):
                    crawl_queue.reset(url)
                else:
                    crawl_queue.complete(url)

    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)  # set daemon so main thread can exit when it receives ctrl-c
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
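Unlike the other variants, this one inspects the cached response code after each download and re-queues server errors (5xx) and the author's -999 sentinel rather than marking them complete. The reset method is not defined in the snippet; under the MongoQueue sketch shown earlier, it could plausibly just flip the record back to the outstanding state so another worker retries it:

    def reset(self, url):
        # hypothetical MongoQueue method: return a claimed url to the OUTSTANDING state
        # so that a later worker will pop and retry it
        self.db.crawl_queue.update_one(
            {'_id': url},
            {'$set': {'status': self.OUTSTANDING}})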
def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None, user_agent='wswp', proxies=None, num_retries=1, max_threads=10, timeout=60):
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url)
    D = Download(cache=cache, delay=delay, user_agent=user_agent, proxies=proxies, num_retries=num_retries, timeout=timeout)

    def process_queue():
        while True:
            try:
                url = crawl_queue.pop()
            except IndexError:
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print(f'Error in callback for: {url}: {e}')
                    else:
                        for link in links:
                            crawl_queue.push(normalize(seed_url, link))
                crawl_queue.complete(url)

    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():
            thread = threading.Thread(target=process_queue)
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None, user_agent='wswp', proxies=None, num_retries=1, max_threads=10, timeout=60):
    """Crawl using multiple threads
    """
    # the queue of URLs that still need to be crawled
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url)
    D = Downloader(cache=cache, delay=delay, user_agent=user_agent, proxies=proxies, num_retries=num_retries, timeout=timeout)

    def process_queue():
        while True:
            # keep track of the url currently being processed
            try:
                url = crawl_queue.pop()
            except KeyError:
                # currently no urls to process
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print('Error in callback for: {}: {}'.format(url, e))
                    else:
                        for link in links:
                            # add this new link to the queue
                            crawl_queue.push(normalize(seed_url, link))
                crawl_queue.complete(url)

    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)  # set daemon so main thread can exit when it receives ctrl-c
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
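All of these variants poll the thread pool and queue every SLEEP_TIME seconds and expect threading and time to be imported at module level. Below is a minimal sketch of that module setup plus a call site, assuming the Downloader and MongoQueue classes from the snippets above; the seed URL and the link-extracting callback are hypothetical examples, not part of the original code.

import re
import threading
import time

SLEEP_TIME = 1   # seconds to wait between polls of the thread pool and queue


def link_callback(url, html):
    """Hypothetical scrape callback: log the page and return the links found on it."""
    print('Downloaded: {} ({} bytes)'.format(url, len(html or '')))
    # return every href on the page; threaded_crawler normalizes them against seed_url
    return re.findall(r'href=[\'"]?([^\'" >]+)', html or '')


if __name__ == '__main__':
    threaded_crawler('http://example.webscraping.com',   # hypothetical seed url
                     delay=3,
                     scrape_callback=link_callback,
                     max_threads=5)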