def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None,
                     user_agent='wu_being', proxies=None, num_retries=1,
                     max_threads=10, timeout=60):
    """Crawl using multiple threads"""
    # the queue of URLs that still need to be crawled
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url)
    D = Downloader(cache=cache, delay=delay, user_agent=user_agent,
                   proxies=proxies, num_retries=num_retries, timeout=timeout)

    def process_queue():
        while True:
            # pop the next URL and mark it as being processed
            try:
                url = crawl_queue.pop()
            except KeyError:
                # currently no urls to process
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print('Error in callback for: {}: {}'.format(url, e))
                    else:
                        for link in links:
                            # add this new link to the queue
                            crawl_queue.push(normalize(seed_url, link))
                crawl_queue.complete(url)

    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)  # set daemon so main thread can exit on ctrl-c
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
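# All of the crawler variants in this listing assume a MongoQueue class with
# push/pop/peek/complete/clear and truthiness support, which is not included
# here. Below is a minimal sketch built on pymongo; the collection name,
# timeout, and status values are assumptions for illustration only. Note that
# this sketch raises KeyError from pop() when empty, while some variants below
# catch IndexError instead.
from datetime import datetime, timedelta

from pymongo import MongoClient, errors


class MongoQueueSketch:
    # hypothetical processing states
    OUTSTANDING, PROCESSING, COMPLETE = range(3)

    def __init__(self, client=None, timeout=300):
        self.client = client or MongoClient()
        self.db = self.client.cache            # assumed database name
        self.timeout = timeout                 # seconds before a stalled job is retried

    def __bool__(self):
        """True while any URL is still outstanding or being processed."""
        record = self.db.crawl_queue.find_one({'status': {'$ne': self.COMPLETE}})
        return record is not None

    def push(self, url):
        """Add a URL if it has not been seen before (unique _id)."""
        try:
            self.db.crawl_queue.insert_one({'_id': url, 'status': self.OUTSTANDING})
        except errors.DuplicateKeyError:
            pass  # already in the queue

    def pop(self):
        """Atomically claim an outstanding URL; raise KeyError when none is left."""
        record = self.db.crawl_queue.find_one_and_update(
            {'status': self.OUTSTANDING},
            {'$set': {'status': self.PROCESSING, 'timestamp': datetime.now()}})
        if record is None:
            self.repair()
            raise KeyError()
        return record['_id']

    def peek(self):
        record = self.db.crawl_queue.find_one({'status': self.OUTSTANDING})
        return record['_id'] if record else None

    def complete(self, url):
        self.db.crawl_queue.update_one({'_id': url}, {'$set': {'status': self.COMPLETE}})

    def repair(self):
        """Release jobs whose worker appears to have stalled."""
        cutoff = datetime.now() - timedelta(seconds=self.timeout)
        self.db.crawl_queue.update_many(
            {'timestamp': {'$lt': cutoff}, 'status': {'$ne': self.COMPLETE}},
            {'$set': {'status': self.OUTSTANDING}})

    def clear(self):
        self.db.crawl_queue.drop()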
def thread_crawl(seed_url, max_threads=10, delay=5, user_agent='Aurora-Twinkle',
                 proxies=None, max_retries=1, scrape_callback=None, cache=None):
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url)
    D = Downloader(delay=delay, user_agent=user_agent, proxies=proxies,
                   max_retries=max_retries, cache=cache)
    rp = get_robots(seed_url)

    def process_queue():
        while True:
            try:
                url = crawl_queue.pop()
            except IndexError:
                break
            else:
                if rp.can_fetch(user_agent, url):
                    html = D(url)
                    if scrape_callback:
                        try:
                            links = scrape_callback(url, html) or []
                        except Exception as e:
                            print("Error in callback for: {}: {}".format(url, e))
                        else:
                            for link in links:
                                link = format_link(seed_url, link)
                                crawl_queue.push(link)
                    crawl_queue.complete(url)
                else:
                    print('user_agent: "' + user_agent + '" blocked by robots.txt:', url)

    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
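# thread_crawl above assumes a get_robots helper that returns a parser whose
# can_fetch(user_agent, url) is consulted before each download. A minimal
# sketch using the standard library robotparser is shown below; the real
# helper in the source project may differ.
from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser


def get_robots_sketch(seed_url):
    """Fetch and parse robots.txt for the site containing seed_url."""
    rp = RobotFileParser()
    rp.set_url(urljoin(seed_url, '/robots.txt'))
    rp.read()
    return rp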
def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None,
                     user_agent='wswp', proxies=None, num_retries=1,
                     max_threads=10, timeout=60):
    """Crawl this website in multiple threads"""
    # the queue of URLs that still need to be crawled
    # crawl_queue = Queue.deque([seed_url])
    # crawl_queue = [seed_url]
    crawl_queue = MongoQueue()
    crawl_queue.push(seed_url)
    # the URLs that have been seen
    seen = set([seed_url])
    D = Downloader(cache=cache, delay=delay, user_agent=user_agent,
                   proxies=proxies, num_retries=num_retries, timeout=timeout)

    def process_queue():
        while True:
            try:
                url = crawl_queue.pop()
            except IndexError:
                # crawl queue is empty
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print('Error in callback for: {}: {}'.format(url, e))
                    else:
                        for link in links:
                            link = normalize(seed_url, link)
                            # check whether this link has already been crawled
                            if link not in seen:
                                seen.add(link)
                                # add this new link to the queue
                                crawl_queue.push(link)
                                # crawl_queue.append(link)
                # mark this URL as processed so the queue can drain
                crawl_queue.complete(url)

    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:
        # the crawl is still active
        for thread in threads:
            if not thread.is_alive():
                # remove the stopped threads
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue:
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)  # set daemon so main thread can exit on ctrl-c
            thread.start()
            threads.append(thread)
        # all threads have been processed
        # sleep temporarily so the CPU can focus execution on other threads
        time.sleep(SLEEP_TIME)
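# Several variants call a normalize(seed_url, link) helper before queueing a
# link. A minimal sketch is shown below, assuming the helper strips fragments
# and resolves relative paths against the seed URL; the original helper is not
# included in this listing.
from urllib.parse import urldefrag, urljoin


def normalize_sketch(seed_url, link):
    """Remove the #fragment and resolve the link relative to seed_url."""
    link, _ = urldefrag(link)      # e.g. '/index.html#top' -> '/index.html'
    return urljoin(seed_url, link)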
def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None,
                     user_agent='wswp', proxies=None, num_retries=1,
                     max_threads=10, timeout=60):
    """Crawl using multiple threads"""
    # the queue of URLs that still need to be crawled
    crawl_queue = MongoQueue()
    webpage_cache = MongoCache()
    # crawl_queue.clear()
    crawl_queue.push(seed_url)
    D = Downloader(delay=DEFAULT_DELAY, user_agent=DEFAULT_AGENT,
                   proxies=DEFAULT_PROXY_LIST, cookies=DEFAULT_COOKIE,
                   num_retries=DEFAULT_RETRIES, timeout=DEFAULT_TIMEOUT,
                   opener=None, cache=MongoCache())

    def process_queue():
        while True:
            # pop the next URL and mark it as being processed
            try:
                url = crawl_queue.pop()
            except KeyError:
                # currently no urls to process
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print('Error in callback for: {}: {}'.format(url, e))
                    else:
                        for link in links:
                            # add this new link to the queue
                            crawl_queue.push(normalize(seed_url, link))
                # re-queue server errors (5xx) and the sentinel code -999,
                # otherwise mark the URL as done
                if 500 <= webpage_cache[url]['code'] < 600 or webpage_cache[url]['code'] == -999:
                    crawl_queue.reset(url)
                else:
                    crawl_queue.complete(url)

    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)  # set daemon so main thread can exit on ctrl-c
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
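# The variant above checks webpage_cache[url]['code'] to decide whether to
# reset or complete a queue entry, which assumes a MongoCache mapping URLs to
# records that include the HTTP status code. A minimal sketch is shown below;
# the field names, compression, and storage layout are assumptions for
# illustration only.
import pickle
import zlib
from datetime import datetime

from pymongo import MongoClient


class MongoCacheSketch:
    def __init__(self, client=None):
        self.client = client or MongoClient()
        self.db = self.client.cache           # assumed database name

    def __getitem__(self, url):
        record = self.db.webpage.find_one({'_id': url})
        if record is None:
            raise KeyError(url + ' is not in the cache')
        # e.g. {'html': '<html>...', 'code': 200}
        return pickle.loads(zlib.decompress(record['result']))

    def __setitem__(self, url, result):
        record = {'result': zlib.compress(pickle.dumps(result)),
                  'timestamp': datetime.utcnow()}
        self.db.webpage.update_one({'_id': url}, {'$set': record}, upsert=True)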
def url_to_mongoqueue(url):
    """Push the URLs found on a listing page into the MongoDB queue."""
    crawl_queue = MongoQueue('crawl_queue')
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    all_a = soup.find('div', class_='all').find_all('a')
    for a in all_a:
        title = a.get_text()
        link = a['href']   # use a separate name so the url parameter is not shadowed
        print(title)
        print(link)
        crawl_queue.push(link, title)
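# A small self-contained check of the parsing pattern used above: find the
# <div class="all"> container and collect each link's text and href. The HTML
# below is made up for illustration; the real listing page will differ.
from bs4 import BeautifulSoup

sample_html = """
<div class="all">
  <a href="http://example.com/page/1">First album</a>
  <a href="http://example.com/page/2">Second album</a>
</div>
"""
soup = BeautifulSoup(sample_html, 'lxml')
for a in soup.find('div', class_='all').find_all('a'):
    print(a.get_text().strip(), a['href'])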
def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None,
                     user_agent='wswp', proxies=None, num_retries=1,
                     max_threads=10, timeout=60):
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url)
    D = Download(cache=cache, delay=delay, user_agent=user_agent,
                 proxies=proxies, num_retries=num_retries, timeout=timeout)

    def process_queue():
        while True:
            try:
                url = crawl_queue.pop()
            except IndexError:
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print(f'Error in callback for: {url}: {e}')
                    else:
                        for link in links:
                            crawl_queue.push(normalize(seed_url, link))
                crawl_queue.complete(url)

    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():
            thread = threading.Thread(target=process_queue)
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
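# The scrape_callback passed to these crawlers is expected to take (url, html)
# and return a list of links to enqueue (or None). A minimal sketch using a
# regular expression is shown below; the pattern and the idea of returning
# every <a href> on the page are assumptions for illustration, not the
# original project's callback.
import re

LINK_REGEX = re.compile(r'<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)


def link_scrape_callback(url, html):
    """Return all href values found in the downloaded page."""
    if not html:
        return []
    return LINK_REGEX.findall(html)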
def link_crawler(seed_url, link_regex=None, proxies=None, delay=1, max_depth=-1,
                 timeout=5, max_thread=5, sleep_time=1, cache=None,
                 scraping_callback=None, debug=False):
    crawl_queue = MongoQueue()
    crawl_queue.push(seed_url)
    d = Downloader(cache=MongoCache(), delay=delay, proxies=proxies,
                   timeout=timeout, debug=debug)

    def thread_crawl():
        while True:
            try:
                url = crawl_queue.pop()
                html = d(url)
            except KeyError:
                break
            except Exception:
                pass
            else:
                links = scraping_callback(url, html) if scraping_callback else []
                for link in links:
                    crawl_queue.push(link)
                crawl_queue.complete(url)

    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_thread and crawl_queue:
            t = threading.Thread(target=thread_crawl)
            t.setDaemon(True)
            t.start()
            threads.append(t)
        time.sleep(sleep_time)
def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None,
                     user_agent='wswp', proxies=None, num_retries=1,
                     max_threads=10, timeout=60):
    """Crawl using multiple threads"""
    # the queue of URLs that still need to be crawled
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url)
    D = Downloader(cache=cache, delay=delay, user_agent=user_agent,
                   proxies=proxies, num_retries=num_retries, timeout=timeout)

    def process_queue():
        while True:
            # pop the next URL and mark it as being processed
            try:
                url = crawl_queue.pop()
            except KeyError:
                # currently no urls to process
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print('Error in callback for: {}: {}'.format(url, e))
                    else:
                        for link in links:
                            # add this new link to the queue
                            crawl_queue.push(normalize(seed_url, link))
                crawl_queue.complete(url)

    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)  # set daemon so main thread can exit on ctrl-c
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
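# Each variant treats D = Downloader(...) as a callable that returns the HTML
# for a URL, handling caching, throttling, and retries internally. The real
# Downloader is not part of this listing; below is a minimal sketch built on
# requests, and its retry-on-5xx policy, per-domain delay, and cached record
# layout ({'html', 'code'}) are assumptions for illustration.
import time
from urllib.parse import urlparse

import requests


class DownloaderSketch:
    def __init__(self, delay=5, user_agent='wswp', proxies=None,
                 num_retries=1, timeout=60, cache=None):
        self.delay = delay
        self.user_agent = user_agent
        self.proxies = proxies
        self.num_retries = num_retries
        self.timeout = timeout
        self.cache = cache            # optional dict-like cache, e.g. MongoCacheSketch
        self.last_request = {}        # per-domain timestamps for throttling

    def __call__(self, url):
        if self.cache is not None:
            try:
                return self.cache[url]['html']
            except KeyError:
                pass  # not cached yet
        result = self.download(url, self.num_retries)
        if self.cache is not None:
            self.cache[url] = result
        return result['html']

    def throttle(self, url):
        """Sleep so requests to the same domain are at least `delay` seconds apart."""
        domain = urlparse(url).netloc
        last = self.last_request.get(domain)
        if last is not None:
            wait = self.delay - (time.time() - last)
            if wait > 0:
                time.sleep(wait)
        self.last_request[domain] = time.time()

    def download(self, url, num_retries):
        self.throttle(url)
        try:
            response = requests.get(url, headers={'User-Agent': self.user_agent},
                                    proxies=self.proxies, timeout=self.timeout)
            html, code = response.text, response.status_code
            if num_retries > 0 and 500 <= code < 600:
                return self.download(url, num_retries - 1)  # retry server errors
        except requests.RequestException:
            html, code = None, -999   # sentinel code used by one variant above
        return {'html': html, 'code': code}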
def thread_crawler(seed_url, user_agent="wswp", headers=None, proxies=None,
                   num_retries=2, cache=None, scrape_callback=None,
                   max_threads_num=5):
    """Crawl a webpage using multiple threads."""
    crawl_queue = MongoQueue()
    crawl_queue.push(seed_url)
    D = Downloader(1, user_agent, headers, proxies, num_retries, cache)

    def process_task():
        while True:
            try:
                url = crawl_queue.pop()
            except KeyError:
                print("currently no urls to process")
                break
            else:
                print("Downloading thread name is", threading.current_thread().name)
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print("Error in callback for {}: {}".format(url, e))
                    else:
                        for link in links:
                            link = normalize(seed_url, link)
                            crawl_queue.push(link)
                crawl_queue.complete(url)

    threads = []
    while threads or crawl_queue:
        # the crawl is still alive
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads_num and crawl_queue:
            thread = threading.Thread(target=process_task)
            thread.setDaemon(True)
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
def threaded_crawler(seed_url, link_regex=None, delay=1, cache=None,
                     scrape_callback=None, user_agent='Safari', proxies=None,
                     num_retries=1, max_threads=10, timeout=60):
    """Multi-threaded crawler: several threads work one queue, backed by MongoDB."""
    # crawl_queue = [seed_url]
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url)
    # seen = set([seed_url])
    # blacklist of blocked sites
    block_filename = os.path.join(BASEDIR, 'blocked_urls.txt')
    blocked_urls = [i.strip() for i in open(block_filename) if i.strip()] \
        if os.path.isfile(block_filename) else []
    # save_cache=False is only needed for testing
    D = Downloader(delay=delay, user_agent=user_agent, proxies=proxies,
                   num_retries=num_retries, timeout=timeout, cache=cache,
                   save_cache=False, blocked_urls=blocked_urls)

    def process_queue():
        while True:
            try:
                url = crawl_queue.pop()
            except (IndexError, KeyError):
                # stop when the queue is empty
                break
            else:
                html = D(url) if url else None
                if html and scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                        if link_regex:
                            links.extend(link for link in get_links(html)
                                         if re.match(link_regex, link))
                    except Exception as e:
                        print('Error in callback for: {}: {}'.format(url, e))
                    else:
                        for link in links:
                            link = normalize(seed_url, link)
                            crawl_queue.push(link)  # enqueue
                            # if link not in seen:
                            #     seen.add(link)
                # print(html)
                # if html:
                #     # mark as complete
                #     crawl_queue.complete(url)
                crawl_queue.complete(url)

    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue:
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
startday, endday = calendar.monthrange(year, month)
datelist.append([str(year) + '-' + str(month) + '-' + str(1),
                 str(year) + '-' + str(month) + '-' + str(endday)])
for date in datelist:
    returnlist = list(pd.read_csv(CONNECT_PATH + 'waiting.csv')['url'])
    browser.find_element_by_xpath(startkey).clear()
    browser.find_element_by_xpath(endkey).clear()
    browser.find_element_by_xpath(userkey).clear()
    browser.find_element_by_xpath(startkey).send_keys(date[0])
    browser.find_element_by_xpath(endkey).send_keys(date[1])
    browser.find_element_by_xpath(userkey).send_keys('公司')  # search term: "company"
    browser.find_element_by_xpath(searchclick).click()
    html = browser.page_source
    retlist = usere(r'<a href="(default.aspx?.+?)" target', html)
    retlist = ['http://www.landchina.com/' + i for i in retlist]
    retlist = [i.replace('&amp;', '&') for i in retlist]
    returnlist.extend(retlist)
    for i in retlist:
        crawl_queue.push(i)
    pagenum = int(usere(r'共(\d+?)页', html)[0])  # "X pages in total"
    for i in range(pagenum):
        browser.find_element_by_link_text('下页').click()  # click "next page"
        html = browser.page_source
        retlist = usere(r'<a href="(default.aspx?.+?)" target', html)
        retlist = ['http://www.landchina.com/' + i for i in retlist]
        retlist = [i.replace('&amp;', '&') for i in retlist]
        for i in retlist:
            crawl_queue.push(i)
        returnlist.extend(retlist)
    returndf = pd.DataFrame({'url': returnlist})
    returndf.to_csv(CONNECT_PATH + 'waiting.csv', index=False)
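# The fragment above relies on a usere(pattern, text) helper, a selenium
# `browser`, and a shared crawl_queue, none of which are defined in this
# listing. A minimal sketch of usere as a thin wrapper around re.findall is
# shown below; the name and behaviour are assumptions based only on how it is
# used above.
import re


def usere_sketch(pattern, text):
    """Return all non-overlapping matches of pattern in text."""
    return re.findall(pattern, text)


# e.g. usere_sketch(r'共(\d+?)页', '共12页') -> ['12']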