def threaded_crawler(seed_url,
                     delay=5,
                     cache=None,
                     scrape_callback=None,
                     user_agent='wu_being',
                     proxies=None,
                     num_retries=1,
                     max_threads=10,
                     timeout=60):
    """Crawl starting from ``seed_url`` using multiple worker threads.

    A ``MongoQueue`` is cleared, seeded with ``seed_url`` and then drained by
    up to ``max_threads`` threads.  The call returns once no worker is alive
    and the queue evaluates as false.

    Args:
        seed_url: First URL to crawl; also passed to ``normalize`` as the
            base for every link returned by ``scrape_callback``.
        delay, cache, user_agent, proxies, num_retries, timeout: Forwarded
            unchanged to ``Downloader``.
        scrape_callback: Optional callable ``(url, html)`` returning an
            iterable of links to enqueue; a falsy result means "no links".
        max_threads: Maximum number of worker threads alive at once.
    """
    # the queue of URLs that still need to be crawled
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url)
    D = Downloader(cache=cache,
                   delay=delay,
                   user_agent=user_agent,
                   proxies=proxies,
                   num_retries=num_retries,
                   timeout=timeout)

    def process_queue():
        """Worker loop: pop URLs until the queue reports none are left."""
        while True:
            try:
                url = crawl_queue.pop()
            except KeyError:
                # currently no urls to process
                break
            html = D(url)
            if scrape_callback:
                try:
                    links = scrape_callback(url, html) or []
                except Exception as e:
                    # Parenthesised so the line parses on Python 2 and 3;
                    # the printed text is unchanged.
                    print('Error in callback for: {}: {}'.format(url, e))
                else:
                    for link in links:
                        # add this new link to the queue
                        crawl_queue.push(normalize(seed_url, link))
            crawl_queue.complete(url)

    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:
        # Rebuild the list rather than calling remove() while iterating it,
        # which skips the entry that follows every removed thread.
        threads = [t for t in threads if t.is_alive()]
        while len(threads) < max_threads and crawl_queue.peek():
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            # daemon so the main thread can exit when it receives ctrl-c
            thread.daemon = True
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
Пример #2
0
def thread_crawl(seed_url,
                 max_threads=10,
                 delay=5,
                 user_agent='Aurora-Twinkle',
                 proxies=None,
                 max_retries=1,
                 scrape_callback=None,
                 cache=None):
    """Crawl from ``seed_url`` with multiple threads, honouring robots.txt.

    A ``MongoQueue`` is cleared, seeded with ``seed_url`` and drained by up
    to ``max_threads`` worker threads.  Each popped URL is downloaded only
    if the parser returned by ``get_robots(seed_url)`` allows ``user_agent``
    to fetch it.

    Args:
        seed_url: First URL to crawl; also passed to ``get_robots`` and to
            ``format_link`` for every discovered link.
        max_threads: Maximum number of worker threads alive at once.
        delay, user_agent, proxies, max_retries, cache: Forwarded unchanged
            to ``Downloader``.
        scrape_callback: Optional callable ``(url, html)`` returning an
            iterable of links to enqueue; a falsy result means "no links".
    """
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url)
    D = Downloader(delay=delay,
                   user_agent=user_agent,
                   proxies=proxies,
                   max_retries=max_retries,
                   cache=cache)
    rp = get_robots(seed_url)

    def process_queue():
        """Worker loop: pop URLs until ``pop()`` raises IndexError."""
        while True:
            try:
                url = crawl_queue.pop()
            except IndexError:
                break
            if rp.can_fetch(user_agent, url):
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print("Error in callback for :{}:{}".format(
                            url, e))
                    else:
                        for link in links:
                            crawl_queue.push(format_link(seed_url, link))
            else:
                print(
                    'user_agent: "' + user_agent +
                    '" Blocked by robots.txt:', url)
            # Blocked URLs are completed too; previously they were popped
            # but never marked finished.
            # NOTE(review): presumably an unfinished entry keeps the queue
            # truthy and stalls the main loop - confirm against MongoQueue.
            crawl_queue.complete(url)

    threads = []
    while threads or crawl_queue:
        # Rebuild the list rather than calling remove() while iterating it,
        # which skips the entry that follows every removed thread.
        threads = [t for t in threads if t.is_alive()]

        while len(threads) < max_threads and crawl_queue.peek():
            thread = threading.Thread(target=process_queue)
            thread.daemon = True
            thread.start()
            threads.append(thread)

        time.sleep(SLEEP_TIME)
Пример #3
0
def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None, user_agent='wswp', proxies=None, num_retries=1, max_threads=10, timeout=60):
    """Crawl this website in multiple threads.

    ``seed_url`` is pushed onto a ``MongoQueue`` (which is not cleared
    first) and up to ``max_threads`` workers drain it.  An in-memory
    ``seen`` set keeps a link from being pushed more than once per call.

    Args:
        seed_url: First URL to crawl; also passed to ``normalize`` as the
            base for every link returned by ``scrape_callback``.
        delay, cache, user_agent, proxies, num_retries, timeout: Forwarded
            unchanged to ``Downloader``.
        scrape_callback: Optional callable ``(url, html)`` returning an
            iterable of links; a falsy result means "no links".
        max_threads: Maximum number of worker threads alive at once.
    """
    # the queue of URLs that still need to be crawled
    crawl_queue = MongoQueue()
    crawl_queue.push(seed_url)
    # the URLs that have been seen
    seen = {seed_url}
    D = Downloader(cache=cache, delay=delay, user_agent=user_agent, proxies=proxies, num_retries=num_retries, timeout=timeout)

    def process_queue():
        """Worker loop: pop URLs until ``pop()`` raises IndexError."""
        while True:
            try:
                url = crawl_queue.pop()
            except IndexError:
                # crawl queue is empty
                break
            html = D(url)
            if scrape_callback:
                try:
                    links = scrape_callback(url, html) or []
                except Exception as e:
                    # Parenthesised so the line parses on Python 2 and 3;
                    # the printed text is unchanged.
                    print('Error in callback for: {}: {}'.format(url, e))
                else:
                    for link in links:
                        link = normalize(seed_url, link)
                        # check whether this link was already crawled
                        if link not in seen:
                            seen.add(link)
                            # Enqueue the new link.  The original called
                            # complete(link) here, so discovered links
                            # were never added to the queue.
                            crawl_queue.push(link)
            # Mark the URL just processed as finished; the original never
            # completed any URL it had popped.
            crawl_queue.complete(url)

    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:
        # the crawl is still active; drop stopped threads by rebuilding the
        # list (remove() during iteration skips entries)
        threads = [t for t in threads if t.is_alive()]
        while len(threads) < max_threads and crawl_queue:
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            # daemon so the main thread can exit when it receives ctrl-c
            thread.daemon = True
            thread.start()
            threads.append(thread)
        # sleep temporarily so CPU can focus execution on other threads
        time.sleep(SLEEP_TIME)
Пример #4
0
def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None,
                     user_agent='wswp', proxies=None, num_retries=1,
                     max_threads=10, timeout=60):
    """Crawl using multiple threads, re-queuing URLs whose download failed.

    After each download the cached record for the URL is read from a
    ``MongoCache``; when its ``'code'`` is a 5xx status or ``-999`` the URL
    is passed to ``crawl_queue.reset`` instead of ``crawl_queue.complete``.

    NOTE(review): ``delay``, ``cache``, ``user_agent``, ``proxies``,
    ``num_retries`` and ``timeout`` are accepted but not used; the
    ``Downloader`` is configured from the module-level ``DEFAULT_*``
    constants and its own ``MongoCache()``.  Confirm this is intended.

    Args:
        seed_url: First URL to crawl; also passed to ``normalize`` as the
            base for every link returned by ``scrape_callback``.
        scrape_callback: Optional callable ``(url, html)`` returning an
            iterable of links to enqueue; a falsy result means "no links".
        max_threads: Maximum number of worker threads alive at once.
    """
    # the queue of URLs that still need to be crawled
    crawl_queue = MongoQueue()
    webpage_cache = MongoCache()
    # NOTE(review): the queue is not cleared before seeding (a clear() call
    # was commented out here) - confirm that is intended.
    crawl_queue.push(seed_url)
    D = Downloader(delay=DEFAULT_DELAY, user_agent=DEFAULT_AGENT,
                   proxies=DEFAULT_PROXY_LIST, cookies=DEFAULT_COOKIE,
                   num_retries=DEFAULT_RETRIES, timeout=DEFAULT_TIMEOUT,
                   opener=None, cache=MongoCache())

    def process_queue():
        """Worker loop: pop URLs until ``pop()`` raises KeyError."""
        while True:
            try:
                url = crawl_queue.pop()
            except KeyError:
                # currently no urls to process
                break
            html = D(url)
            if scrape_callback:
                try:
                    links = scrape_callback(url, html) or []
                except Exception as e:
                    print('Error in callback for: {}: {}'.format(url, e))
                else:
                    for link in links:
                        # add this new link to the queue
                        crawl_queue.push(normalize(seed_url, link))
            # Look the status up once and combine the tests with a boolean
            # ``or`` (the original used bitwise ``|`` and two lookups).
            # -999 is presumably the Downloader's marker for a request that
            # got no HTTP response - TODO confirm.
            code = webpage_cache[url]['code']
            if 500 <= code < 600 or code == -999:
                crawl_queue.reset(url)
            else:
                crawl_queue.complete(url)

    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:
        # Rebuild the list rather than calling remove() while iterating it,
        # which skips the entry that follows every removed thread.
        threads = [t for t in threads if t.is_alive()]
        while len(threads) < max_threads and crawl_queue.peek():
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            # daemon so the main thread can exit when it receives ctrl-c
            thread.daemon = True
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
Пример #5
0
def url_to_mongoqueue(url):
    """Write the URLs found on the page at ``url`` into the MongoDB queue.

    The page is fetched with ``requests`` and parsed with BeautifulSoup
    (``lxml`` parser).  For every ``<a>`` inside the first ``<div>`` with
    class ``all``, the link text and ``href`` are printed and then pushed
    onto the ``MongoQueue`` named ``'crawl_queue'``.
    """
    crawl_queue = MongoQueue('crawl_queue')
    page = requests.get(url)
    listing = BeautifulSoup(page.text, 'lxml').find('div', class_='all')
    for anchor in listing.find_all('a'):
        title, href = anchor.get_text(), anchor['href']
        print(title)
        print(href)
        crawl_queue.push(href, title)
Пример #6
0
def threaded_crawler(seed_url,
                     delay=5,
                     cache=None,
                     scrape_callback=None,
                     user_agent='wswp',
                     proxies=None,
                     num_retries=1,
                     max_threads=10,
                     timeout=60):
    """Crawl starting from ``seed_url`` using multiple worker threads.

    A ``MongoQueue`` is cleared, seeded with ``seed_url`` and drained by up
    to ``max_threads`` threads.  The call returns once no worker is alive
    and the queue evaluates as false.

    Args:
        seed_url: First URL to crawl; also passed to ``normalize`` as the
            base for every link returned by ``scrape_callback``.
        delay, cache, user_agent, proxies, num_retries, timeout: Forwarded
            unchanged to ``Download``.
        scrape_callback: Optional callable ``(url, html)`` returning an
            iterable of links to enqueue; a falsy result means "no links".
        max_threads: Maximum number of worker threads alive at once.
    """
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url)
    D = Download(cache=cache,
                 delay=delay,
                 user_agent=user_agent,
                 proxies=proxies,
                 num_retries=num_retries,
                 timeout=timeout)

    def process_queue():
        """Worker loop: pop URLs until ``pop()`` raises IndexError."""
        while True:
            try:
                url = crawl_queue.pop()
            except IndexError:
                break
            html = D(url)
            if scrape_callback:
                try:
                    links = scrape_callback(url, html) or []
                except Exception as e:
                    print(f'Error in callback for:{url}:{e}')
                else:
                    for link in links:
                        crawl_queue.push(normalize(seed_url, link))
            crawl_queue.complete(url)

    threads = []
    while threads or crawl_queue:
        # Rebuild the list rather than calling remove() while iterating it,
        # which skips the entry that follows every removed thread.
        threads = [t for t in threads if t.is_alive()]

        while len(threads) < max_threads and crawl_queue.peek():
            thread = threading.Thread(target=process_queue)
            thread.start()
            threads.append(thread)

        # Sleep once per supervision pass.  The original slept inside the
        # spawn loop, which delayed every thread start and left this outer
        # loop spinning at full speed while workers were busy.
        time.sleep(SLEEP_TIME)
Пример #7
0
def link_crawler(seed_url,
                 link_regex=None,
                 proxies=None,
                 delay=1,
                 max_depth=-1,
                 timeout=5,
                 max_thread=5,
                 sleep_time=1,
                 cache=None,
                 scraping_callback=None,
                 debug=False):
    """Crawl starting from ``seed_url`` using multiple worker threads.

    ``seed_url`` is pushed onto a ``MongoQueue`` (which is not cleared
    first) and up to ``max_thread`` workers drain it.

    NOTE(review): ``link_regex``, ``max_depth`` and ``cache`` are accepted
    but not used; the ``Downloader`` always gets a fresh ``MongoCache()``.

    Args:
        seed_url: First URL to crawl.
        proxies, delay, timeout, debug: Forwarded unchanged to
            ``Downloader``.
        max_thread: Maximum number of worker threads alive at once.
        sleep_time: Seconds the supervising loop sleeps between passes.
        scraping_callback: Optional callable ``(url, html)`` returning an
            iterable of links to enqueue; a falsy result means "no links".
    """
    crawl_queue = MongoQueue()
    crawl_queue.push(seed_url)
    d = Downloader(cache=MongoCache(),
                   delay=delay,
                   proxies=proxies,
                   timeout=timeout,
                   debug=debug)

    def thread_crawl():
        """Worker loop: pop and download URLs until KeyError is raised."""
        while True:
            url = None  # keeps the error message valid if pop() itself fails
            try:
                url = crawl_queue.pop()
                html = d(url)
            except KeyError:
                break
            except Exception as e:
                # Still best-effort (the URL is skipped, not completed),
                # but the failure is reported instead of silently dropped.
                print('Error downloading {}: {}'.format(url, e))
            else:
                # ``or []`` guards against a callback that returns None.
                links = (scraping_callback(url, html) or []) \
                    if scraping_callback else []
                for link in links:
                    crawl_queue.push(link)
                crawl_queue.complete(url)

    threads = []
    while threads or crawl_queue:
        # Rebuild the list rather than calling remove() while iterating it,
        # which skips the entry that follows every removed thread.
        threads = [t for t in threads if t.is_alive()]
        while len(threads) < max_thread and crawl_queue:
            t = threading.Thread(target=thread_crawl)
            t.daemon = True
            t.start()
            threads.append(t)
        time.sleep(sleep_time)
Пример #8
0
def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None, user_agent='wswp', proxies=None, num_retries=1, max_threads=10, timeout=60):
    """Crawl using multiple threads.

    A ``MongoQueue`` is cleared, seeded with ``seed_url`` and drained by up
    to ``max_threads`` threads.  The call returns once no worker is alive
    and the queue evaluates as false.

    Args:
        seed_url: First URL to crawl; also passed to ``normalize`` as the
            base for every link returned by ``scrape_callback``.
        delay, cache, user_agent, proxies, num_retries, timeout: Forwarded
            unchanged to ``Downloader``.
        scrape_callback: Optional callable ``(url, html)`` returning an
            iterable of links to enqueue; a falsy result means "no links".
        max_threads: Maximum number of worker threads alive at once.
    """
    # the queue of URLs that still need to be crawled
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url)
    D = Downloader(cache=cache, delay=delay, user_agent=user_agent, proxies=proxies, num_retries=num_retries, timeout=timeout)

    def process_queue():
        """Worker loop: pop URLs until ``pop()`` raises KeyError."""
        while True:
            try:
                url = crawl_queue.pop()
            except KeyError:
                # currently no urls to process
                break
            html = D(url)
            if scrape_callback:
                try:
                    links = scrape_callback(url, html) or []
                except Exception as e:
                    # Parenthesised so the line parses on Python 2 and 3;
                    # the printed text is unchanged.
                    print('Error in callback for: {}: {}'.format(url, e))
                else:
                    for link in links:
                        # add this new link to the queue
                        crawl_queue.push(normalize(seed_url, link))
            crawl_queue.complete(url)

    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:
        # Rebuild the list rather than calling remove() while iterating it,
        # which skips the entry that follows every removed thread.
        threads = [t for t in threads if t.is_alive()]
        while len(threads) < max_threads and crawl_queue.peek():
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            # daemon so the main thread can exit when it receives ctrl-c
            thread.daemon = True
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
Пример #9
0
def thread_crawler(seed_url, user_agent="wswp", headers=None, proxies=None,
                   num_retries=2, cache=None, scrape_callback=None,
                   max_threads_num=5):
    """Crawl web pages using multiple threads.

    ``seed_url`` is pushed onto a ``MongoQueue`` (which is not cleared
    first) and up to ``max_threads_num`` workers drain it.

    Args:
        seed_url: First URL to crawl; also passed to ``normalize`` as the
            base for every link returned by ``scrape_callback``.
        user_agent, headers, proxies, num_retries, cache: Passed
            positionally to ``Downloader`` after a leading ``1``.
        scrape_callback: Optional callable ``(url, html)`` returning an
            iterable of links to enqueue; a falsy result means "no links".
        max_threads_num: Maximum number of worker threads alive at once.
    """
    # Imported locally because only ``Thread`` is known to be in scope.
    from threading import current_thread

    crawl_queue = MongoQueue()
    crawl_queue.push(seed_url)

    D = Downloader(1, user_agent, headers, proxies, num_retries, cache)

    def process_task():
        """Worker loop: pop URLs until ``pop()`` raises KeyError."""
        while True:
            try:
                url = crawl_queue.pop()
            except KeyError:
                print("currentlt no urls to process")
                break
            # ``sys.thread_info.name`` is the name of the threading
            # implementation, not of the running thread.
            print("Downloading Thread name is ", current_thread().name)
            html = D(url)
            if scrape_callback:
                try:
                    # The original called the callback with no arguments,
                    # so the downloaded page was never handed to it.
                    links = scrape_callback(url, html) or []
                except Exception as e:
                    print("Error in callback for {}: {}".format(url, e))
                else:
                    for link in links:
                        crawl_queue.push(normalize(seed_url, link))
            crawl_queue.complete(url)

    threads = []
    while threads or crawl_queue:
        # the crawl is still alive; rebuild the list rather than calling
        # remove() while iterating it, which skips entries
        threads = [t for t in threads if t.is_alive()]
        while len(threads) < max_threads_num and crawl_queue:
            thread = Thread(target=process_task)
            thread.daemon = True
            thread.start()
            threads.append(thread)

        time.sleep(SLEEP_TIME)
Пример #10
0
def threaded_crawler(seed_url,
                     link_regex=None,
                     delay=1,
                     cache=None,
                     scrape_callback=None,
                     user_agent='Safari',
                     proxies=None,
                     num_retries=1,
                     max_threads=10,
                     timeout=60):
    """Multi-threaded crawler: several threads drain one Mongo-backed queue.

    A ``MongoQueue`` is cleared, seeded with ``seed_url`` and drained by up
    to ``max_threads`` threads.  URLs listed one per line in
    ``blocked_urls.txt`` under ``BASEDIR`` (if that file exists) are handed
    to the ``Downloader`` as ``blocked_urls``.

    Args:
        seed_url: First URL to crawl; also passed to ``normalize`` as the
            base for every discovered link.
        link_regex: Optional pattern; links from ``get_links(html)`` that
            ``re.match`` it are enqueued in addition to the callback's
            links.  Only applied when ``scrape_callback`` is set.
        delay, cache, user_agent, proxies, num_retries, timeout: Forwarded
            unchanged to ``Downloader``.
        scrape_callback: Optional callable ``(url, html)`` returning an
            iterable of links; a falsy result means "no links".
        max_threads: Maximum number of worker threads alive at once.
    """
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url)

    # blacklist of sites; read inside ``with`` so the handle is closed
    block_filename = os.path.join(BASEDIR, 'blocked_urls.txt')
    blocked_urls = []
    if os.path.isfile(block_filename):
        with open(block_filename) as block_file:
            blocked_urls = [line.strip() for line in block_file
                            if line.strip()]
    # save_cache=False is needed for testing
    D = Downloader(delay=delay,
                   user_agent=user_agent,
                   proxies=proxies,
                   num_retries=num_retries,
                   timeout=timeout,
                   cache=cache,
                   save_cache=False,
                   blocked_urls=blocked_urls)

    def process_queue():
        """Worker loop: pop URLs until the queue raises Index/KeyError."""
        while True:
            try:
                url = crawl_queue.pop()
            except (IndexError, KeyError):
                # stop once the queue is empty
                break
            html = D(url) if url else None
            if html and scrape_callback:
                try:
                    # Copy into a fresh list so extend() neither mutates
                    # the callback's own object nor fails on a tuple or
                    # generator result.
                    links = list(scrape_callback(url, html) or [])
                    if link_regex:
                        links.extend(link for link in get_links(html)
                                     if re.match(link_regex, link))
                except Exception as e:
                    # Parenthesised so the line parses on Python 2 and 3;
                    # the printed text is unchanged.
                    print('Error in callback for: {}: {}'.format(url, e))
                else:
                    for link in links:
                        # enqueue the normalized link
                        crawl_queue.push(normalize(seed_url, link))
            # marked complete whether or not the download produced html
            crawl_queue.complete(url)

    threads = []
    while threads or crawl_queue:
        # Rebuild the list rather than calling remove() while iterating it,
        # which skips the entry that follows every removed thread.
        threads = [t for t in threads if t.is_alive()]
        while len(threads) < max_threads and crawl_queue:
            thread = threading.Thread(target=process_queue)
            thread.daemon = True
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
Пример #11
0
        startday, endday = calendar.monthrange(year, month)
        datelist.append([str(year) + '-' + str(month) + '-' + str(1),\
         str(year) + '-' + str(month) + '-' + str(endday)])
# For each [start, end] date pair in datelist: run the search form in the
# browser, walk the result pages, push every result link onto crawl_queue
# and record the links in waiting.csv.
for date in datelist:
    # start from the URLs already recorded in waiting.csv
    returnlist = list(pd.read_csv(CONNECT_PATH + 'waiting.csv')['url'])
    # empty the three form fields before typing the new values
    browser.find_element_by_xpath(startkey).clear()
    browser.find_element_by_xpath(endkey).clear()
    browser.find_element_by_xpath(userkey).clear()
    browser.find_element_by_xpath(startkey).send_keys(date[0])
    browser.find_element_by_xpath(endkey).send_keys(date[1])
    # '公司' means "company"
    browser.find_element_by_xpath(userkey).send_keys('公司')
    browser.find_element_by_xpath(searchclick).click()
    html = browser.page_source
    # usere is presumably a regex find-all helper (pattern, text) -> list of
    # captured groups - TODO confirm against its definition
    retlist = usere('<a href="(default.aspx?.+?)" target', html)
    # make the captured hrefs absolute and undo the HTML-escaped ampersands
    retlist = ['http://www.landchina.com/' + i for i in retlist]
    retlist = [i.replace('&amp;', '&') for i in retlist]
    returnlist.extend(retlist)
    for i in retlist:
        crawl_queue.push(i)
    # page count taken from the "共N页" ("N pages in total") text
    # NOTE(review): '\d' sits in a non-raw string literal; newer Python 3
    # versions warn about this invalid escape sequence.
    pagenum = int(usere('共(\d+?)页', html)[0])
    # NOTE(review): "next page" is clicked pagenum times although the first
    # page was already scraped above - confirm the last click is valid.
    for i in range(pagenum):
        # '下页' is the "next page" link text
        browser.find_element_by_link_text('下页').click()
        html = browser.page_source
        retlist = usere('<a href="(default.aspx?.+?)" target', html)
        retlist = ['http://www.landchina.com/' + i for i in retlist]
        retlist = [i.replace('&amp;', '&') for i in retlist]
        # the inner loop reuses the name i; range() still drives the outer
        # iteration, so the page count is unaffected
        for i in retlist:
            crawl_queue.push(i)
        returnlist.extend(retlist)
        # NOTE(review): the CSV is rewritten inside the page loop, so it is
        # saved after every page but not at all when pagenum is 0.
        returndf = pd.DataFrame({'url': returnlist})
        returndf.to_csv(CONNECT_PATH + 'waiting.csv', index=False)