Example #1
def main(max_threads=5):
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    queue = MongoQueue()

    urls = []
    template = scrape_callback.seed_url[0:-2]
    for i in range(1, 1189, 1):
        urls.append(template + str(i) + '/')

    while True:
        now = datetime.now()
        if now.hour < 3 or now.hour > 12:
            queue.repairFast()
            process_crawler(
                urls,
                scrape_callback=scrape_callback,
                cache=cache,
                max_threads=max_threads,
                timeout=30,
                host=urlparse.urlparse(scrape_callback.seed_url).netloc,
                user_agent=
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36'
            )
        else:
            print 'pass:' + str(now)
            pass
        time.sleep(3600)
Example #2
def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None, user_agent='wswp', proxies=None, num_retries=1, max_threads=10, timeout=60):
    """Crawl this website in multiple threads
    """
    # the queue of URLs that still need to be crawled
    # crawl_queue = Queue.deque([seed_url])
    #  crawl_queue = [seed_url]
    crawl_queue = MongoQueue()
    crawl_queue.push(seed_url)
    # the URLs that have been seen
    seen = set([seed_url])
    D = Downloader(cache=cache, delay=delay, user_agent=user_agent, proxies=proxies, num_retries=num_retries, timeout=timeout)

    def process_queue():
        while True:
            try:
                url = crawl_queue.pop()
            except (KeyError, IndexError):
                # crawl queue is empty
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print 'Error in callback for: {}: {}'.format(url, e)
                    else:
                        for link in links:
                            link = normalize(seed_url, link)
                            # check whether already crawled this link
                            if link not in seen:
                                seen.add(link)
                                # add this new link to the queue
                                crawl_queue.push(link)
                                #  crawl_queue.append(link)
                # mark the popped URL as processed
                crawl_queue.complete(url)

    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:
        # the crawl is still active
        for thread in threads:
            if not thread.is_alive():
                # remove the stopped threads
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue:
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)
            # set daemon so main thread can exit when receives ctrl-c
            thread.start()
            threads.append(thread)
        # all threads have been processed
        # sleep temporarily so CPU can focus execution on other threads
        time.sleep(SLEEP_TIME)
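
Every example in this listing leans on the same MongoQueue helper, which is never shown. The following is only a minimal sketch of the interface the examples appear to assume (push, pop, complete, peek, clear, repair and truthiness); the collection layout, status values and timeout handling are assumptions, not the original implementation.

from datetime import datetime, timedelta
from pymongo import MongoClient, errors

class MongoQueue:
    # status values assumed from how the examples use the queue
    OUTSTANDING, PROCESSING, COMPLETE = range(3)

    def __init__(self, collection='crawl_queue', client=None, timeout=300):
        self.client = client or MongoClient()
        self.col = self.client.cache[collection]
        self.timeout = timeout

    def __bool__(self):
        """Truthy while any URL is still outstanding or being processed."""
        return self.col.find_one({'status': {'$ne': self.COMPLETE}}) is not None

    def push(self, url):
        """Add a URL unless it has already been queued or crawled."""
        try:
            self.col.insert_one({'_id': url, 'status': self.OUTSTANDING})
        except errors.DuplicateKeyError:
            pass

    def pop(self):
        """Atomically claim an outstanding URL; raise KeyError when none are left."""
        record = self.col.find_one_and_update(
            {'status': self.OUTSTANDING},
            {'$set': {'status': self.PROCESSING, 'timestamp': datetime.now()}})
        if record is None:
            raise KeyError('queue is empty')
        return record['_id']

    def peek(self):
        record = self.col.find_one({'status': self.OUTSTANDING})
        return record['_id'] if record else None

    def complete(self, url):
        self.col.update_one({'_id': url}, {'$set': {'status': self.COMPLETE}})

    def clear(self):
        self.col.delete_many({})

    def repair(self):
        """Release URLs whose processing claim has timed out back to OUTSTANDING."""
        stale = datetime.now() - timedelta(seconds=self.timeout)
        self.col.update_many(
            {'status': self.PROCESSING, 'timestamp': {'$lt': stale}},
            {'$set': {'status': self.OUTSTANDING}})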
Example #3
def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None,\
 user_agent='wswp', proxies=None, num_retries=1, max_threads=10, timeout=60):
    """Crawl using multiple threads
    """
    # the queue of URLs that still need to be crawled
    crawl_queue = MongoQueue()
    webpage_cache = MongoCache()
    # crawl_queue.clear()
    crawl_queue.push(seed_url)
    D = Downloader(delay=DEFAULT_DELAY, user_agent=DEFAULT_AGENT, proxies=DEFAULT_PROXY_LIST,
                   cookies=DEFAULT_COOKIE, num_retries=DEFAULT_RETRIES, timeout=DEFAULT_TIMEOUT,
                   opener=None, cache=MongoCache())

    def process_queue():
        while True:
            # keep track of the URL being processed
            try:
                url = crawl_queue.pop()
            except KeyError:
                # currently no urls to process
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print('Error in callback for: {}: {}'.format(url, e))
                    else:
                        for link in links:
                            # add this new link to queue
                            crawl_queue.push(normalize(seed_url, link))
            # retry on server errors (5xx) or the -999 failure code
            code = webpage_cache[url]['code']
            if 500 <= code < 600 or code == -999:
                crawl_queue.reset(url)
            else:
                crawl_queue.complete(url)

    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)  # set daemon so main thread can exit when it receives ctrl-c
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
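
Example #3 also reads webpage_cache[url]['code'] directly from a MongoCache, another helper that never appears in these listings. A dict-like wrapper along these lines would satisfy that usage; the collection name and record layout here are assumptions.

from datetime import datetime
from pymongo import MongoClient

class MongoCache:
    """Dict-like cache keyed by URL; values are whatever the downloader stores
    (the examples expect a dict such as {'html': ..., 'code': ...})."""

    def __init__(self, client=None):
        self.client = client or MongoClient()
        self.col = self.client.cache.webpage

    def __contains__(self, url):
        return self.col.find_one({'_id': url}) is not None

    def __getitem__(self, url):
        record = self.col.find_one({'_id': url})
        if record is None:
            raise KeyError(url + ' is not cached')
        return record['result']

    def __setitem__(self, url, result):
        self.col.update_one({'_id': url},
                            {'$set': {'result': result, 'timestamp': datetime.utcnow()}},
                            upsert=True)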
Example #4
def url_to_mongoqueue(url):
    """
    把URL写入MongoDB的队列了
    """
    crawl_queue = MongoQueue('crawl_queue')
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    all_a = soup.find('div', class_='all').find_all('a')
    for a in all_a:
        title = a.get_text()
        url = a['href']
        print(title)
        print(url)
        crawl_queue.push(url, title)
Example #5
def link_crawler(seed_url,
                 link_regex=None,
                 proxies=None,
                 delay=1,
                 max_depth=-1,
                 timeout=5,
                 max_thread=5,
                 sleep_time=1,
                 cache=None,
                 scraping_callback=None,
                 debug=False):
    crawl_queue = MongoQueue()
    crawl_queue.push(seed_url)
    d = Downloader(cache=MongoCache(),
                   delay=delay,
                   proxies=proxies,
                   timeout=timeout,
                   debug=debug)

    def thread_crawl():
        while True:
            try:
                url = crawl_queue.pop()
                html = d(url)
            except KeyError:
                break
            except Exception:
                pass
            else:
                links = scraping_callback(url, html) if scraping_callback else []
                for link in links:
                    crawl_queue.push(link)
                crawl_queue.complete(url)

    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_thread and crawl_queue:
            t = threading.Thread(target=thread_crawl)
            t.setDaemon(True)
            t.start()
            threads.append(t)
        time.sleep(sleep_time)
Example #6
def threaded_crawler(seed_url,
                     delay=5,
                     cache=None,
                     scrape_callback=None,
                     user_agent='wu_being',
                     proxies=None,
                     num_retries=1,
                     max_threads=10,
                     timeout=60):
    """Crawl using multiple threads
    """
    # the queue of URLs that still need to be crawled
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url)
    D = Downloader(cache=cache,
                   delay=delay,
                   user_agent=user_agent,
                   proxies=proxies,
                   num_retries=num_retries,
                   timeout=timeout)

    def process_queue():
        while True:
            # keep track of the URL being processed
            try:
                url = crawl_queue.pop()
            except KeyError:
                # currently no urls to process
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print 'Error in callback for: {}: {}'.format(url, e)
                    else:
                        for link in links:
                            # add this new link to the queue
                            crawl_queue.push(normalize(seed_url, link))
                crawl_queue.complete(url)

    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)  # set daemon so main thread can exit when it receives ctrl-c
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
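
The callable Downloader object (D = Downloader(...); html = D(url)) is likewise assumed everywhere. Below is a rough sketch of such a downloader: return a cached page when possible, otherwise throttle per domain and fetch with retries. The use of requests, the cached record shape and the -999 failure code mirror how the examples read results, but none of this is the original class.

import time
import requests
from urllib.parse import urlparse

class Downloader:
    def __init__(self, delay=5, user_agent='wswp', proxies=None,
                 num_retries=1, timeout=60, cache=None):
        self.delay = delay
        self.user_agent = user_agent
        self.proxies = proxies          # assumed to be a requests-style proxy dict
        self.num_retries = num_retries
        self.timeout = timeout
        self.cache = cache
        self.last_seen = {}             # per-domain timestamp used for throttling

    def __call__(self, url):
        if self.cache is not None:
            try:
                result = self.cache[url]
                if not (500 <= result['code'] < 600):
                    return result['html']   # cached and not a server error
            except KeyError:
                pass                        # not cached yet
        self._throttle(url)
        result = self._download(url)
        if self.cache is not None:
            self.cache[url] = result
        return result['html']

    def _throttle(self, url):
        domain = urlparse(url).netloc
        last = self.last_seen.get(domain)
        if last is not None:
            wait = self.delay - (time.time() - last)
            if wait > 0:
                time.sleep(wait)
        self.last_seen[domain] = time.time()

    def _download(self, url, retries=None):
        retries = self.num_retries if retries is None else retries
        try:
            resp = requests.get(url, headers={'User-Agent': self.user_agent},
                                proxies=self.proxies, timeout=self.timeout)
            html, code = resp.text, resp.status_code
            if 500 <= code < 600 and retries > 0:
                return self._download(url, retries - 1)
        except requests.RequestException:
            html, code = None, -999     # failure sentinel, as checked in Example #3
        return {'html': html, 'code': code}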
Example #7
def threaded_crawler(delay=5,
                     cache=None,
                     scrape_callback=None,
                     user_agent='wswp',
                     proxies=None,
                     num_retries=1,
                     max_threads=10,
                     timeout=60):
    """Crawl using multiple threads
    """
    # the queue of URLs that still need to be crawled
    urllist = MongoQueue()  # checks whether any record still has status 0, returning True or False

    def process_queue():
        while True:
            # keep track of the URL being processed

            try:
                url = urllist.pop()
                print('url', url)
                D = Download()
                D.Downloader(url)
            except KeyError:
                # currently no urls to process
                break

    # wait for all download threads to finish
    threads = []
    while threads or urllist:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        print(bool(urllist.peek()))
        if urllist.peek():
            while len(threads) < max_threads:
                # can start some more threads
                thread = threading.Thread(target=process_queue)
                thread.setDaemon(True)  # set daemon so main thread can exit when it receives ctrl-c
                thread.start()
                threads.append(thread)
        else:
            break
        time.sleep(SLEEP_TIME)
Example #8
    def __init__(self, max_urls=1000):
        self.max_urls = max_urls
        # http://m.biquge.biz/top/allvisit_1/
        # http://m.benbenwx.com/top/allvisit_1/
        # http://m.moliwenxue.com/top/allvisit_1/
        # http://m.boluoxs.com/top/allvisit_1/
        self.seed_url = 'http://m.boluoxs.com/top/allvisit_1/'
        self.queue = MongoQueue()
        self.book_data = BooKListDao()
Example #9
def thread_crawler(seed_url, user_agent="wswp", headers=None, proxies=None,
                   num_retries=2, cache=None, scrape_callback=None,
                   max_threads_num=5):
    """crawl webpage use multipe threads"""
    crawl_queue = MongoQueue()
    crawl_queue.push(seed_url)

    D = Downloader(1, user_agent, headers, proxies, num_retries, cache)

    def process_task():
        while True:
            try:
                url = crawl_queue.pop()
            except KeyError:
                print("currentlt no urls to process")
                break
            else:
                print("Downloading Thread name is ", sys.thread_info.name)
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print("Error in callback for {}: {}".format(url, e))
                    else:
                        for link in links:
                            link = normalize(seed_url, link)
                            crawl_queue.push(link)
                crawl_queue.complete(url)

    threads = []
    while threads or crawl_queue:
        # the crawl is still alive
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads_num and crawl_queue:
            thread = Thread(target=process_task)
            thread.setDaemon(True)
            thread.start()
            threads.append(thread)

        time.sleep(SLEEP_TIME)
Example #10
def thread_crawl(seed_url,
                 max_threads=10,
                 delay=5,
                 user_agent='Aurora-Twinkle',
                 proxies=None,
                 max_retries=1,
                 scrape_callback=None,
                 cache=None):
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url)
    D = Downloader(delay=delay,
                   user_agent=user_agent,
                   proxies=proxies,
                   max_retries=max_retries,
                   cache=cache)
    rp = get_robots(seed_url)

    def process_queue():
        while True:
            try:
                url = crawl_queue.pop()
            except IndexError:
                break
            else:
                if rp.can_fetch(user_agent, url):
                    html = D(url)
                    if scrape_callback:
                        try:
                            links = scrape_callback(url, html) or []
                        except Exception as e:
                            print("Error in callback for :{}:{}".format(
                                url, e))
                        else:
                            for link in links:
                                link = format_link(seed_url, link)
                                crawl_queue.push(link)
                    crawl_queue.complete(url)
                else:
                    print(
                        'user_agent: "' + user_agent +
                        '" Blocked by robots.txt:', url)

    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)

        while len(threads) < max_threads and crawl_queue.peek():
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)
            thread.start()
            threads.append(thread)

        time.sleep(SLEEP_TIME)
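
Example #10 gates every fetch through rp.can_fetch(), but the get_robots helper it calls is not shown. A plausible version built on the standard library's robot parser is sketched below; only the function name is taken from the example above.

from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser

def get_robots(seed_url):
    """Download and parse robots.txt for the seed URL's site."""
    rp = RobotFileParser()
    rp.set_url(urljoin(seed_url, '/robots.txt'))
    rp.read()
    return rp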
Example #11
def threaded_crawler(delay=5,
                     cache=None,
                     scrape_callback=None,
                     user_agent='wswp',
                     proxies=None,
                     num_retries=1,
                     max_threads=10,
                     timeout=60):
    """Crawl using multiple threads
    """
    # the queue of URLs that still need to be crawled
    urllist = MongoQueue()  # checks whether any record still has status 0, returning True or False
    D = Download()
    loop = asyncio.get_event_loop()
    while True:
        try:
            url = urllist.pop()
            print(url)
            tasks = [asyncio.ensure_future(D.Downloader(url))] * 10
            loop.run_until_complete(asyncio.wait(tasks))
            urllist.complete(url)
        except KeyError:
            break
Example #12
    def __init__(self, max_urls=1000):
        self.max_urls = max_urls
        #http://m.biquge.biz/top/allvisit_1/
        #http://m.benbenwx.com/top/allvisit_1/
        #http://m.moliwenxue.com/top/allvisit_1/
        #http://m.boluoxs.com/top/allvisit_1/
        #self.seed_url = 'http://m.junzige.la/top/allvisit_400/'

        self.urls = []


        # self.seed_url = 'http://www.junzige.la/'
        self.seed_url = 'http://www.boluoxs.com/'
        self.queue = MongoQueue()
        self.book_data = BooKCatlogDao()
Example #13
def threaded_crawler(seed_url,
                     delay=5,
                     cache=None,
                     scrape_callback=None,
                     user_agent='wswp',
                     proxies=None,
                     num_retries=1,
                     max_threads=10,
                     timeout=60):
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url)
    D = Download(cache=cache,
                 delay=delay,
                 user_agent=user_agent,
                 proxies=proxies,
                 num_retries=num_retries,
                 timeout=timeout)

    def process_queue():
        while True:
            try:
                url = crawl_queue.pop()
            except IndexError:
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print(f'Error in callback for:{url}:{e}')
                    else:
                        for link in links:
                            crawl_queue.push(normalize(seed_url, link))
                crawl_queue.complete(url)

    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)

        while len(threads) < max_threads and crawl_queue.peek():
            thread = threading.Thread(target=process_queue)
            thread.start()
            threads.append(thread)

        time.sleep(SLEEP_TIME)
Example #14
def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None, user_agent='wswp', proxies=None, num_retries=1, max_threads=10, timeout=60):
    """Crawl using multiple threads
    """
    # the queue of URLs that still need to be crawled
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url)
    D = Downloader(cache=cache, delay=delay, user_agent=user_agent, proxies=proxies, num_retries=num_retries, timeout=timeout)

    def process_queue():
        while True:
            # keep track of the URL being processed
            try:
                url = crawl_queue.pop()
            except KeyError:
                # currently no urls to process
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print 'Error in callback for: {}: {}'.format(url, e)
                    else:
                        for link in links:
                            # add this new link to queue
                            crawl_queue.push(normalize(seed_url, link))
                crawl_queue.complete(url)


    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True) # set daemon so main thread can exit when receives ctrl-c
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
Example #15
def main(max_threads=5):
    catlog_callback = AlexaCallback()
    cache = MongoCache()
    queue = MongoQueue()


    client = MongoClient('localhost', 27017, connect=False)
    # the collection that stores cached webpages,
    # which is the equivalent of a table in a relational database
    db = client.cache
    cursor = db.books.find()

    urls = []
    while cursor.alive:
        temp = cursor.next()
        temp = temp['link']

        if urlparse.urlparse(catlog_callback.seed_url).netloc == 'www.junzige.la':
            temp = '/novel' + temp[5:-4] + '/'
            temp = normalize(catlog_callback.seed_url, temp)
        elif urlparse.urlparse(catlog_callback.seed_url).netloc == 'www.boluoxs.com':
            temp = 'http://www.boluoxs.com/biquge/0/' + temp[temp.rfind('/') + 1 :temp.rfind('.')] + '/'

        print temp
        urls.append(temp)

    print urls[0]

    while True:
        now = datetime.now()

        if now.hour < 3 or now.hour > 12:
            queue.repairFast()
            process_crawler(
                urls,
                scrape_callback=catlog_callback,
                cache=cache,
                max_threads=max_threads,
                timeout=30,
                host=urlparse.urlparse(catlog_callback.seed_url).netloc,
                user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36'
            )
            # every time finished, clear the job queue
            queue.clear()
        else:
            print 'pass:' + str(now)
            pass
        time.sleep(3600)
Example #16
import pandas as pd
import re
import numpy as np
from process_crawler import process_crawler
from mongo_queue import MongoQueue
from mongo_cache import MongoCache
from mongo_info import MongoInfo
from downloader import Downloader
from lxml import etree

crawl_queue = MongoQueue()
webpage_cache = MongoCache()
DEFAULT_AGENT = {}
DEFAULT_DELAY = 5
DEFAULT_RETRIES = 1
DEFAULT_TIMEOUT = 100
DEFAULT_PROXY_LIST = '/Users/apple/Desktop/connect/proxylist/proxies.csv'
DEFAULT_COOKIE = {}

D = Downloader(delay=DEFAULT_DELAY, user_agent=DEFAULT_AGENT, proxies=DEFAULT_PROXY_LIST,
               cookies=DEFAULT_COOKIE, num_retries=DEFAULT_RETRIES, timeout=DEFAULT_TIMEOUT,
               opener=None, cache=MongoCache())

def usere(regex, getcontent):  # helper that applies a regular expression and returns all matches
    pattern = re.compile(regex)
    content = re.findall(pattern, getcontent)
    return content

#Obtain target urls
startdate = '20180414'
enddate = '20180415'
Example #17
import pandas as pd
import re
from process_crawler import process_crawler
from mongo_queue import MongoQueue
from mongo_cache import MongoCache
from mongo_info import MongoInfo
from downloader import Downloader

crawl_queue = MongoQueue()
crawl_queue.turn_down()
webpage_cache = MongoCache()
DEFAULT_AGENT = {}
DEFAULT_DELAY = 5
DEFAULT_RETRIES = 1
DEFAULT_TIMEOUT = 100
DEFAULT_PROXY_LIST = '/Users/apple/Desktop/connect/proxylist/proxies.csv'
DEFAULT_COOKIE = {}

D = Downloader(delay=DEFAULT_DELAY, user_agent=DEFAULT_AGENT, proxies=DEFAULT_PROXY_LIST,
               cookies=DEFAULT_COOKIE, num_retries=DEFAULT_RETRIES, timeout=DEFAULT_TIMEOUT,
               opener=None, cache=MongoCache())


def usere(regex, getcontent):  # helper that applies a regular expression and returns all matches
    pattern = re.compile(regex)
    content = re.findall(pattern, getcontent)
    return content


#Clear Cache
# crawl_queue.clear()
Example #18
class Mzitu_crawler():
    current_dir = r'E:\sunchengquan\PycharmProjects\mzitu'  # os.path.dirname(__file__)
    crawl_queue = MongoQueue('crawl_queue')
    img_queue = MongoQueue('img_queue')
    max_threads = 16
    sleep_time = 1

    def url_open(self, url, headers={}):
        """使用代理IP打开链接"""

        response = ""
        while response == "":
            try:
                print("代理ip:", self.proxy)
                response = get_page(url,
                                    proxies=self.proxy,
                                    timeout=30,
                                    options=headers)
                return response
            except Exception:
                self.proxy = MongoClient().random()
                continue

    def pageurl_crawler(self, lock):
        while 1:
            try:
                url = self.crawl_queue.pop()
                print(url)
            except KeyError:
                print('queue is empty')
                break
            else:
                img_urls = {}
                title = self.crawl_queue.pop_title(url)
                title = re.sub('[?,。;:、,.;:?!!·]', '', title)
                self.mkdir(title)
                response = requests.get(url)
                web_title = BeautifulSoup(response.text,
                                          'lxml').find('title').get_text()
                if '妹子图' in web_title:
                    max_span = BeautifulSoup(response.text, 'lxml').find(
                        'div',
                        class_='pagenavi').find_all('span')[-2].get_text()
                    lock.acquire()
                    path = self.current_dir + '\\' + title
                    for page in range(1, int(max_span) + 1):
                        page_url = url + '/' + str(page)
                        img_url = BeautifulSoup(
                            requests.get(page_url).text, 'lxml').find(
                                'div', class_='main-image').find('img')['src']
                        img_urls[img_url] = page_url
                        self.save(img_url, page_url, path)
                    self.crawl_queue.complete(url)
                    self.img_queue.push_imgurl(title, img_urls)
                    lock.release()

    def mkdir(self, path):
        path = path.strip()
        isExists = os.path.exists(os.path.join(self.current_dir, path))
        if not isExists:
            print('created a folder named', path)
            os.makedirs(os.path.join(self.current_dir, path))
            return True
        else:
            print('a folder named', path, 'already exists')
            return False

    def save(self, img_url, page_url, path):
        name = img_url[-9:-4]
        print('saving:', img_url)
        header = {'Referer': page_url}
        img = self.url_open(img_url, headers=header)
        content = img.content
        time.sleep(0.5)
        with open(path + '\\' + name + '.jpg', 'wb') as f:
            f.write(content)

    def thread_crawler(self):
        threads = []
        while threads or self.crawl_queue:
            """
            这儿crawl_queue用上了,就是我们__bool__函数的作用,为真则代表我们MongoDB队列里面还有数据
            threads 或者 crawl_queue为真都代表我们还没下载完成,程序就会继续执行
            """
            for thread in threads:
                if not thread.is_alive():  # is_alive() tells whether the thread is still running; remove finished threads
                    threads.remove(thread)
            while len(threads) < self.max_threads:  # pool has fewer than max_threads threads
                lock = threading.Lock()
                thread = threading.Thread(target=self.pageurl_crawler,
                                          args=(lock, ))  # create the worker thread
                thread.setDaemon(True)  # make it a daemon thread
                thread.start()  # start the thread
                threads.append(thread)  # add it to the thread pool
            time.sleep(self.sleep_time)
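
A hypothetical driver for Example #18, assuming the crawl_queue collection has already been seeded (for instance with something like url_to_mongoqueue from Example #4):

if __name__ == '__main__':
    crawler = Mzitu_crawler()
    crawler.thread_crawler()  # runs worker threads until the MongoDB queue drains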
Example #19
# dateclick = '//*[@id="TAB_QueryConditionItem291"]'
# dyrclick = '//*[@id="TAB_queryTextItem_82"]'
# dyr = '//*[@id="TAB_queryTextItem_82"]'
# tdytclick = '//*[@id="TAB_QueryConditionItem282"]'
# tdyt = '//*[@id="TAB_queryTblEnumItem_282"]'
# searchkey = '//*[@id="TAB_QueryButtonControl"]'

# XPath keys for the transfer query form
startkey = '//*[@id="TAB_queryDateItem_277_1"]'
endkey = '//*[@id="TAB_queryDateItem_277_2"]'
dateclick = '//*[@id="TAB_QueryConditionItem277"]'
userkey = '//*[@id="TAB_queryTextItem_275"]'
userclick = '//*[@id="TAB_QueryConditionItem275"]'
searchclick = '//*[@id="TAB_QueryButtonControl"]'

crawl_queue = MongoQueue()
browser = webdriver.Chrome()
browser.get(url)
browser.find_element_by_xpath(dateclick).click()
browser.find_element_by_xpath(userclick).click()

datelist = []
for year in range(2009, 2017):
    for month in range(1, 13):
        _, endday = calendar.monthrange(year, month)  # monthrange returns (first weekday, days in month)
        datelist.append([str(year) + '-' + str(month) + '-' + str(1),
                         str(year) + '-' + str(month) + '-' + str(endday)])
for date in datelist:
    returnlist = list(pd.read_csv(CONNECT_PATH + 'waiting.csv')['url'])
    browser.find_element_by_xpath(startkey).clear()
    browser.find_element_by_xpath(endkey).clear()
Example #20
def threaded_crawler(seed_url,
                     link_regex=None,
                     delay=1,
                     cache=None,
                     scrape_callback=None,
                     user_agent='Safari',
                     proxies=None,
                     num_retries=1,
                     max_threads=10,
                     timeout=60):
    """
    多线程爬虫
    多个线程处理一个队列
    使用mongo作为队列
    """
    # crawl_queue = [seed_url]
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url)

    # seen = set([seed_url])

    # blacklisted sites
    block_filename = os.path.join(BASEDIR, 'blocked_urls.txt')
    blocked_urls = [i.strip() for i in open(block_filename) if i.strip()] \
        if os.path.isfile(block_filename) else []
    # save_cache=False is only needed for testing
    D = Downloader(delay=delay,
                   user_agent=user_agent,
                   proxies=proxies,
                   num_retries=num_retries,
                   timeout=timeout,
                   cache=cache,
                   save_cache=False,
                   blocked_urls=blocked_urls)

    def process_queue():
        while 1:
            try:
                url = crawl_queue.pop()
            except (IndexError, KeyError):
                # stop when the queue is empty
                break
            else:
                html = D(url) if url else None
                if html and scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                        if link_regex:
                            links.extend(link for link in get_links(html)
                                         if re.match(link_regex, link))

                    except Exception as e:
                        print 'Error in callback for: {}: {}'.format(url, e)
                    else:
                        for link in links:
                            link = normalize(seed_url, link)
                            crawl_queue.push(link)  # enqueue
                            # if link not in seen:
                            #     seen.add(link)
                # print html
                # if html:
                #     # mark as complete
                #     crawl_queue.complete(url)
                crawl_queue.complete(url)

    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue:
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
Example #21
File: scrape.py  Project: jinqihh/kg
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/525.10  (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 0.5; en-us) AppleWebKit/522  (KHTML, like Gecko) Safari/419.3",
    "Mozilla/5.0 (Linux; U; Android 1.1; en-gb; dream) AppleWebKit/525.10  (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Droid Build/ESD20) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Sprint APA9292KT Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; ADR6300 Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-ca; GT-P1000M Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 3.0.1; fr-fr; A500 Build/HRI66) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/525.10  (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 1.6; es-es; SonyEricssonX10i Build/R1FA016) AppleWebKit/528.5  (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 1.6; en-us; SonyEricssonX10i Build/R1AA056) AppleWebKit/528.5  (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
]
headers2 = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
headers = [headers1, headers2]
baike_queue = MongoQueue("scrawler", "baidubaike")
start_url = ["https://baike.baidu.com/item/%E5%88%98%E5%BE%B7%E5%8D%8E/114923",
        "https://baike.baidu.com/item/%E4%B8%AD%E5%9B%BD/1122445",
        "https://baike.baidu.com/item/%E9%98%BF%E5%B0%94%E4%BC%AF%E7%89%B9%C2%B7%E7%88%B1%E5%9B%A0%E6%96%AF%E5%9D%A6/127535?fromtitle=%E7%88%B1%E5%9B%A0%E6%96%AF%E5%9D%A6&fromid=122624&fr=aladdin",
        "https://baike.baidu.com/item/%E6%95%B0%E5%AD%A6/107037?fr=aladdin",
        "https://baike.baidu.com/item/%E4%BD%93%E8%82%B2",
        "https://baike.baidu.com/item/%E7%BE%8E%E9%A3%9F",
        "https://baike.baidu.com/item/%E5%8C%BB%E5%AD%A6",
        "https://baike.baidu.com/item/%E7%A7%91%E5%AD%A6%E6%8A%80%E6%9C%AF?fromtitle=%E7%A7%91%E6%8A%80&fromid=662906",
        "https://baike.baidu.com/item/%E8%8B%B9%E6%9E%9C/5670"
        ]

adapter = requests.adapters.HTTPAdapter(max_retries=20)
def start():
    if baike_queue.db.count() == 0:
        for url in start_url: