Example #1
def mzitu_crawler(max_threads=10):
    crawl_queue = MogoQueue('meinvxiezhenji', 'crawl_queue')

    def pageurl_crawler():
        while True:
            try:
                url = crawl_queue.pop()
                print(url)
            except KeyError:
                print("队列没有数据")
            else:
                img_urls = []
                req = request.get(url, 3).text
                title = crawl_queue.pop_title(url)
                mkdir(title)
                os.chdir("D:\mzitu\\" + title)
                max_span = BeautifulSoup(req, "lxml").find("div", class_="pagenavi").find_all("span")[-2].get_text()
                for page in range(1, int(max_span)+1):
                    page_url = url + "/" +str(page)
                    img_url = BeautifulSoup(request.get(page_url, 3).text, "lxml").find("div", class_="main-image").find("img")["src"]
                    img_urls.append(img_url)
                    save(img_url)
                crawl_queue.complete(url)


    def save(img_url):
        name = img_url[-9:-4]
        print(u"开始保存:", img_url)
        img = request.get(img_url)
        f = open(name + ".jpg", "ab")
        f.write(img.content)
        f.close()

    def mkdir(path):
        path = path.strip()
        isExists = os.path.exists(os.path.join("D:\mzitu\\", path))
        if not isExists:
            print(u"Created a folder named", path)
            os.makedirs(os.path.join("D:\mzitu\\", path))
            return True
        else:
            print(u"已存在名字叫做", path)
            return False

    threads = []
    while threads or crawl_queue:
        """
        这儿crawl_queue用上了,就是我们__bool__函数的作用,为真则代表我们MongoDB队列里面还有数据
        threads 或者 crawl_queue为真都代表我们还没下载完成,程序就会继续执行
        """

        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():
            thread = threading.Thread(target=pageurl_crawler)
            thread.setDaemon(True)  ## mark as a daemon thread
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
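
All of these examples rely on a MogoQueue class, a request helper module, and a SLEEP_TIME constant that are defined elsewhere and not shown on this page. The sketch below is only a guess at the interface implied by the calls above (pop, peek, pop_title, complete, push_imgurl, __bool__, and request.get(url, retries)); the field names, status codes, and retry behaviour are assumptions, and some examples expect pop() to return a (url, name) tuple rather than a bare URL.

# Hypothetical sketch of the shared dependencies; every name below is inferred, not taken from the real project.
import requests
from pymongo import MongoClient

SLEEP_TIME = 1  # assumed pause between iterations of the pool-management loop

class MogoQueue:
    OUTSTANDING, PROCESSING, COMPLETE = 1, 2, 3  # assumed status values

    def __init__(self, db, collection):
        self.collection = MongoClient()[db][collection]

    def __bool__(self):
        # Truthy while any record is not yet COMPLETE; this is what
        # "while threads or crawl_queue:" relies on.
        return self.collection.find_one({'status': {'$ne': self.COMPLETE}}) is not None

    def peek(self):
        record = self.collection.find_one({'status': self.OUTSTANDING})
        return record['_id'] if record else None

    def pop(self):
        # Atomically claim one OUTSTANDING record and mark it PROCESSING;
        # raise KeyError when nothing is left, which ends the worker loop.
        record = self.collection.find_one_and_update(
            {'status': self.OUTSTANDING}, {'$set': {'status': self.PROCESSING}})
        if record is None:
            raise KeyError('queue is empty')
        return record['_id']

    def pop_title(self, url):
        record = self.collection.find_one({'_id': url})
        return record.get('title', '') if record else ''  # field name is an assumption

    def complete(self, url):
        self.collection.update_one({'_id': url}, {'$set': {'status': self.COMPLETE}})

    def push_imgurl(self, title, img_urls):
        self.collection.insert_one({'title': title, 'img_urls': img_urls, 'status': self.OUTSTANDING})

# request.get(url, retries) is assumed to wrap requests.get with simple retry handling.
def get(url, retries=3, referer=None):
    headers = {'Referer': referer} if referer else None
    try:
        return requests.get(url, timeout=10, headers=headers)
    except requests.RequestException:
        if retries > 0:
            return get(url, retries - 1, referer=referer)
        raise
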
Example #2
def mzitu_crawler(max_threads=10):
	crawl_queue = MogoQueue('name', 'crawl_queue')
	#img_queue = MogoQueue('name', 'img_queue')
	def pageurl_crawler():
		while True:
			try:
				url = crawl_queue.pop()
				print url
			except KeyError:
				print 'The queue is empty'
				break
			else:
				img_urls = []
				req = request.get(url, 3).text
				title = crawl_queue.pop_title(url)
				mkdir(title)
				os.chdir('D:\mzitu\\'+title)
				max_span = BeautifulSoup(req, 'lxml').find('div', class_='pagenavi').find_all('span')[-2].get_text()
				for page in range(1, int(max_span)+1):
					page_url = url + '/' + str(page)
					img_url = BeautifulSoup(request.get(page_url, 3).text,'lxml').find('div', class_='main-image').find('img')['src']
					img_urls.append(img_url)
					save(img_url)
				crawl_queue.complete(url) 
				#img_queue.push_imgurl(title, img_urls)
				#print '插入数据库成功'

	def save(img_url):
		name = img_url[-9:-4]
		print u'Saving:', img_url
		img = request.get(img_url, 3)
		f = open(name + '.jpg', 'ab')
		f.write(img.content)
		f.close()

	def mkdir(path):
		path = path.strip()
		isExists = os.path.exists(os.path.join('D:\mzitu',path))
		if not isExists:
			print u'Created a folder named', path
			os.makedirs(os.path.join('D:\mzitu', path))
			return True
		else:
			print u'A folder named', path, u'already exists'
			return False

	threads = []
	while threads or crawl_queue:
		for thread in threads:
			if not thread.is_alive(): # is_alive tells whether the thread is still running; finished threads are removed from the list
				threads.remove(thread)
		while len(threads) < max_threads and crawl_queue.peek():
			thread = threading.Thread(target=pageurl_crawler) # create the worker thread
			thread.setDaemon(True) # mark as a daemon thread
			thread.start() # start the thread
			threads.append(thread) # add it to the thread list
		time.sleep(SLEEP_TIME)
Example #3
def E_Hen_crawler(max_threads=5):
    img_queue = MogoQueue('meinv', 'img_queue')

    def pageurl_crawler():
        while True:
            try:
                (url, name) = img_queue.pop()
                print(url)
            except KeyError:
                print('The queue is empty')
                break
            else:
                title = img_queue.pop_title_(url)
                path = str(title).replace('?', '')
                mkdir(path)
                os.chdir('C:\Data\\' + path)
                html = request.get(url, 3)
                html_soup = BeautifulSoup(html.text, 'lxml')
                img = html_soup.find('div', id='i3').find('img')
                img_url = img['src']
                print(u'Got the image link')
                save(img_url, name)
                img_queue.complete(url)

    def save(img_url, page_name):
        name = page_name
        print(u'Saving:', img_url, '\n')
        img = request.get(img_url, 15)
        f = open(name + '.jpg', 'ab')
        f.write(img.content)
        f.close()

    def mkdir(path):
        path = path.strip()
        isExists = os.path.exists(os.path.join("C:\Data", path))
        if not isExists:
            print(u'Created a folder named', path)
            os.makedirs(os.path.join("C:\Data", path))
            return True
        else:
            print(u'A folder named', path, u'already exists!')
            return False

    threads = []
    while threads or img_queue:
        for thread in threads:
            if not thread.is_alive():  ## remove threads that have finished running
                threads.remove(thread)
        while len(threads) < max_threads and img_queue.peek():  ## while the pool has fewer than max_threads workers and the queue still has items
            thread = threading.Thread(target=pageurl_crawler)  ## create the worker thread
            thread.setDaemon(True)  ## mark as a daemon thread
            thread.start()  ## start the thread
            threads.append(thread)  ## add it to the thread list
        time.sleep(SLEEP_TIME)
Example #4
def mzitu_crawler(max_threads=10):
	crawl_queue = MogoQueue('meinvxiezhenji', 'crawl_queue')  # MogoQueue (from the mogodb_queue module); this is the queue of page URLs to crawl
	img_queue = MogoQueue('meinvxiezhenji', 'img_queue')  ## the queue of actual image URLs

	# crawl the gallery pages
	def pageurl_crawler():
		while True:
			"""
			try....except...else(用于捕获异常)的语法:
			try:
			<语句>        #运行别的代码
			except <名字>:
			<语句>        #如果在try部份引发了'name'异常
			except <名字> as <数据>:
			<语句>        #如果引发了'name'异常,获得附加的数据
			else:
			<语句>        #如果没有异常发生

			"""
			try:
				url = crawl_queue.pop()  # pop() from MogoQueue: fetch one URL that is waiting to be crawled and mark it as being processed
				print(url)
			except KeyError:
				print('The queue is empty')
				break
			else:
				lock = threading.Lock()  # NOTE: this lock is created per iteration, so it does not actually serialize the worker threads
				lock.acquire()
				img_urls = []  # list that will collect the image addresses
				req = request.get(url, 3).text  # request the page to crawl
				max_page = BeautifulSoup(req, 'lxml').find('div', class_='pagenavi').find_all('span')[-2].get_text()  # get the largest page number
				title = crawl_queue.pop_title(url)  # fetch the gallery title
				mkdir(title)  # create a folder named after the title (mkdir is defined below)
				os.chdir('F:\image\mzitu\\' + title)  # switch into the folder just created
				for page in range(1, int(max_page) + 1):
					page_url = url + '/' + str(page)  # build the URL of each picture page in the set
					img_dict = BeautifulSoup(request.get(page_url, 3).text, 'lxml').find('div', {'class': 'main-image'}).find('img')
					# print(img_dict)
					if img_dict is not None:
						img_url = img_dict['src']  # the real image address, used for the download below
					else:
						print(u'img_url not found *******************')
						continue  # skip this page so img_url is never used while unbound
					img_url_reg = re.compile('http://.*?\.jpg', re.S)  # regex that filters valid image links
					if re.match(img_url_reg, img_url):
						img_urls.append(img_url)  # keep the valid image address
						save(img_url)  # save the valid image (save is defined below)
					else:
						print(u'The image link is not valid!!!!!!!!!!!!!!')
				lock.release()  # release the lock
				crawl_queue.complete(url)  # mark the URL as complete
				img_queue.push_imgurl(title, img_urls)  # push the valid image addresses into the image queue
				print('Inserted into the database successfully')

	def save(img_url):
		name = img_url[-9:-4]  # derive the file name from the URL
		print(u'Saving:', img_url)
		img = request.get(img_url, 3)  # request the image
		f = open(name + '.jpg', 'ab')  # open a .jpg file in binary append mode
		f.write(img.content)  # write the bytes
		f.close()  # close the file

	def mkdir(path):
		path = path.strip()  # strip whitespace
		isExists = os.path.exists(os.path.join('F:\image\mzitu', path))  # check whether the path already exists
		if not isExists:
			print(u'Created a folder named', path)
			os.makedirs(os.path.join('F:\image\mzitu', path))  # create the folder
			return True
		else:
			print(u'The folder already exists!')
			return False

	threads = []  # list that will hold the worker threads
	while threads or crawl_queue:
		"""
		crawl_queue is used here via its __bool__ method: truthy means the MongoDB queue still has records.
		threads or crawl_queue being truthy means the download is not finished, so the loop keeps running.
		"""
		for thread in threads:
			# is_alive tells whether the thread is still active; inactive threads are removed from the list
			if not thread.is_alive():
				threads.remove(thread)
		# spawn new workers while the pool holds fewer than max_threads threads and crawl_queue still has OUTSTANDING records
		# peek() returns the _id (URL) of a record whose status is OUTSTANDING, if any
		while len(threads) < max_threads and crawl_queue.peek():
			thread = threading.Thread(target=pageurl_crawler)  ## create a worker thread that runs pageurl_crawler
			thread.setDaemon(True)  ## mark as a daemon thread
			thread.start()  ## start the thread
			threads.append(thread)  ## add it to the thread list
		time.sleep(SLEEP_TIME)
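
The docstring in the example above describes try/except/else in the abstract. A tiny standalone illustration of the pattern these crawlers use (KeyError ends the loop, the else branch does the actual work), with an ordinary dict standing in for the MongoDB-backed queue:

# Illustrative only: a plain dict plays the role of the crawl queue.
queue = {'http://example.com/1': 'title-1', 'http://example.com/2': 'title-2'}

while True:
    try:
        url, title = queue.popitem()  # raises KeyError once the dict is empty
    except KeyError:
        print('The queue is empty')
        break
    else:
        print('processing', url, title)  # runs only when popitem() succeeded
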
Example #5
def mzitu_crawler(max_threads=10):
    crawl_queue = MogoQueue('meinvxiezhenji', 'crawl_queue') ## the queue of page URLs to crawl
    img_queue = MogoQueue('meinvxiezhenji', 'img_queue') ## the queue of actual image URLs
    def pageurl_crawler():
        L.acquire()  # L is assumed to be a module-level threading.Lock()
        while True:
            try:
                url = crawl_queue.pop()
                print(url)
            except KeyError:
                print('The queue is empty')
                break
            else:
                img_urls = []
                req = request.get(url, 3).text
                title = crawl_queue.pop_title(url)
                path = str(title).replace('?', '')  ## during testing one title was found to contain a question mark
                path = re.sub(r'[?\\*|“<>:/]', '', str(path))
                mkdir(path)
                os.chdir('E:\图片\mzitu\\' + path)
                max_span = BeautifulSoup(req, 'lxml').find('div', class_='pagenavi').find_all('span')[-2].get_text()
                for page in range(1, int(max_span) + 1):
                    page_url = url + '/' + str(page)
                    img_html = request.get(page_url, 3)  ## changed a bit here (notice that self is gone?)
                    img_url = BeautifulSoup(img_html.text, 'lxml').find('div', class_='main-image').find('img')['src']
                    name = BeautifulSoup(img_html.text, 'lxml').find('h2', class_='main-title').get_text()
                    name = re.sub(r'[?\\*|“<>:/]', '', str(name))
                    img_urls.append(img_url)
                    print(u'Saving:', img_url, name)
                    img = request.get(img_url, 3, referer=page_url)
                    f = open(name + '.jpg', 'ab')
                    f.write(img.content)
                    f.close()
                crawl_queue.complete(url) ## mark as complete
                img_queue.push_imgurl(title, img_urls)
                print('Inserted into the database successfully')
        L.release()


    def mkdir(path):
        path = path.strip()
        path = re.sub(r'[?\\*|“<>:/]', '', str(path))
        isExists = os.path.exists(os.path.join("E:\图片\mzitu", path))
        if not isExists:
            print(u'Created a folder named', path)
            os.makedirs(os.path.join("E:\图片\mzitu", path))
            return True
        else:
            print(u'A folder named', path, u'already exists!')
            return False

    threads = []
    while threads or crawl_queue:
        """
        这儿crawl_queue用上了,就是我们__bool__函数的作用,为真则代表我们MongoDB队列里面还有数据
        threads 或者 crawl_queue为真都代表我们还没下载完成,程序就会继续执行
        """
        for thread in threads:
            if not thread.is_alive(): ## remove threads that have finished running
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek(): ## while the pool has fewer than max_threads workers and crawl_queue still has items
            thread = threading.Thread(target=pageurl_crawler) ## create the worker thread
            thread.setDaemon(True) ## mark as a daemon thread
            thread.start() ## start the thread
            threads.append(thread) ## add it to the thread list
        time.sleep(SLEEP_TIME)
Example #6
def E_Hen_crawler(max_threads=5):
    img_queue = MogoQueue('meinv', 'xiumm')
    def pageurl_crawler():
        while True:
            try:
                (url,name) = img_queue.pop()
                print(url)
            except KeyError:
                print('The queue is empty')
                break
            else:
                lock.acquire()  # lock is assumed to be a module-level threading.Lock()
                title = img_queue.pop_title_(url)
                path = str(title).replace('?', '')
                mkdir(path)
                os.chdir('E:\E-Hen\\' + path)
                response = request.get(url, 3)
                response.encoding = 'utf-8'
                Soup = BeautifulSoup(response.text, 'lxml')
                all_url = Soup.find('div', class_='gallary_wrap').find_all('td')
                max_span = Soup.find('div', class_='paginator').find_all('a')
                for td in all_url:
                    href = s + td.img['src']  # s is assumed to be the site's base URL, defined at module level
                    name = td.img['alt'].strip()[-3:]
                    save(href, name)
                for page in max_span:
                    page_url = s + page['href']
                    html = request.get(page_url, 3)
                    Soup = BeautifulSoup(html.text, 'lxml')
                    all_td = Soup.find('div', class_='gallary_wrap').find_all('td')
                    for td2 in all_td:
                        href2 = s + td2.img['src']
                        name2 = td2.img['alt'].strip()[-3:]
                        save(href2, name2)
                img_queue.complete(url)
                lock.release()

    def save(img_url,page_name):
        name = page_name
        print(u'Saving:', img_url, '\n')
        img = request.get(img_url, 15)
        f = open(name + '.jpg', 'ab')
        f.write(img.content)
        f.close()

    def mkdir(path):
        path = path.strip()
        isExists = os.path.exists(os.path.join("E:\E-Hen", path))
        if not isExists:
            print(u'Created a folder named', path)
            os.makedirs(os.path.join("E:\E-Hen", path))
            return True
        else:
            print(u'A folder named', path, u'already exists!')
            return False

    threads = []
    while threads or img_queue:
        for thread in threads:
            if not thread.is_alive(): ## remove threads that have finished running
                threads.remove(thread)
        while len(threads) < max_threads and img_queue.peek(): ## while the pool has fewer than max_threads workers and the queue still has items
            thread = threading.Thread(target=pageurl_crawler) ## create the worker thread
            thread.setDaemon(True) ## mark as a daemon thread
            thread.start() ## start the thread
            threads.append(thread) ## add it to the thread list
        time.sleep(SLEEP_TIME)
Example #7
def mypics_crawler(max_threads=10):
    crawl_queue = MogoQueue('mypics_db', 'crawl_queue')  ## the queue of page URLs to crawl
    img_queue = MogoQueue('mypics_db', 'img_queue')  ## the queue of actual image URLs

    def pageurl_crawler():
        while True:
            try:
                url = crawl_queue.pop()
                print(url)
            except KeyError:
                print('The queue is empty')
                break
            else:
                img_urls = []
                req = request.get(url, 3).text
                title = crawl_queue.pop_title(url)
                path = str(title).replace('?', '')  ## during testing one title was found to contain a question mark
                mkdir(path)
                os.chdir('D:\my_pics\\' + path)
                # find the largest picture page number
                max_span = BeautifulSoup(req, 'lxml').find(
                    'div', class_='pages').find_all('a')[0].get_text()
                max_span = re.findall(r"\d+\.?\d*", max_span)
                print(max_span)
                # save every picture
                for page in range(2, int(max_span[0]) + 1):  # page numbering starts at 2; the first page has no number, so skip it
                    page_url = url.rstrip('.html') + '_' + str(page) + '.html'  # the URL of each page
                    print('Picture page URL:', page_url)
                    # the image address on each page
                    img_url = BeautifulSoup(request.get(page_url, 3).text, 'lxml').find('div', class_='big-pic').find('img')['src']
                    img_urls.append(img_url)

                    lock.acquire()  # lock is assumed to be a module-level threading.Lock()
                    os.chdir('D:\my_pics\\' + path)
                    save(img_url)
                    lock.release()

                crawl_queue.complete(url)  ## mark as complete
                img_queue.push_imgurl(title, img_urls)
                print('Image URLs for this set inserted into the database successfully')

    def save(img_url):
        name = img_url[-9:-4]
        print(u'Saving:', img_url)
        img = request.get(img_url, 3)
        f = open(name + '.jpg', 'ab')
        f.write(img.content)
        f.close()

    def mkdir(path):
        path = path.strip()
        isExists = os.path.exists(os.path.join("D:\my_pics", path))
        if not isExists:
            print(u'Created a folder named', path)
            os.makedirs(os.path.join("D:\my_pics", path))
            return True
        else:
            print(u'A folder named', path, u'already exists!')
            return False

    threads = []
    while threads or crawl_queue:
        """
        这儿crawl_queue用上了,就是我们__bool__函数的作用,为真则代表我们MongoDB队列里面还有数据
        threads 或者 crawl_queue为真都代表我们还没下载完成,程序就会继续执行
        """
        for thread in threads:
            if not thread.is_alive():  ## remove threads that have finished running
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():  ## while the pool has fewer than max_threads workers and the queue still has items
            thread = threading.Thread(target=pageurl_crawler)  ## create the worker thread
            thread.setDaemon(True)  ## mark as a daemon thread
            thread.start()  ## start the thread
            threads.append(thread)  ## add it to the thread list
        time.sleep(SLEEP_TIME)
Example #8
def E_Hen_crawler(max_threads=4):
    jinji = MogoQueue('meinv', 'jinji')

    def pageurl_crawler():
        while True:
            try:
                (url, name) = jinji.pop()
                print(url)
            except KeyError:
                print('The queue is empty')
                break
            else:
                lock.acquire()  # lock is assumed to be a module-level threading.Lock()
                img_urls = []
                html = request.get(url, 10)
                title = jinji.pop_title_(url)
                mkdir(title)
                os.chdir('E:\E-Hen\\' + title)
                html.encoding = html.apparent_encoding
                html_soup = BeautifulSoup(html.text, 'lxml')
                max_span = html_soup.find_all('table')[1].find_all(
                    'option')[-1].get_text()[2:4]
                for page in range(1, int(max_span) + 1):
                    if page < 10:
                        page_url = url[:-6] + str(page) + '.html'
                    else:
                        page_url = url[:-7] + str(page) + '.html'
                    page_html = request.get(page_url, 10)
                    page_html.encoding = page_html.apparent_encoding
                    pattern = re.compile(
                        '<a href=".*?"><img src="(.*?)" border="0".*?oncontextmenu=.*?'
                    )
                    img_url = re.findall(pattern, page_html.text)[0]
                    img_urls.append(img_url)
                    print(u'Got the image link')
                    save(img_url)
                jinji.complete(url)
                lock.release()

    def save(img_url):
        name = img_url[-7:]
        print(u'Saving:', img_url, '\n')
        img = request.get(img_url, 15)
        f = open(name, 'ab')
        f.write(img.content)
        f.close()

    def mkdir(path):
        path = path.strip()
        isExists = os.path.exists(os.path.join("E:\E-Hen", path))
        if not isExists:
            print(u'Created a folder named', path)
            os.makedirs(os.path.join("E:\E-Hen", path))
            return True
        else:
            print(u'A folder named', path, u'already exists!')
            return False

    threads = []
    while threads or jinji:
        for thread in threads:
            if not thread.is_alive():  ## remove threads that have finished running
                threads.remove(thread)
        while len(threads) < max_threads and jinji.peek():  ## while the pool has fewer than max_threads workers and the queue still has items
            thread = threading.Thread(target=pageurl_crawler)  ## create the worker thread
            thread.setDaemon(True)  ## mark as a daemon thread
            thread.start()  ## start the thread
            threads.append(thread)  ## add it to the thread list
        time.sleep(SLEEP_TIME)
Example #9
def mzitu_crawler(max_threads=10):
    crawl_queue = MogoQueue('meinvxiezhenji', 'crawl_queue')

    def pageurl_crawl(lock):
        while True:
            try:
                url = crawl_queue.pop()
                print(url)
            except KeyError:
                print('The queue is empty')
                break
            else:
                img_urls = []
                req = request.get(url, 3).text
                title = crawl_queue.pop_title(url)
                mkdir(title)
                with lock:
                    os.chdir('F:\mzitu\\' + title)
                    max_page = BeautifulSoup(req, 'lxml').find('div', class_='pagenavi').find_all('span')[-2].get_text()
                    for page in range(1, int(max_page) + 1):
                        page_url = url + '/' + str(page)
                        img_url = BeautifulSoup(request.get(page_url, 3).text, 'lxml').find('div', {'class': 'main-image'}).find('img')['src']
                        img_urls.append(img_url)
                        save(img_url)
                    crawl_queue.complete(url)  # mark the URL as complete so the outer while loop can eventually finish

    def save(img_url):
        name = img_url[-9:-4]
        print(u'Saving:', img_url)
        img = request.get(img_url, 3)
        f = open(name + '.jpg', 'ab')
        f.write(img.content)
        f.close()

    def mkdir(path):
        path = path.strip()
        isExists = os.path.exists(os.path.join('F:\mzitu', path))
        if not isExists:
            print(u'Created a folder named', path)
            os.makedirs(os.path.join('F:\mzitu', path))
            return True
        else:
            print(u'The folder already exists!')
            return False

    lock = threading.Lock()  # shared lock handed to every worker thread below
    threads = []
    while threads or crawl_queue:
        """
		这儿用到了crawl_queue,就是__bool__函数的作用,为真则代表mongo数列里还有数据
		threads 或者 crawl_queue为真都代表还没下载完,程序继续执行
		"""
        for thread in threads:
            # is_alive tells whether the thread is still running; finished threads are removed from the list
            if not thread.is_alive():
                threads.remove(thread)
        # spawn new workers while the pool has fewer than max_threads threads and crawl_queue still has items
        while len(threads) < max_threads and crawl_queue.peek():
            thread = threading.Thread(target=pageurl_crawl, args=(lock,))  ## create the worker thread, passing in the shared lock
            thread.setDaemon(True)  ## mark as a daemon thread
            thread.start()  ## start the thread
            threads.append(thread)  ## add it to the thread list
        time.sleep(SLEEP_TIME)
Example #10
def E_Hen_crawler(max_threads=5):
    crawl_queue = MogoQueue('meinv', 'crawl_queue') ## the queue of page URLs to crawl
    img_queue = MogoQueue('meinv', 'img_queue')
    def pageurl_crawler():
        while True:
            try:
                url = crawl_queue.pop()
                print(url)
            except KeyError:
                print('The queue is empty')
                break
            else:
                title = crawl_queue.pop_title(url)
                path = str(title).replace('?', '') ## during testing one title was found to contain a question mark
                mkdir(path)
                os.chdir('E:\Cover\\' + path)
                html = request.get(url, 3)
                Soup = BeautifulSoup(html.text, 'lxml')
                all_a = Soup.find('div', id='gdt').find_all('a')
                for a in all_a:
                    href = a['href']
                    name = a.img['alt']
                    html = request.get(href, 3)
                    html_soup = BeautifulSoup(html.text, 'lxml')
                    img = html_soup.find('div', id='i3').find('img')
                    img_url = img['src']
                    img_queue.push_imgurl(title,img_url)
                    print(u'Got the image link')
                    save(img_url, name)
                    img_queue.complete(img_url)
                crawl_queue.complete(url)

    def save(img_url, page_name):
        name = page_name
        print(u'Saving:', img_url, '\n')
        img = request.get(img_url, 3)
        f = open(name + '.jpg', 'ab')
        f.write(img.content)
        f.close()

    def mkdir(path):
        path = path.strip()
        isExists = os.path.exists(os.path.join("E:\Cover", path))
        if not isExists:
            print(u'Created a folder named', path)
            os.makedirs(os.path.join("E:\Cover", path))
            return True
        else:
            print(u'A folder named', path, u'already exists!')
            return False

    threads = []
    while threads or crawl_queue:
        """
        这儿crawl_queue用上了,就是我们__bool__函数的作用,为真则代表我们MongoDB队列里面还有数据
        threads 或者 crawl_queue为真都代表我们还没下载完成,程序就会继续执行
        """
        for thread in threads:
            if not thread.is_alive(): ## remove threads that have finished running
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek(): ## while the pool has fewer than max_threads workers and the queue still has items
            thread = threading.Thread(target=pageurl_crawler) ## create the worker thread
            thread.setDaemon(True) ## mark as a daemon thread
            thread.start() ## start the thread
            threads.append(thread) ## add it to the thread list
        time.sleep(SLEEP_TIME)