def mzitu_crawler(max_threads=10):
    crawl_queue = MogoQueue('meinvxiezhenji', 'crawl_queue')

    def pageurl_crawler():
        while True:
            try:
                url = crawl_queue.pop()
                print(url)
            except KeyError:
                print("队列没有数据")
                break  # queue is empty, stop this worker
            else:
                img_urls = []
                req = request.get(url, 3).text
                title = crawl_queue.pop_title(url)
                mkdir(title)
                os.chdir("D:\mzitu\\" + title)
                max_span = BeautifulSoup(req, "lxml").find("div", class_="pagenavi").find_all("span")[-2].get_text()
                for page in range(1, int(max_span)+1):
                    page_url = url + "/" +str(page)
                    img_url = BeautifulSoup(request.get(page_url).text, "lxml").find("div", class_="main-image").find("img")["src"]
                    img_urls.append(img_url)
                    save(img_url)
                crawl_queue.complete(url)


    def save(img_url):
        name = img_url[-9:-4]
        print(u"开始保存:", img_url)
        img = request.get(img_url)
        f = open(name + ".jpg", "ab")
        f.write(img.content)
        f.close()

    def mkdir(path):
        path = path.strip()
        isExists = os.path.exists(os.path.join("D:\mzitu\\", path))
        if not isExists:
            print(u"建了一个名字叫做", path, u"的文件夹!")
            os.makedirs(os.path.join("D:\mzitu\\", path))
            return True
        else:
            print(u"已存在名字叫做", path)
            return False

    threads = []
    while threads or crawl_queue:
        """
        这儿crawl_queue用上了,就是我们__bool__函数的作用,为真则代表我们MongoDB队列里面还有数据
        threads 或者 crawl_queue为真都代表我们还没下载完成,程序就会继续执行
        """

        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():
            thread = threading.Thread(target=pageurl_crawler)
            thread.setDaemon(True)  ## mark the worker as a daemon thread
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
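Every example on this page imports a MogoQueue class from a mongodb_queue module that is never shown. For reference, here is a minimal sketch of what such a MongoDB-backed queue could look like, inferred only from the calls above (pop raising KeyError when empty, peek, pop_title, complete, push, clear, and a __bool__ that reports pending work). The collection layout, status values and connection settings are assumptions, and several examples use variants with extra fields (push with three arguments, pop returning a tuple, pop_title_, push_imgurl).

# Hypothetical sketch of the MogoQueue class the examples import from mongodb_queue.
# The schema below is an assumption inferred from how the examples call it.
from datetime import datetime, timezone
from pymongo import MongoClient, errors


class MogoQueue:
    OUTSTANDING = 1   # pushed, waiting to be crawled
    PROCESSING = 2    # claimed by a worker
    COMPLETE = 3      # finished

    def __init__(self, db, collection):
        self.client = MongoClient('localhost', 27017)
        self.collection = self.client[db][collection]

    def __bool__(self):
        # Truthy while any record is not yet COMPLETE; this is what keeps the
        # `while threads or crawl_queue:` loops on this page running.
        return self.collection.find_one({'status': {'$ne': self.COMPLETE}}) is not None

    def push(self, url, title):
        try:
            self.collection.insert_one({'_id': url, 'status': self.OUTSTANDING, 'title': title})
        except errors.DuplicateKeyError:
            pass  # the URL is already queued

    def pop(self):
        # Atomically claim one OUTSTANDING record; raise KeyError when nothing
        # is pending, which is exactly what the workers above catch.
        record = self.collection.find_one_and_update(
            {'status': self.OUTSTANDING},
            {'$set': {'status': self.PROCESSING,
                      'timestamp': datetime.now(timezone.utc)}})
        if record is None:
            raise KeyError()
        return record['_id']

    def pop_title(self, url):
        return self.collection.find_one({'_id': url})['title']

    def peek(self):
        # Return the URL of one OUTSTANDING record, or None when nothing is pending.
        record = self.collection.find_one({'status': self.OUTSTANDING})
        return record['_id'] if record else None

    def complete(self, url):
        self.collection.update_one({'_id': url}, {'$set': {'status': self.COMPLETE}})

    def clear(self):
        self.collection.drop()

A layout like this also explains the inner spawning condition: peek() reports whether an OUTSTANDING URL is still available before another worker thread is started.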
Example #2
def startDown(url,rule,num,start,decoding=None):
    if not decoding:
        decoding='utf8'
    #req=urllib.request.urlopen(url)
    #response= request.get(url, 3)
    #body=response.text #req.read().decode(decoding)
    
    print('file='+url)
    f = open(url)
    body = f.read()
    f.close()
    debs = body.split('\n')

    #rule=re.compile(rule)
    #debs=rule.findall(body)
    crawl_queue = MogoQueue('cetc15-apt', 'crawl_queue')   
    #crawl_queue.clear() # CCCCC
    for l in debs:
        l = l.strip()
        if len(l) == 0 or not l.startswith(PREFIX_STR):
            continue
        print('deb:' + l[start:])
        #TODO: get the sha1
        crawl_queue.push(l[start:], 'a')
    for i in range(num):
        d=download(crawl_queue)
        d.start()
Example #3
def mzitu_crawler(max_threads=10):
	crawl_queue = MogoQueue('name', 'crawl_queue')
	#img_queue = MogoQueue('name', 'img_queue')
	def pageurl_crawler():
		while True:
			try:
				url = crawl_queue.pop()
				print url
			except KeyError:
				print '队列没有数据'
				break
			else:
				img_urls = []
				req = request.get(url, 3).text
				title = crawl_queue.pop_title(url)
				mkdir(title)
				os.chdir('D:\\mzitu\\' + title)
				max_span = BeautifulSoup(req, 'lxml').find('div', class_='pagenavi').find_all('span')[-2].get_text()
				for page in range(1, int(max_span)+1):
					page_url = url + '/' + str(page)
					img_url = BeautifulSoup(request.get(page_url, 3).text,'lxml').find('div', class_='main-image').find('img')['src']
					img_urls.append(img_url)
					save(img_url)
				crawl_queue.complete(url) 
				#img_queue.push_imgurl(title, img_urls)
				#print '插入数据库成功'

	def save(img_url):
		name = img_url[-9:-4]
		print u'开始保存:', img_url
		img = request.get(img_url, 3)
		f = open(name + '.jpg', 'ab')
		f.write(img.content)
		f.close()

	def mkdir(path):
		path = path.strip()
		isExists = os.path.exists(os.path.join('D:\mzitu',path))
		if not isExists:
			print u'建立一个名字叫做', path, u'的文件夹!'
			os.makedirs(os.path.join('D:\mzitu', path))
			return True
		else:
			print u'名字叫做', path, u'的文件夹已经存在了'
			return False

	threads = []
	while threads or crawl_queue:
		for thread in threads:
			if not thread.is_alive(): # is_alive tells whether the thread is still running; drop finished threads from the pool
				threads.remove(thread)
		while len(threads) < max_threads and crawl_queue.peek():
			thread = threading.Thread(target=pageurl_crawler) # create a worker thread
			thread.setDaemon(True) # mark it as a daemon thread
			thread.start() # start the thread
			threads.append(thread) # add it to the thread pool
		time.sleep(SLEEP_TIME)
Example #4
def E_Hen_crawler(max_threads=5):
    img_queue = MogoQueue('meinv', 'img_queue')

    def pageurl_crawler():
        while True:
            try:
                (url, name) = img_queue.pop()
                print(url)
            except KeyError:
                print('队列没有数据')
                break
            else:
                title = img_queue.pop_title_(url)
                path = str(title).replace('?', '')
                mkdir(path)
                os.chdir('C:\Data\\' + path)
                html = request.get(url, 3)
                html_soup = BeautifulSoup(html.text, 'lxml')
                img = html_soup.find('div', id='i3').find('img')
                img_url = img['src']
                print(u'得到图片的链接')
                save(img_url, name)
                img_queue.complete(url)

    def save(img_url, page_name):
        name = page_name
        print(u'开始保存:', img_url, '\n')
        img = request.get(img_url, 15)
        f = open(name + '.jpg', 'ab')
        f.write(img.content)
        f.close()

    def mkdir(path):
        path = path.strip()
        isExists = os.path.exists(os.path.join("C:\Data", path))
        if not isExists:
            print(u'建了一个名字叫做', path, u'的文件夹!')
            os.makedirs(os.path.join("C:\Data", path))
            return True
        else:
            print(u'名字叫做', path, u'的文件夹已经存在了!')
            return False

    threads = []
    while threads or img_queue:
        for thread in threads:
            if not thread.is_alive():  ##drop threads that have finished from the pool
                threads.remove(thread)
        while len(threads) < max_threads and img_queue.peek():  ##while the pool has fewer than max_threads workers and URLs are still pending
            thread = threading.Thread(target=pageurl_crawler)  ##create a worker thread
            thread.setDaemon(True)  ##mark it as a daemon thread
            thread.start()  ##start the thread
            threads.append(thread)  ##add it to the thread pool
        time.sleep(SLEEP_TIME)
Example #5
def start(url):
    response = request.get(url, 3)
    Soup = BeautifulSoup(response.text, 'lxml')
    title = Soup.find('div', class_='gm').find('h1', id='gj').get_text()
    spider_queue = MogoQueue('meinv', 'img_queue')
    spider_queue.clear()
    print(u'清除集合img_queue')
    spider_queue = MogoQueue('meinv', 'img_queue')
    print(u'新建集合img_queue')
    max_span = Soup.find('table', class_='ptt').find_all('td')[-2].get_text()
    page_url = url
    for i in range(1,int(max_span)+1):
        html = request.get(page_url, 3)
        Soup = BeautifulSoup(html.text, 'lxml')
        all_a = Soup.find('div', id='gdt').find_all('a')
        for a in all_a:
            href = a['href']
            name = a.img['alt']
            spider_queue.push(href, title, name)
        page_url = url + '?p=' + str(i)
Example #6
# -*- coding: utf-8 -*-
#import urllib2
#import urllib
from mongodb_queue import MogoQueue
from Download import request
from bs4 import BeautifulSoup
'''
def getHtml():
	page = urllib.urlopen("http://www.chinalawedu.com/falvfagui/")
	html = page.read()
	reg=r'class="fenlei_txt"'
	soup=BeautifulSoup(html,"lxml")
'''

law_queue = MogoQueue('falvfagui', 'title_queue')


def start(url):
    response = request.get(url, 3)
    soup = BeautifulSoup(response.text, 'html.parser')
    #print soup.prettify().encode('utf-8')
    all_div = soup.find_all('div', class_="fenlei_txt")

    #law_queue.push('lianjie','ok')

    for div in all_div:
        #print div.prettify()
        all_a = div.find_all('a')
        for a in all_a:
            title = a.get_text()
            url = a['href']
Example #7
# encoding=utf-8

from Download import request
from mongodb_queue import MogoQueue
from bs4 import BeautifulSoup

spider_queue = MogoQueue('name', 'crawl_queue')
def start(url):
	response = request.get(url, 3)
	Soup = BeautifulSoup(response.text, 'lxml')
	all_a = Soup.find('div', class_='all').find_all('a')
	for a in all_a:
		title = a.get_text()
		url = a['href']
		spider_queue.push(url, title)

if __name__ == '__main__':
	start('http://www.mzitu.com/all')
Example #8
from downloader import request
from mongodb_queue import MogoQueue
from bs4 import BeautifulSoup

spider_queue = MogoQueue("meinvxiezhenji", "crawl_queue")


def start(url):
    response = request.get(url)
    Soup = BeautifulSoup(response.text, "lxml")
    all_a = Soup.find("div", class_="all").find("ul").find_all("a")
    for a in all_a:
        title = a.get_text()
        url = a["href"]
        spider_queue.push(url, title)
    """上面这个调用就是把Url写入MongoDB的队列"""


if __name__ == "__main__":
    start("http://www.mzitu.com/all")
Example #9
from Download import request
from mongodb_queue import MogoQueue
from bs4 import BeautifulSoup

spider_queue = MogoQueue('picture', 'jinji')


def start(url):
    response = request.get(url, 3)
    response.encoding = response.apparent_encoding
    Soup = BeautifulSoup(response.text, 'lxml')
    all_a = Soup.find_all('fieldset', id='info')[1].find_all('a')
    for a in all_a:
        title = a.get_text().strip()
        url = 'http://www.cartoonmad.com' + a['href']
        print(title, url)
        spider_queue.push(url, title, 1)


if __name__ == "__main__":
    start('http://www.cartoonmad.com/comic/1221.html')
Example #10
def mzitu_crawler(max_threads=10):
    crawl_queue = MogoQueue('meinvxiezhenji','crawl_queue') ##this is the queue of album URLs to crawl
    img_queue = MogoQueue('meinvxiezhenji','img_queue') ##this is the queue of the actual image URLs
    def pageurl_crawler():
        L.acquire()
        while True:
            try:
                url = crawl_queue.pop()
                print(url)
            except KeyError:
                print('队列没有数据')
                break
            else:
                img_urls = []
                req = request.get(url, 3).text
                title = crawl_queue.pop_title(url)
                path = str(title).replace('?', '')##during testing one title turned out to contain a question mark
                path = re.sub(r'[?\\*|“<>:/]', '', str(path))
                mkdir(path)
                os.chdir('E:\图片\mzitu\\' + path)
                max_span = BeautifulSoup(req, 'lxml').find('div', class_='pagenavi').find_all('span')[-2].get_text()
                for page in range(1, int(max_span) + 1):
                    page_url = url + '/' + str(page)
                    img_html = request.get(page_url, 3)  ##changed here a little (notice that self is gone?)
                    img_url = BeautifulSoup(img_html.text, 'lxml').find('div', class_='main-image').find('img')['src']
                    name = BeautifulSoup(img_html.text, 'lxml').find('h2', class_='main-title').get_text()
                    name = re.sub(r'[?\\*|“<>:/]', '', str(name))
                    img_urls.append(img_url)
                    print(u'开始保存:', img_url,name)
                    img = request.get(img_url, 3, referer=page_url)
                    f = open(name + '.jpg', 'ab')
                    f.write(img.content)
                    f.close()
                crawl_queue.complete(url) ##mark the URL as complete
                img_queue.push_imgurl(title, img_urls)
                print('插入数据库成功')
        L.release()


    def mkdir(path):
        path = path.strip()
        path = re.sub(r'[?\\*|“<>:/]', '', str(path))
        isExists = os.path.exists(os.path.join("E:\图片\mzitu", path))
        if not isExists:
            print(u'建了一个名字叫做', path, u'的文件夹!')
            os.makedirs(os.path.join("E:\图片\mzitu", path))
            return True
        else:
            print(u'名字叫做', path, u'的文件夹已经存在了!')
            return False

    threads = []
    while threads or crawl_queue:
        """
        这儿crawl_queue用上了,就是我们__bool__函数的作用,为真则代表我们MongoDB队列里面还有数据
        threads 或者 crawl_queue为真都代表我们还没下载完成,程序就会继续执行
        """
        for thread in threads:
            if not thread.is_alive(): ##is_alive是判断是否为空,不是空则在队列中删掉
                threads.remove(thread)
        while len(threads) < max_threads or crawl_queue.peek(): ##线程池中的线程少于max_threads 或者 crawl_qeue时
            thread = threading.Thread(target=pageurl_crawler) ##创建线程
            thread.setDaemon(True) ##设置守护线程
            thread.start() ##启动线程
            threads.append(thread) ##添加进线程队列
        time.sleep(SLEEP_TIME)
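The request object imported from Download (or downloader) is another helper that never appears on this page. The callers pass a retry count as the second positional argument and, in example #10, a referer keyword, so a plausible sketch built on the requests library might be the following; the retry strategy and header set are assumptions, not the original module.

# Hypothetical sketch of the Download.request helper used throughout these
# examples, inferred from calls like request.get(url, 3) and
# request.get(img_url, 3, referer=page_url).
import requests


class Download:
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
        }

    def get(self, url, retries=3, referer=None, timeout=10):
        headers = dict(self.headers)
        if referer:
            headers['Referer'] = referer  # some image hosts check the Referer header
        try:
            return requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException:
            if retries > 0:
                return self.get(url, retries - 1, referer=referer, timeout=timeout)
            raise


request = Download()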
Example #11
def mypics_crawler(max_threads=10):
    crawl_queue = MogoQueue('mypics_db', 'crawl_queue')  ##this is the queue of gallery URLs to crawl
    img_queue = MogoQueue('mypics_db', 'img_queue')  ##this is the queue of the actual image URLs

    def pageurl_crawler():
        while True:
            try:
                url = crawl_queue.pop()
                print(url)
            except KeyError:
                print('队列没有数据')
                break
            else:
                img_urls = []
                req = request.get(url, 3).text
                title = crawl_queue.pop_title(url)
                path = str(title).replace('?', '')  ##during testing one title turned out to contain a question mark
                mkdir(path)
                os.chdir('D:\my_pics\\' + path)
                #find the highest picture page number
                max_span = BeautifulSoup(req, 'lxml').find(
                    'div', class_='pages').find_all('a')[0].get_text()
                max_span = re.findall(r"\d+\.?\d*", max_span)
                print(max_span)
                #save every picture
                for page in range(2, int(max_span[0]) + 1):  #page numbers start at 2; the first page has no number, so skip it
                    page_url = url.rstrip('.html') + '_' + str(page) + '.html'  #URL of each picture page
                    print('图片网址:', page_url)
                    #the image address on each picture page
                    img_url = BeautifulSoup(request.get(page_url, 3).text, 'lxml').find(
                        'div', class_='big-pic').find('img')['src']
                    img_urls.append(img_url)

                    lock.acquire()
                    os.chdir('D:\my_pics\\' + path)
                    save(img_url)
                    lock.release()

                crawl_queue.complete(url)  ##mark the URL as complete
                img_queue.push_imgurl(title, img_urls)
                print('合集图片urls插入数据库成功')

    def save(img_url):
        name = img_url[-9:-4]
        print(u'开始保存:', img_url)
        img = request.get(img_url, 3)
        f = open(name + '.jpg', 'ab')
        f.write(img.content)
        f.close()

    def mkdir(path):
        path = path.strip()
        isExists = os.path.exists(os.path.join("D:\my_pics", path))
        if not isExists:
            print(u'建了一个名字叫做', path, u'的文件夹!')
            os.makedirs(os.path.join("D:\my_pics", path))
            return True
        else:
            print(u'名字叫做', path, u'的文件夹已经存在了!')
            return False

    threads = []
    while threads or crawl_queue:
        """
        这儿crawl_queue用上了,就是我们__bool__函数的作用,为真则代表我们MongoDB队列里面还有数据
        threads 或者 crawl_queue为真都代表我们还没下载完成,程序就会继续执行
        """
        for thread in threads:
            if not thread.is_alive():  ##is_alive是判断是否为空,不是空则在队列中删掉
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek(
        ):  ##线程池中的线程少于max_threads 或者 crawl_qeue时
            thread = threading.Thread(target=pageurl_crawler)  ##创建线程
            thread.setDaemon(True)  ##设置守护线程
            thread.start()  ##启动线程
            threads.append(thread)  ##添加进线程队列
        time.sleep(SLEEP_TIME)
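Several of the worker functions above acquire a lock (called lock, or L in example #10) and every pool loop sleeps for SLEEP_TIME, yet neither name is defined in any snippet. They are presumably module-level globals; a minimal set of assumed definitions that would let these snippets run is sketched below (the values are guesses).

# Assumed module-level setup for the threaded crawlers; the names come from the
# snippets above, the values are assumptions.
import os
import re
import time
import threading

from bs4 import BeautifulSoup
from Download import request        # the retry wrapper sketched earlier
from mongodb_queue import MogoQueue

SLEEP_TIME = 1                      # seconds between pool-management passes
lock = threading.Lock()             # shared lock serializing the chdir/save sections
L = lock                            # example #10 refers to the same lock as L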
Example #12
from Download import request
from mongodb_queue import MogoQueue
from bs4 import BeautifulSoup

spider_queue = MogoQueue('mypics_db', 'crawl_queue')


def start(url):
    response = request.get(url, 3)
    Soup = BeautifulSoup(response.text, 'lxml')
    all_div = Soup.find_all('div', class_='item masonry_brick masonry-brick')
    for div in all_div:
        a = div.find_all('a')
        title = a[0]['href'][-10:-1]  #a[1].get_text()
        url = a[0]['href']
        spider_queue.push(url, title)
    """上面这个调用就是把URL写入MongoDB的队列了"""


if __name__ == "__main__":
    start('http://www.mmonly.cc/gqbz/mnbz/')
    #spider_queue.clear()
"""这一段儿就不解释了哦!超级简单的"""
Example #13
def E_Hen_crawler(max_threads=4):
    jinji = MogoQueue('meinv', 'jinji')

    def pageurl_crawler():
        while True:
            try:
                (url, name) = jinji.pop()
                print(url)
            except KeyError:
                print('队列没有数据')
                break
            else:
                lock.acquire()
                img_urls = []
                html = request.get(url, 10)
                title = jinji.pop_title_(url)
                mkdir(title)
                os.chdir('E:\E-Hen\\' + title)
                html.encoding = html.apparent_encoding
                html_soup = BeautifulSoup(html.text, 'lxml')
                max_span = html_soup.find_all('table')[1].find_all(
                    'option')[-1].get_text()[2:4]
                for page in range(1, int(max_span) + 1):
                    if page < 10:
                        page_url = url[:-6] + str(page) + '.html'
                    else:
                        page_url = url[:-7] + str(page) + '.html'
                    page_html = request.get(page_url, 10)
                    page_html.encoding = page_html.apparent_encoding
                    pattern = re.compile(
                        '<a href=".*?"><img src="(.*?)" border="0".*?oncontextmenu=.*?'
                    )
                    img_url = re.findall(pattern, page_html.text)[0]
                    img_urls.append(img_url)
                    print(u'得到图片的链接')
                    save(img_url)
                jinji.complete(url)
                lock.release()

    def save(img_url):
        name = img_url[-7:]
        print(u'开始保存:', img_url, '\n')
        img = request.get(img_url, 15)
        f = open(name, 'ab')
        f.write(img.content)
        f.close()

    def mkdir(path):
        path = path.strip()
        isExists = os.path.exists(os.path.join("E:\E-Hen", path))
        if not isExists:
            print(u'建了一个名字叫做', path, u'的文件夹!')
            os.makedirs(os.path.join("E:\E-Hen", path))
            return True
        else:
            print(u'名字叫做', path, u'的文件夹已经存在了!')
            return False

    threads = []
    while threads or jinji:
        for thread in threads:
            if not thread.is_alive():  ##drop threads that have finished from the pool
                threads.remove(thread)
        while len(threads) < max_threads and jinji.peek():  ##while the pool has fewer than max_threads workers and URLs are still pending
            thread = threading.Thread(target=pageurl_crawler)  ##create a worker thread
            thread.setDaemon(True)  ##mark it as a daemon thread
            thread.start()  ##start the thread
            threads.append(thread)  ##add it to the thread pool
        time.sleep(SLEEP_TIME)
Example #14
def mzitu_crawler(max_threads=10):
    crawl_queue = MogoQueue('meinvxiezhenji', 'crawl_queue')
    lock = threading.Lock()  # shared lock handed to each worker (assumed; the original snippet never defines it)

    def pageurl_crawl(lock):
        while True:
            try:
                url = crawl_queue.pop()
                print(url)
            except KeyError:
                print('队列没有数据')
                break
            else:
                img_urls = []
                req = request.get(url, 3).text
                title = crawl_queue.pop_title(url)
                mkdir(title)
                with lock:
                    os.chdir('F:\mzitu\\' + title)
                    max_page = BeautifulSoup(req, 'lxml').find(
                        'div',
                        class_='pagenavi').find_all('span')[-2].get_text()
                    for page in range(1, int(max_page) + 1):
                        page_url = url + '/' + str(page)
                        img_url = BeautifulSoup(request.get(page_url, 3).text, 'lxml').find(
                            'div', {'class': 'main-image'}).find('img')['src']
                        img_urls.append(img_url)
                        save(img_url)

    def save(img_url):
        name = img_url[-9:-4]
        print(u'开始保存:', img_url)
        img = request.get(img_url, 3)
        f = open(name + '.jpg', 'ab')
        f.write(img.content)
        f.close()

    def mkdir(path):
        path = path.strip()
        isExists = os.path.exists(os.path.join('F:\mzitu', path))
        if not isExists:
            print(u'创建一个名为', path, u'的文件夹!')
            os.makedirs(os.path.join('F:\mzitu', path))
            return True
        else:
            print(u'文件夹已经存在!')
            return False

    threads = []
    while threads or crawl_queue:
        """
		这儿用到了crawl_queue,就是__bool__函数的作用,为真则代表mongo数列里还有数据
		threads 或者 crawl_queue为真都代表还没下载完,程序继续执行
		"""
        for thread in threads:
            # is_alive判断是否为空,不是空则在队列中删掉
            if not thread.is_alive():
                threads.remove(thread)
                # 线程池中的线程小于max_threads 或者 crawl_queue
        while len(threads) < max_threads or crawl_queue.peek():
            thread = threading.Thread(target=pageurl_crawl)  ##创建线程
            thread.setDaemon(True)  ##设置守护线程
            thread.start()  ##启动线程
            threads.append(thread)  ##添加进线程队列
        time.sleep(SLEEP_TIME)
Example #15
def E_Hen_crawler(max_threads=5):
    img_queue = MogoQueue('meinv', 'xiumm')
    def pageurl_crawler():
        while True:
            try:
                (url,name) = img_queue.pop()
                print(url)
            except KeyError:
                print('队列没有数据')
                break
            else:
                lock.acquire()
                title = img_queue.pop_title_(url)
                path = str(title).replace('?', '')
                mkdir(path)
                os.chdir('E:\E-Hen\\' + path)
                response = request.get(url, 3)
                response.encoding = 'utf-8'
                Soup = BeautifulSoup(response.text, 'lxml')
                all_url = Soup.find('div', class_='gallary_wrap').find_all('td')
                max_span = Soup.find('div', class_='paginator').find_all('a')
                for td in all_url:
                    href = s + td.img['src']
                    name = td.img['alt'].strip()[-3:]
                    save(href, name)
                for page in max_span:
                    page_url = s + page['href']
                    html = request.get(page_url, 3)
                    Soup = BeautifulSoup(html.text, 'lxml')
                    all_td = Soup.find('div', class_='gallary_wrap').find_all('td')
                    for td2 in all_td:
                        href2 = s + td2.img['src']
                        name2 = td2.img['alt'].strip()[-3:]
                        save(href2, name2)
                img_queue.complete(url)
                lock.release()

    def save(img_url, page_name):
        name = page_name
        print(u'开始保存:', img_url, '\n')
        img = request.get(img_url, 15)
        f = open(name + '.jpg', 'ab')
        f.write(img.content)
        f.close()

    def mkdir(path):
        path = path.strip()
        isExists = os.path.exists(os.path.join("E:\E-Hen", path))
        if not isExists:
            print(u'建了一个名字叫做', path, u'的文件夹!')
            os.makedirs(os.path.join("E:\E-Hen", path))
            return True
        else:
            print(u'名字叫做', path, u'的文件夹已经存在了!')
            return False

    threads = []
    while threads or img_queue:
        for thread in threads:
            if not thread.is_alive(): ##drop threads that have finished from the pool
                threads.remove(thread)
        while len(threads) < max_threads and img_queue.peek(): ##while the pool has fewer than max_threads workers and URLs are still pending
            thread = threading.Thread(target=pageurl_crawler) ##create a worker thread
            thread.setDaemon(True) ##mark it as a daemon thread
            thread.start() ##start the thread
            threads.append(thread) ##add it to the thread pool
        time.sleep(SLEEP_TIME)
Example #16
# -*- coding: UTF-8 -*-
import requests
from bs4 import BeautifulSoup
import os
from mongodb_queue import MogoQueue
headers = {
    'User-Agent':
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"
}
start_url = input('请输入您要获取的漫画的首地址:')
P_url = start_url.rsplit('/', 1)[0] + '/'
#URL prefix
spider_queue = MogoQueue('acg', 'sex_cool')


def mkdir(title):
    isExists = os.path.exists(os.path.join("/home/virgil/图片/adult_only",
                                           title))
    if not isExists:
        print(u'建立了一个名字叫做', title, u'的文件夹')
        os.makedirs(os.path.join("/home/virgil/图片/adult_only", title))
        os.chdir("/home/virgil/图片/adult_only/" + title)
        return True
    else:
        print(u'名字叫做', title, u'的文件夹已经存在了!')
        os.chdir("/home/virgil/图片/adult_only/" + title)
        return False


def acg():
    global headers
Example #17
from Download import request
from mongodb_queue import MogoQueue
from bs4 import BeautifulSoup

spider_queue = MogoQueue('meinv', 'xiumm')
# spider_queue.clear()
# print(u'清除集合xiumm')
# spider_queue = MogoQueue('meinv', 'xiumm')
# print(u'新建集合xiumm')
s = 'http://www.xiumm.org/'
i = 1

def start(url):
    response = request.get(url, 3)
    Soup = BeautifulSoup(response.text, 'lxml')

    max_span = Soup.find('div', class_='paginator').find_all('a')
    page_url = s
    for page in max_span[0:1]:
        html = request.get(page_url, 3)
        html.encoding = 'utf-8'
        Soup = BeautifulSoup(html.text, 'lxml')
        all_td = Soup.find('div', class_='gallary_wrap').find_all('td')
        for td in all_td:
            address = td.a['href']
            title = td.a.img['alt']
            spider_queue.push(address, title, i)
        page_url = s + page['href']
        i=i+1

# def get_url(address):
Example #18
from Download import request
from mongodb_queue import MogoQueue
from bs4 import BeautifulSoup
spider_queue = MogoQueue('meinvxiezhenji', 'crawl_queue')


def start(url):
    response = request.get(url, 3)
    Soup = BeautifulSoup(response.text, 'lxml')
    all_a = Soup.find('div', class_='all').find_all('a')
    for a in all_a[1:]:
        title = a.get_text()
        url = a['href']
        spider_queue.push(url, title)
    """上面这个调用就是把URL写入MongoDB的队列了"""


if __name__ == "__main__":
    start('http://www.mzitu.com/all')
Example #19
from Download import request
from mongodb_queue import MogoQueue
from bs4 import BeautifulSoup

spider_queue = MogoQueue('meinv', 'crawl_queue')


def start(url):
    response = request.get(url, 3)
    Soup = BeautifulSoup(response.text, 'lxml')
    title = Soup.find('div', class_='gm').find('h1', id='gj').get_text()
    max_span = Soup.find('table', class_='ptt').find_all('td')
    i = 1
    for page in max_span[1:-1]:
        page_url = page.a['href']
        i += 1
        spider_queue.push(page_url, title)


if __name__ == "__main__":
    start('https://e-hentai.org/g/358581/e6db8cb4b9/')
Example #20
def mzitu_crawler(max_threads=10):
	crawl_queue = MogoQueue('meinvxiezhenji', 'crawl_queue')  # MogoQueue (from the mongodb_queue module); this is the queue of page URLs to crawl
	img_queue = MogoQueue('meinvxiezhenji', 'img_queue')  ##queue of the actual image URLs
	lock = threading.Lock()  # one lock shared by all workers (creating a new Lock on every iteration would not give mutual exclusion)

	# crawl the page addresses
	def pageurl_crawler():
		while True:
			"""
			try....except...else(用于捕获异常)的语法:
			try:
			<语句>        #运行别的代码
			except <名字>:
			<语句>        #如果在try部份引发了'name'异常
			except <名字> as <数据>:
			<语句>        #如果引发了'name'异常,获得附加的数据
			else:
			<语句>        #如果没有异常发生

			"""
			try:
				url = crawl_queue.pop()  # MogoQueue.pop: fetch an object waiting to be crawled from the queue and update its status
				print(url)
			except KeyError:
				print('队列没有数据')
				break
			else:
				lock.acquire()
				img_urls = []  # list to collect the image URLs for later use
				req = request.get(url, 3).text  # request the page to be crawled
				max_page = BeautifulSoup(req, 'lxml').find('div', class_='pagenavi').find_all('span')[-2].get_text()  # get the largest page number
				title = crawl_queue.pop_title(url)  # fetch the album title
				mkdir(title)  # call the mkdir helper below to create a folder named after the title
				os.chdir('F:\image\mzitu\\' + title)  # switch into the folder created above
				for page in range(1, int(max_page) + 1):
					page_url = url + '/' + str(page)  # build the URL of the page holding each picture
					img_dict = BeautifulSoup(request.get(page_url, 3).text, 'lxml').find('div', {'class': 'main-image'}).find('img')
					# print(img_dict)
					if img_dict is not None:
						img_url = img_dict['src']  # real address of the picture, used for downloading below
					else:
						print(u'没有获取到img_url*******************')
						continue  # nothing to download on this page
					img_url_reg = re.compile(r'http://.*?\.jpg', re.S)  # regex that keeps only valid image links
					if re.match(img_url_reg, img_url):
						img_urls.append(img_url)  # add the valid image address to the list
					else:
						print(u'图片不是有效地链接地址!!!!!!!!!!!!!!')
						continue  # skip invalid links instead of saving them
					save(img_url)  # call the save helper below to store the valid picture
				lock.release()  # release the lock
				crawl_queue.complete(url)  # mark the URL as complete
				img_queue.push_imgurl(title, img_urls)  # push the valid image addresses into the img queue
				print('插入数据库成功')

	def save(img_url):
		name = img_url[-9:-4]  # derive the file name from the image URL
		print(u'开始保存:', img_url)
		img = request.get(img_url, 3)  # request the image itself
		f = open(name + '.jpg', 'ab')  # open a binary .jpg file
		f.write(img.content)  # write the bytes
		f.close()  # close the file

	def mkdir(path):
		path = path.strip()  # strip surrounding whitespace
		isExists = os.path.exists(os.path.join('F:\image\mzitu', path))  # check whether the path already exists
		if not isExists:
			print(u'创建一个名为', path, u'的文件夹!')
			os.makedirs(os.path.join('F:\image\mzitu', path))  # create the folder
			return True
		else:
			print(u'文件夹已经存在!')
			return False

	threads = []  # list that will hold the worker threads
	while threads or crawl_queue:
		"""
		crawl_queue is used here thanks to its __bool__ method: it is truthy while
		the MongoDB queue still holds data. As long as either threads or crawl_queue
		is truthy the download is not finished, so the program keeps running.
		"""
		for thread in threads:
			# is_alive tells whether a thread is still running; drop finished threads
			if not thread.is_alive():
				threads.remove(thread)
		# while the pool has fewer than max_threads threads and crawl_queue still has OUTSTANDING objects
		# peek returns the _id (URL) of a record whose status is OUTSTANDING
		while len(threads) < max_threads and crawl_queue.peek():
			thread = threading.Thread(target=pageurl_crawler)  ##create a worker thread running pageurl_crawler
			thread.setDaemon(True)  ##mark it as a daemon thread
			thread.start()  ##start the thread
			threads.append(thread)  ##add it to the thread pool
		time.sleep(SLEEP_TIME)
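The pool-management loop at the bottom is repeated, nearly verbatim, in every crawler on this page. Factored out, the pattern looks like the sketch below; this is a condensed rewrite, not code from the original examples, and run_pool with its parameters is a made-up helper.

# Condensed sketch of the shared thread-pool pattern; 'run_pool' and its
# parameters are assumptions, not part of the original code.
import threading
import time


def run_pool(worker, queue, max_threads=10, sleep_time=1):
    threads = []
    while threads or queue:                    # MogoQueue.__bool__: work is still pending
        for thread in list(threads):           # iterate over a copy so removal is safe
            if not thread.is_alive():
                threads.remove(thread)         # drop workers that have finished
        while len(threads) < max_threads and queue.peek():
            thread = threading.Thread(target=worker, daemon=True)
            thread.start()
            threads.append(thread)
        time.sleep(sleep_time)                 # pause before the next management pass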
Example #21
def E_Hen_crawler(max_threads=5):
    crawl_queue = MogoQueue('meinv', 'crawl_queue') ##this is the queue of URLs to crawl
    img_queue = MogoQueue('meinv', 'img_queue')
    def pageurl_crawler():
        while True:
            try:
                url = crawl_queue.pop()
                print(url)
            except KeyError:
                print('队列没有数据')
                break
            else:
                title = crawl_queue.pop_title(url)
                path = str(title).replace('?', '') ##during testing one title turned out to contain a question mark
                mkdir(path)
                os.chdir('E:\Cover\\' + path)
                html = request.get(url, 3)
                Soup = BeautifulSoup(html.text, 'lxml')
                all_a = Soup.find('div', id='gdt').find_all('a')
                for a in all_a:
                    href = a['href']
                    name = a.img['alt']
                    html = request.get(href, 3)
                    html_soup = BeautifulSoup(html.text, 'lxml')
                    img = html_soup.find('div', id='i3').find('img')
                    img_url = img['src']
                    img_queue.push_imgurl(title,img_url)
                    print(u'得到图片的链接')
                    save(img_url, name)
                    img_queue.complete(img_url)
                crawl_queue.complete(url)

    def save(img_url, page_name):
        name = page_name
        print(u'开始保存:', img_url, '\n')
        img = request.get(img_url, 3)
        f = open(name + '.jpg', 'ab')
        f.write(img.content)
        f.close()

    def mkdir(path):
        path = path.strip()
        isExists = os.path.exists(os.path.join("E:\Cover", path))
        if not isExists:
            print(u'建了一个名字叫做', path, u'的文件夹!')
            os.makedirs(os.path.join("E:\Cover", path))
            return True
        else:
            print(u'名字叫做', path, u'的文件夹已经存在了!')
            return False

    threads = []
    while threads or crawl_queue:
        """
        这儿crawl_queue用上了,就是我们__bool__函数的作用,为真则代表我们MongoDB队列里面还有数据
        threads 或者 crawl_queue为真都代表我们还没下载完成,程序就会继续执行
        """
        for thread in threads:
            if not thread.is_alive(): ##is_alive是判断是否为空,不是空则在队列中删掉
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek(): ##线程池中的线程少于max_threads 或者 crawl_qeue时
            thread = threading.Thread(target=pageurl_crawler) ##创建线程
            thread.setDaemon(True) ##设置守护线程
            thread.start() ##启动线程
            threads.append(thread) ##添加进线程队列
        time.sleep(SLEEP_TIME)