import scrapy

# getredis() / getMongodb() are project helpers (a sketch is given below).
class spider_detail_list(scrapy.Spider):
    name = "spider_detail_list"          # spider name used by `scrapy crawl`
    allowed_domains = ["qidian.com"]     # restrict the crawl to this domain
    red = getredis()
    urls = red.lrange('novel_list', 0, -1)
    start_urls = []
    ids = []    # approach 2
    dict = {}   # approach 1
    for url in urls:
        url = str(url, encoding="utf-8")
        url = url.split(',')
        start_urls.append(url[1])
        # ids.append(url[0])    # approach 2
        dict[url[1]] = url[0]   # approach 1

    # parse() is called back for every page that finishes downloading
    def parse(self, response):
        # approach 1: requests are handled concurrently, so positional order cannot
        # tell which id belongs to which url; look the id up by url instead
        Pid = self.dict[response.url]
        bcollection = getMongodb()
        links = response.xpath('//div[@class="sub-type"]/dl[@class=""]/dd/a')
        for link in links:
            print("***************")
            print(Pid)
            print(link.select("text()").extract()[0])
            print(link.select('@href').extract()[0])
            print("***************")
            id = bcollection.insert({'list_child_name': link.select("text()").extract()[0], 'pid': Pid})
            self.red.lpush('bnovel_all_list', str(id) + "," + "https:" + link.select('@href').extract()[0])
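Every spider in this section relies on two project helpers, getredis() and getMongodb(), whose definitions are not shown. The following is a minimal sketch of what they might look like, assuming a local Redis server and a local MongoDB instance; the hosts, ports, and default database/collection names are guesses (getMongodb() is sometimes called with no arguments, so it needs defaults).

# Hypothetical sketch of the shared helpers; connection parameters and the
# default collection name are assumptions, not taken from the original project.
import redis
import pymongo

def getredis():
    # Redis client used for the work queues (novel_list, bnovel_all_list, ...)
    return redis.StrictRedis(host='localhost', port=6379, db=0)

def getMongodb(db_name='novel', collection_name='novel_list'):
    # returns a pymongo collection; the spiders call insert()/update() on it
    client = pymongo.MongoClient('localhost', 27017)
    return client[db_name][collection_name]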
import scrapy
from scrapy.http import Request
from time import sleep

class spider_list_novel(scrapy.Spider):
    name = "spider_list_novel"           # spider name used by `scrapy crawl`
    allowed_domains = ["qidian.com"]     # restrict the crawl to this domain
    start_urls = []
    dict = {}
    red = getredis()
    mongodb = getMongodb('novel', 'novels')

    def __init__(self):
        urls = self.red.lrange('bnovel_all_list', 0, -1)
        for url in urls:
            url = str(url, encoding="utf-8")
            url = url.split(',')
            spider_list_novel.start_urls.append(url[2])
            spider_list_novel.dict[url[2]] = {'classId': url[0], 'listId': url[1], 'sum': 0}
            # break

    # parse() is called back for every page that finishes downloading
    def parse(self, response):
        print(response.url)
        Pid = self.dict[response.url]
        Pid['sum'] += 1
        print(Pid['sum'])
        if Pid['sum'] > 3:
            return
        links = response.xpath('//div[@class="book-mid-info"]/h4/a')
        for link in links:
            novel_name = link.select("text()").extract()[0]
            novel_id = self.mongodb.insert({'name': novel_name, 'total_list': Pid['classId'], 'list': Pid['listId']})
            href = link.select("@href").extract()[0]
            href = str(novel_id) + ',' + 'https:' + href
            print(href)
            self.red.lpush('all_novel_href', href)
            sleep(0.3)
        href = self.find_next(response)
        if href is None:
            f = open('file/%s.txt' % ("日志"), 'a', encoding='utf-8')  # "日志" = "log"
            f.write(response.url)
            f.write('++++++++++++++')
            f.close()
        else:
            href = "https:" + href
            if href.find('javascript:;') < 0:
                self.dict[href] = Pid
                request = Request(href, callback=self.parse)
                yield request

    def find_next(self, response):
        href = ''
        try:
            hrefs = response.xpath('//li[@class="lbf-pagination-item"]/a')
            i = len(hrefs)
            href = hrefs[i - 1].select("@href").extract()[0]
            return href
        except Exception as err:
            f = open('file/%s.txt' % ("日志"), 'a', encoding='utf-8')
            f.write(str(err) + ':' + href)
            f.close()
            return None
def parse(self, response):
    red = getredis()
    bcollection = getMongodb()
    hx = response.xpath('//div[@class="work-filter type-filter"]/ul/li/a')
    for i in range(1, len(hx)):  # start at 1, skipping the first filter link
        print(hx[i].select("@href").extract()[0])    # category href
        print(hx[i].select("text()").extract()[0])   # category name
        id = bcollection.insert({'list_name': str(hx[i].select("text()").extract()[0])})
        red.lpush('novel_list', str(id) + "," + "https:" + str(hx[i].select("@href").extract()[0]))
import scrapy

class spider_type_list(scrapy.Spider):
    name = "spider_type_list"            # spider name used by `scrapy crawl`
    allowed_domains = ["qidian.com"]     # restrict the crawl to this domain
    start_urls = []
    red = getredis()
    urls = red.lrange('bnovel_all_list', 0, -1)
    dict = {}
    for url in urls:
        url = str(url, encoding="utf-8")
        url = url.split(',')
        start_urls.append(url[1])
        dict[url[1]] = url[0]

    # parse() is called back for every page that finishes downloading
    def parse(self, response):
        print("**********")
        links = response.xpath('//div[@class="book-mid-info"]/h4/a')
        for link in links:
            print(link.select("text()").extract()[0])
            print(link.select("@href").extract()[0])
            print("++++++++++++")
import scrapy

class spider_list_novel(scrapy.Spider):
    name = "spider_list_novel"           # spider name used by `scrapy crawl`
    allowed_domains = ["qidian.com"]     # restrict the crawl to this domain
    start_urls = []
    dict = {}
    red = getredis()
    mongodb = getMongodb('novel', 'novels')

    def __init__(self):
        urls = self.red.lrange('all_novel_href', 0, 5)
        for url in urls:
            url = str(url, encoding="utf-8")
            url = url.split(',')
            spider_list_novel.start_urls.append(url[1])
            spider_list_novel.dict[url[1]] = url[0]
            # break

    # parse() is called back for every page that finishes downloading
    def parse(self, response):
        print(response.url)
        Pid = self.dict[response.url]
        print(Pid)
        links = response.xpath('//div[@class="book-mid-info"]/h4/a')
import os
import scrapy
from scrapy.http import Request
from bson.objectid import ObjectId

class spider_detail_novel(scrapy.Spider):
    name = "spider_detail_novel"         # spider name used by `scrapy crawl`
    allowed_domains = ["qidian.com"]     # restrict the crawl to this domain
    start_urls = []
    dict = {}
    red = getredis()
    mongodb = getMongodb('novel', 'novels')

    def __init__(self):
        urls = self.red.lrange('all_novel_href', 0, -1)
        for url in urls:
            url = str(url, encoding="utf-8")
            url = url.split(',')
            spider_detail_novel.start_urls.append(url[1])
            spider_detail_novel.dict[url[1]] = url[0]

    # parse() is called back for every page that finishes downloading
    def parse(self, response):
        global status_flag
        id = self.dict[response.url]
        Pid = ObjectId(id)
        detail_messages = response.xpath('//div[@class="book-info "]')
        # scrape the book's detail information
        for detail_message in detail_messages:
            author = detail_message.select('//h1/span/a/text()').extract()[0]
            status = detail_message.select('p/span/text()').extract()[0]
            if status == "连载":   # "连载" means the novel is still being serialized
                status_flag = 0
            else:
                status_flag = 1
            # update the record in MongoDB
            self.mongodb.update({"_id": Pid}, {"$set": {'author': author, 'status': status_flag}})
            novel_href = "https:" + detail_message.select('p/a/@href').extract()[2]
            # crawl the novel text itself
            request = Request(novel_href,
                              callback=lambda response, id=id, status_flag=status_flag:
                              self.spider_one_novel(response, id, status_flag))
            yield request

    def spider_one_novel(self, response, id, status_flag):
        chapter_mongodb = getMongodb('novel', 'chapters')
        chapter = response.xpath('//h3[@class="j_chapterName"]/text()').extract()[0]
        print('********处理内容*******')  # "处理内容" = "processing content"
        contents = response.xpath('//div[@class="read-content j_readContent"]/p/text()').extract()
        novel_names = response.xpath('//div[@class="book-cover-wrap"]/h1/text()').extract()
        novel_name = response.xpath('//div[@class="crumbs-nav"]/a[@class="act"]/text()').extract()[0]
        if len(novel_names) != 0:
            os.makedirs('D:/all_novels/%s' % novel_names[0])
        else:
            pass
        f = open('D:/all_novels/%s/%s.html' % (novel_name, chapter), 'w', encoding='utf-8')
        file_path = 'D:/all_novels/%s/%s.html' % (novel_name, chapter)
        # record the chapter's file path in MongoDB
        chapter_mongodb.insert({chapter: file_path, 'pid': id})
        for content in contents:
            f.write(content)
            f.write('<br>')
        f.close()
        print('+++++++++++++++++++++')
        next_chapter = "https:" + response.xpath('//a[@id="j_chapterNext"]/@href').extract()[0]
        if next_chapter.find('lastpage') > 0:
            if status_flag == 0:
                # still serializing: remember where we stopped so it can be resumed later
                self.red.lpush('serialize_list', id + ',' + response.url)
            return None
        print('+++++++++++++++++++++')
        request = Request(next_chapter,
                          callback=lambda response, id=id, status_flag=status_flag:
                          self.spider_one_novel(response, id, status_flag))
        yield request
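These spiders form a pipeline: each stage reads the Redis queue that the previous stage filled (for example, spider_detail_list fills bnovel_all_list, which spider_list_novel consumes), so they are meant to be run one stage at a time, e.g. with `scrapy crawl spider_detail_list` followed by `scrapy crawl spider_list_novel`. A stage can also be launched from a script; the sketch below assumes the spider class is importable where the script runs.

# Minimal sketch of launching one pipeline stage programmatically instead of
# via the `scrapy crawl` command; assumes spider_detail_novel is importable here.
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
process.crawl(spider_detail_novel)   # register the spider class defined above
process.start()                      # blocks until the crawl finishes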
def pushRedis(self, pid, cid, href):
    red = getredis()
    href = "%s,%s,%s" % (pid, cid, href)
    red.lpush('bnovel_all_list', href)
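pushRedis packs the two MongoDB ids and the page URL into one comma-separated string; the consuming spiders decode the bytes returned by lrange and split the fields back out in __init__. Below is a small standalone round trip of that format; the id and URL values are made-up placeholders.

# Round-trip illustration of the "id,id,url" queue format; values are placeholders.
import redis

red = redis.StrictRedis(host='localhost', port=6379, db=0)

pid, cid = 'classId0001', 'listId0002'            # hypothetical MongoDB id strings
url = 'https://www.qidian.com/example-category'   # hypothetical category page URL
red.lpush('bnovel_all_list', "%s,%s,%s" % (pid, cid, url))

entry = str(red.lrange('bnovel_all_list', 0, -1)[0], encoding="utf-8")
class_id, list_id, href = entry.split(',')        # same decoding as spider_list_novel.__init__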