def parse(self, response):
    i = NovelItem()
    i['content'] = "\n".join(
        response.css('#chaptercontent::text').extract())
    i['gmtCreate'] = datetime.now()
    headline = response.css('title').extract_first()
    headlines = headline.split('_')
    chapter = headlines[0].split('章')[0]
    i['chapter'] = chapter
    i['title'] = headlines[0].split('章')[1]
    i['bookName'] = headlines[1].strip()
    i['author'] = self.rds.hget(i.get('bookName'),
                                "author").decode('utf-8')
    # Handle quirky digit-style numerals such as 一二一二
    if chapter.find('十') > 0 or chapter.find('百') > 0 or chapter.find(
            "千") > 0:
        i['number'] = cn2an.cn2an(re.sub('.*?第', '', chapter))
    else:
        i['number'] = int(self.convert_to_inter(chapter))
    i['ptPrev'] = urljoin(
        self.start_urls[0],
        response.css('#pt_prev::attr(href)').extract_first())
    i['url'] = response.url
    yield i

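# The spider above converts chapter numerals like "第一百二十三章" with the
# cn2an package. A minimal standalone sketch of just that conversion; the
# sample headings here are hypothetical, not taken from the spider:
import re

import cn2an

for heading in ['第一百二十三章', '第十章']:
    numeral = re.sub('.*?第', '', heading).split('章')[0]  # '一百二十三', '十'
    print(cn2an.cn2an(numeral))  # -> 123, 10
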
def parse(self, response):
    selector = Selector(response)
    table = selector.xpath('//table')
    for each in table:
        book_name = each.xpath(
            'tr/td[@colspan="3"]/center/h2/text()').extract()[0]
        print(book_name)
        contents = each.xpath('tr/td/a/text()').extract()
        urls = each.xpath('tr/td/a/@href').extract()
        for idx, content in enumerate(contents):
            content_list = content.split(' ')
            item = NovelItem()
            if len(content_list) == 2:
                item['chapter_name'] = content_list[1][-3:]
            elif len(content_list) == 3 or len(content_list) > 4:
                item['chapter_name'] = ' '.join(content_list[2:])
            elif len(content_list) == 4:
                item['chapter_name'] = content_list[3]
            if len(content_list) < 4:
                item['book_title'] = content_list[0]
                item['chapter_num'] = content_list[1]
            else:
                item['book_title'] = ' '.join(content_list[:2])
                item['chapter_num'] = content_list[2]
            item['chapter_url'] = urls[idx]
            item['book_name'] = book_name
            print('craw:', urls[idx])
            yield Request(urls[idx], callback=self.parseContent,
                          meta={'item': item})

def parse(self, response): """ 解析小说信息及章节列表 :param response: :return: """ # 提取小说信息 item = NovelItem() item['number'] = response.url.split('/')[-2] item['name'] = response.xpath( '//div[@id="info"]/h1/text()').extract_first().strip() item['author'] = response.xpath( '//div[@id="info"]/p[1]/text()').extract_first().replace( '作 者:', '').strip() item['cover'] = response.xpath( '//div[@id="fmimg"]/img/@src').extract_first() item['origin_url'] = response.url yield item # 提取章节列表 yield from [ Request(chapter_url, meta={'novel_number': item['number']}, callback=self.parse_chapter) for chapter_url in response.xpath('//div[@id="list"]/dl/dd/a/@href').extract()[9:] ]
def parse_page_infos(self, response):
    item = NovelItem()
    item['noveldetailes'] = response.xpath(
        '//div[@class="p"]/text()').extract()
    item['title'] = response.xpath(
        '//div[@class="readAreaBox content"]/h1/text()').extract()
    yield item

def parse_book(self, response):
    """
    1. Extract the novel's summary info and yield a NovelItem.
    2. Crawl any chapters that have not been crawled yet.
    :param response:
    :return:
    """
    url = response.url
    name = response.css('#info h1::text').extract_first()
    author = response.css('#info p::text').re_first('作\xa0\xa0\xa0\xa0者:(.*)')
    status = response.css('#info p:nth-child(3)::text').re_first(
        '状\xa0\xa0\xa0\xa0态:(.*)').replace(',', '')
    update_time = response.css('#info p:nth-child(4)::text').re_first('最后更新:(.*)')
    last_chapter = response.css('#info p:nth-child(5) a::text').extract_first()
    item = NovelItem()
    for field in item.fields:
        try:
            item[field] = eval(field)
        except NameError:
            self.logger.debug('Field is not defined: ' + field)
    yield item
    base_url = 'https://www.biquge.com.cn'
    chapters = response.css('#list > dl > dd')
    for chapter in chapters:
        url = base_url + chapter.css('a::attr(href)').extract_first()
        title = chapter.css('a::text').extract_first().strip()
        if not self.exists(name, title):
            yield scrapy.Request(url, callback=self.parse_detail)
        else:
            self.logger.debug(f'Already saved, skipping 《{name}》 {title}')

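# parse_book fills item fields from same-named local variables via
# eval(field). Below is a hypothetical eval-free alternative: gather the
# locals in a plain dict and copy only the declared fields. This NovelItem
# is a stand-in with made-up fields and sample values for illustration.
import scrapy

class NovelItem(scrapy.Item):
    name = scrapy.Field()
    author = scrapy.Field()
    status = scrapy.Field()

values = {'name': 'example novel', 'author': 'example author', 'status': 'ongoing'}
item = NovelItem()
for field in item.fields:
    if field in values:
        item[field] = values[field]
print(dict(item))
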
def parse_item(self, response):
    # Title:    //div[@id='navList']//ul/li/span/a[1]/@title
    # Author:   //div[@id='navList']//ul/li/span/a[2]/text()
    # Book id:  //div[@id='navList']//ul/li/span/a[3]/@href
    #           e.g. http://www.quanshuwang.cn/book_197.html
    # Category: //div[@id='navList']/div/a[2]
    # Intro:    //div[@id='navList']//ul/li/span/em/text()
    # Cover:    //div[@id='navList']//ul/li/a/img/@src
    # Extract every novel on the current page
    books = response.xpath("//div[@id='navList']//ul/li")
    # Extract the novel category
    category = response.xpath("//div[@id='navList']/div/a[2]/text()").get()
    for book in books:
        # Novel name
        name = book.xpath(".//a[1]/@title").extract_first()
        # Author name
        author = book.xpath(".//a[2]/text()").extract_first()
        # Novel intro
        intro = book.xpath(".//em/text()").extract_first()
        # Novel url
        novel_url = book.xpath(".//a[3]/@href").extract_first()
        # Novel id
        novel_id = novel_url.split('_')[1].split(".")[0]
        # Cover image
        img_src = book.xpath("./a/img/@src").extract_first()
        item = NovelItem()
        item["name"] = name
        item["author"] = author
        item["intro"] = intro
        item["novel_url"] = novel_url
        item["novel_id"] = novel_id
        item["category"] = category
        item["img_src"] = img_src
        yield item

def parse(self, response):
    selector = Selector(response)
    books = selector.xpath('//div[@class="book-mid-info"]')
    for book in books:
        # Create a fresh item per book; reusing one instance across
        # yields would let later iterations overwrite earlier items.
        item = NovelItem()
        name = book.xpath('h4/a/text()').extract()
        author = book.xpath(
            'p[@class="author"]/a[@class="name"]/text()').extract()
        type = book.xpath(
            'p[@class="author"]/a[@data-eid="qd_C42"]/text()').extract()
        state = book.xpath('p[@class="author"]/span/text()').extract()
        intro = book.xpath('p[@class="intro"]/text()').extract()
        update = book.xpath(
            'p[@class="update"]/a[@target="_blank"]/text()').extract()
        href = book.xpath('p[@class="update"]/a/@href').extract()
        time = book.xpath('p[@class="update"]/span/text()').extract()
        item['book_name'] = name[0]
        item['author'] = author[0]
        item['book_type'] = type[0]
        item['book_state'] = state[0]
        item['book_update'] = update[0]
        item['book_time'] = time[0]
        item['new_href'] = 'https:' + href[0]
        item['book_intro'] = ''.join(intro).replace(' ', '').replace('\n', '')
        yield item

def parse_page(self, response):
    base_path = 'G:/novel/'
    name = response.xpath("//*[@id='direct']/a[3]/text()").get()
    chapter = response.xpath("//*[@id='direct']/text()[4]").get()
    content = response.xpath("//*[@id='content']/text()").getall()
    it = NovelItem(name=name, chapter=chapter, content=content)
    yield it

def parse_content(self, response):
    # yield {
    #     'name:': response.css("div.bookname h1::text").get(),
    #     'content:': response.css("div#content::text").getall(),
    # }
    item = NovelItem()
    item['name'] = response.css("div.bookname h1::text").get()
    item['content'] = response.css("div#content::text").getall()
    yield item

def after_parse(self, response):
    item = NovelItem()
    item['title'] = response.xpath(
        '//div[@class="title"]/h1/text()').extract_first()
    print(item['title'])
    item['text'] = ''.join(
        response.xpath('//div[@id="content"]/text()').extract()).replace(
            '\xa0', '').replace('\r\n', '')
    # print(item['text'])
    yield item

def parse_chapter(self, response):
    html_chapter = response.body
    soup = BeautifulSoup(html_chapter, "html.parser")
    item = NovelItem()
    item['chapter'] = soup.find('div', attrs={"class": "bookname"}).h1.text
    item['content'] = soup.find('div', attrs={
        "id": "content"
    }).text.replace("<br>", '')
    yield item

def parse(self, response): item = NovelItem() item["content"] = response.xpath( '//pre[@id="content"]/text()').extract()[0].replace('\n', '').replace( ' ', '') item['title'] = response.xpath( '//div[@class="page-body"]/h1/text()').extract()[0] item["url"] = response.xpath("//@href").extract_first() yield item
def get_content(self, response):
    title = response.xpath('//h3[@class="j_chapterName"]/text()').extract_first()
    # extract() returns a list with one string per <p> tag in the chapter
    body = response.xpath(
        "//div[@class='read-content j_readContent']//p/text()").extract()
    content = ''.join(body)
    item = NovelItem()
    item['title'] = title
    item['content'] = content
    logger.info('%s has been grabbed, content: %s' % (title, content))
    yield item

def process_content(self, response):
    novel = response.meta['novel']
    rs = urlparse(response.url)
    item = NovelItem()
    item['title'] = response.selector.xpath(
        '//div[@class="bookname"]/h1/text()').extract()[0]
    item['content'] = response.selector.xpath(
        '//div[@id="content"]/text()').extract()
    item['novel_id'] = novel.id
    item['chapter_source_url'] = rs.path
    return item

def parse_chapters(self, response):
    item = response.meta.get('item', NovelItem())
    chapter_title = response.meta.get('chapter_title').replace('.', '')
    sel = Selector(response)
    content = "\n".join(
        sel.xpath('//div[@class="read-content j_readContent"]').xpath(
            'string(.)').extract()).encode('utf-8').strip()
    oss_value = self.oss.uploadPage(content)
    print(item['title'] + ': ' + chapter_title + ' ' + oss_value)
    item['chapter_content'][chapter_title] = oss_value
    # Only yield once every chapter has been collected
    if len(item['chapter_content']) >= item['chapter_num']:
        print('finished ' + item['title'] + ': ' + item['author'])
        yield item

def get_content(self, response):
    item = NovelItem()
    title = response.xpath("/html/body/div[2]/h1/text()").extract()[0]
    title = title.replace("章节目录 ", "")
    content = response.xpath("//*[@id='content']/text()").extract()
    content = "".join(content)
    replace_list = env.replace_list
    for rep in replace_list:
        content = content.replace(rep, "")
    item['title'] = title
    item['content'] = content
    item['tid'] = c2n.Cn2An(
        c2n.get_tit_num(title[title.find("第") + 1:title.find("章")]))
    return item

def parse_item(self, response):
    name = self.novel[response.url]
    content = response.text.encode(response.encoding).decode('gb18030') \
        .replace('<br>', '\n').replace(' ', '') \
        .replace('document.write(\'', '').replace('</content>\');', '') \
        .replace('<content>', '')
    item = NovelItem()
    item['novelName'] = self.novelName
    item['name'] = name
    item['num'] = self.novelNum[name]
    item['content'] = content
    print(item)
    yield item

def parse(self, response):
    title_list = response.xpath('//p[@class="title"]/a')
    # print(title_list)
    for title in title_list:
        # Create a fresh item per request; a shared instance passed via
        # meta would be overwritten by later iterations.
        item = NovelItem()
        item['novel_title'] = title.xpath('.//text()').get()
        href = title.xpath('.//@href').get()
        novel_url = self.base_url + href
        print(item['novel_title'])
        yield scrapy.Request(url=novel_url,
                             callback=self.parse_content,
                             meta={"item": item})

def table(self, response):
    soup = BeautifulSoup(response.body, 'lxml')
    table = soup.find(id='list').find_all(name='dd')
    for chapter in table:
        item = NovelItem()
        item['novel'] = response.meta['novel']
        item['novel_url'] = response.meta['novel_url']
        chapter_url = 'http://www.xbiquge.la/' + chapter.a.get('href')
        item['chapter'] = chapter.text
        item['chapter_url'] = chapter_url
        yield scrapy.Request(url=chapter_url,
                             meta={'item': item},
                             callback=self.content)

def get_content(self, response):
    item = NovelItem()
    item['name'] = str(response.meta['name'])
    item['url'] = str(response.meta['url'])
    category = BeautifulSoup(response.text,
                             'lxml').find('table').find('a').get_text()
    author = BeautifulSoup(
        response.text, 'lxml').find('table').find_all('td')[1].get_text()
    bash_url = BeautifulSoup(response.text, 'lxml').find(
        'p', class_='btnlinks').find('a', class_='read')['href']
    name_id = str(bash_url)[-6:-1]
    print(response.text)

def parse(self, response):
    sel = Selector(response)
    element = sel.xpath('//ul[@class="all-img-list cf"]/li')
    for el in element:
        item = NovelItem()
        item['title'] = el.xpath(
            './/div[@class="book-mid-info"]//h4/a/text()').extract_first()
        item['author'] = el.xpath(
            './/p[@class="author"]//a[@class="name"]/text()').extract_first()
        item['category'] = el.xpath(
            './/p[@class="author"]//a[@data-eid="qd_B60"]/text()').extract_first()
        item['sub_category'] = el.xpath(
            './/p[@class="author"]//a[@class="go-sub-type"]/text()').extract_first()
        item['status'] = 0 if el.xpath(
            './/p[@class="author"]//span/text()').extract_first() == '完本' else 1
        item['url'] = 'https:' + el.xpath(
            './/div[@class="book-mid-info"]//h4/a/@href').extract_first()
        item['abstract'] = el.xpath(
            './/p[@class="intro"]/text()').extract_first().strip()
        item['word_num'] = el.xpath(
            './/p[@class="update"]/span/text()').extract_first()
        item['cover'] = 'https:' + el.xpath(
            './/div[@class="book-img-box"]/a/img/@src').extract_first()
        yield Request(url=item['url'], meta={'item': item},
                      callback=self.parse_details)

def parse(self, response):
    novels = response.xpath('//ul[@class="all-img-list cf"]/li')
    for each in novels:
        item = NovelItem()
        part = each.xpath('./div[@class="book-mid-info"]')
        # print(part)
        item['bookname'] = part.xpath('./h4/a/text()').extract()
        item['link'] = part.xpath('./h4/a/@href').extract()[0]
        item['author'] = part.xpath(
            './p[@class="author"]/a[@class="name"]/text()').extract()
        item['category'] = part.xpath(
            './p[@class="author"]/a/text()').extract()
        item['content'] = part.xpath(
            './p[@class="intro"]/text()').extract()
        yield item

def parse_novel(self, response):
    """
    Parse each novel page and collect its chapter URLs.
    """
    item = NovelItem()
    item['url'] = response.url
    item['id'] = int(response.url.split('/')[-2])
    item['name'] = response.meta['novel_name']
    yield item
    soup = BeautifulSoup(response.text, 'lxml')
    chapter_list = soup.find('table').find_all('a')
    for chapter in chapter_list:
        chapter_url = item['url'] + chapter['href']
        yield Request(chapter_url,
                      callback=self.parse_chapter,
                      meta={'novel_id': item['id'],
                            'novel_name': item['name']})

def parse(self, response): title = response.xpath( "/html/body/div[4]/div[2]/h1/text()").extract_first() #extract_first提取第一个符合的元素 content = response.xpath( "/html/body/div[4]/div[2]/div[2]/text()").extract()[:-2] #extract提取满足条件的所有元素,返回一列表,最后两行为广告故舍去 self.f.close() fw = open('%s.txt' % title, 'w', encoding='utf-8') fw.writelines(title) fw.writelines(content) fw.close() self.sendUpdate(title) #发送邮件 item = NovelItem() item["title"] = title item["content"] = content yield item
def parse_details(self, response):
    item = response.meta.get('item', NovelItem())
    sel = Selector(response)
    prefix_url = ('https://book.qidian.com/ajax/comment/index'
                  '?_csrfToken=noZtdawi6Zu8sYFtR2m2o3ujn4lyQLrauItBqnzG&bookId=')
    item['book_id'] = item['url'][item['url'].rfind('/') + 1:]
    item['score'] = json.loads(
        requests.get(prefix_url + item['book_id']).content)['data']['rate']
    item['intro'] = sel.xpath(
        '//div[@class="book-info "]//p[@class="intro"]/text()').extract_first()
    click_num = sel.xpath(
        '//div[@class="book-info "]/p[3]/em[2]/text()').extract_first()
    item['click_num'] = click_num + sel.xpath(
        '//div[@class="book-info "]/p[3]/cite[2]/text()[1]'
    ).extract_first().replace('总点击', '')
    item['chapter_num'] = int(sel.xpath(
        '//li[@class="j_catalog_block"]//span[@id="J-catalogCount"]/text()'
    ).extract_first()[1:-2])
    recommend_num = sel.xpath(
        '//div[@class="book-info "]/p[3]/em[3]/text()').extract_first()
    item['recommend_num'] = recommend_num + sel.xpath(
        '//div[@class="book-info "]/p[3]/cite[3]/text()[1]'
    ).extract_first().replace('总推荐', '')
    item['created_at'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    item['chapter_content'] = {}
    chapter_el = sel.xpath(
        '//div[@class="volume-wrap"]//div[@class="volume"]/ul/li')
    for i, el in enumerate(chapter_el):
        chapter_url = 'https:' + el.xpath('.//a/@href').extract_first()
        chapter_title = el.xpath('.//a/text()').extract_first()
        yield Request(url=chapter_url,
                      meta={'item': item, 'chapter_title': chapter_title},
                      callback=self.parse_chapters)

def parse(self, response):
    # Search results
    website = get_web_site(response.url)
    dd = get_config(website, config_list)
    info = response.xpath(dd["searchInfo"])
    for node in info:
        search_novel = NovelItem()
        search_novel["name"] = filter_null(
            node.xpath(dd["novelName"]).extract())
        search_novel["url"] = filter_null(
            node.xpath(dd["novelUrl"]).extract())
        search_novel["url"] = add_head(dd["websiteUrl"], search_novel["url"])
        search_novel["website"] = dd["websiteName"]
        search_novel["chaptersUrl"] = search_novel["url"]
        search_novel["author"] = filter_null(
            node.xpath(dd["novelAuthor"]).extract())
        search_novel["lastChapter"] = filter_null(
            node.xpath(dd["novelLatestChapter"]).extract())
        search_novel["lastChapterUrl"] = delete_char(
            dd["websiteUrl"], search_novel["url"],
            filter_null(node.xpath(dd["novelLatestChapterUrl"]).extract()))
        search_novel["wordCount"] = filter_null(
            node.xpath(dd["wordContent"]).extract())
        search_novel["updateTime"] = filter_null(
            node.xpath(dd["novelLastUpdateTime"]).extract())
        search_novel["synopsis"] = filter_null(
            node.xpath(dd["novelDescription"]).extract())
        search_novel["type"] = filter_null(
            node.xpath(dd["novelType"]).extract())
        search_novel["img"] = filter_null(
            node.xpath(dd["novelImgUrl"]).extract())
        search_novel["img"] = add_head(dd["websiteUrl"], search_novel["img"])
        search_novel["status"] = filter_null(
            node.xpath(dd["novelStatus"]).extract())
        yield scrapy.Request(search_novel['url'],
                             callback=self.parse_info,
                             meta={'novel': search_novel, 'dd': dd})

def parse_chapter(self, response):
    chapter_url = response.url
    book_name_pinyin = "".join(chapter_url.split('/')[-2:-1])
    chapter_id = "".join(chapter_url.split('/')[-1:]).replace(".html", "")
    page = response.selector
    chapter_content = "\n".join(
        page.xpath('.//div[@id="content"]/p/text()').getall())
    # print('content', chapter_content)
    response.meta["book_name_pinyin"] = book_name_pinyin
    response.meta["chapter_id"] = chapter_id
    response.meta["chapter_content"] = chapter_content
    response.meta["chapter_url"] = chapter_url
    list_more_code = re.findall(self.pattern_more_url, response.text)
    if len(list_more_code) > 0:
        more_code = list_more_code[0]
        content_more_url = (
            "%s/index.php?c=book&a=read.jsonp&callback=%s&pinyin=%s&id=%s"
            % (self.base_url, more_code, book_name_pinyin, chapter_id))
        yield Request(content_more_url,
                      callback=self.parse_more_content,
                      headers={'Host': 'www.quanben.io',
                               'Referer': chapter_url},
                      meta=copy.copy(response.meta))
    else:
        novel_item = NovelItem()
        novel_item['book_type'] = response.meta["type_name"]
        novel_item['book_name'] = response.meta["book_name"]
        novel_item['book_url'] = response.meta["book_url"]
        novel_item['book_thumb'] = response.meta["book_thumb"]
        novel_item['book_name_pinyin'] = response.meta["book_name_pinyin"]
        novel_item['book_author'] = response.meta["book_author"]
        novel_item['book_desc'] = response.meta["book_desc"]
        novel_item['chapter_name'] = response.meta["chapter_name"]
        novel_item['chapter_id'] = response.meta["chapter_id"]
        novel_item['chapter_url'] = response.meta["chapter_url"]
        novel_item['chapter_content'] = chapter_content
        print(response.meta["book_name"], response.meta["chapter_name"],
              response.meta["chapter_url"])
        yield novel_item

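# parse_more_content (not shown) receives a JSONP payload of the form
# `callbackName({...});` from the read.jsonp endpoint above. A minimal,
# hypothetical helper for unwrapping such a response before json.loads();
# the sample string below is made up, not captured from quanben.io:
import json
import re

def strip_jsonp(text):
    match = re.match(r'^[^(]+\((.*)\)\s*;?\s*$', text, re.S)
    return json.loads(match.group(1)) if match else None

print(strip_jsonp('cb({"content": "..."});'))  # -> {'content': '...'}
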
def parse(self, response):
    books = response.xpath('//dd/table/tr[@bgcolor="#FFFFFF"]')
    for book in books:
        item = NovelItem()
        item['novel_url'] = book.xpath('./td[1]/a[2]/@href').extract()[0]
        item['name'] = book.xpath('./td[1]/a[2]/text()').extract()[0]
        item['author'] = book.xpath('./td[3]/text()').extract()[0]
        item['numbers'] = book.xpath('./td[4]/text()').extract()[0]
        item['last_time'] = book.xpath('./td[5]/text()').extract()[0]
        item['status'] = book.xpath('./td[6]/text()').extract()[0]
        yield item
        yield scrapy.Request(item['novel_url'],
                             callback=self.get_chapter,
                             meta={'novel_url': item['novel_url'],
                                   'name': item['name']})
    next_page = response.xpath(
        '//dd[@class="pages"]/div/a[12]/@href').extract()
    if next_page:
        yield scrapy.Request(next_page[0])

def parse(self, response):
    logging.info('#####NovelSpider:parse()#####')
    novelitem = NovelItem()
    content = response.xpath("//div[@class='content']/div")
    novelitem['picture'] = content[0].xpath(
        "//div[@class='imgShow']/img/@src").extract()[0]
    novelitem['name'] = content[0].xpath(
        "//div[@class='tit']/h1/text()").extract()[0].strip()
    novelitem['status'] = content[0].xpath(
        "//div[@class='tit']/span/text()").extract()[0].strip()
    novelitem['author'] = content[0].xpath(
        "//div[@class='author']//a/text()").extract()[0].strip()
    novelitem['author_href'] = 'http://book.easou.com' + content[0].xpath(
        "//div[@class='author']//a/@href").extract()[0]
    novelitem['type'] = content[0].xpath(
        "//div[@class='kind']//a/text()").extract()[0].strip()
    novelitem['type_href'] = 'http://book.easou.com' + content[0].xpath(
        "//div[@class='kind']//a/@href").extract()[0]
    novelitem['update_time'] = content[0].xpath(
        "//div[@class='updateDate']/span/text()").extract()[0]
    novelitem['source'] = content[0].xpath(
        "//div[@class='source']/span/text()").extract()[0].strip()
    novelitem['description'] = content[0].xpath(
        "//div[@class='desc']/text()").extract()[0].strip()
    novelitem['latest_chapters'] = content[0].xpath(
        "//div[@class='last']/a/text()").extract()[0].strip().split(' ')[1]
    novelitem['chapters_categore_href'] = content[0].xpath(
        "//div[@class='allcategore']//a/@href").extract()[0]
    logging.info('#####NovelSpider:parse():novelitem info:{0}#####'.format(
        novelitem))
    yield scrapy.Request('http://book.easou.com' +
                         novelitem['chapters_categore_href'],
                         method='GET',
                         callback=self.get_page_urls,
                         meta={'novel_detail': novelitem})

def parse_detail(self, response):
    item = NovelItem()
    print(response.url)
    novel_title = response.xpath(
        '//div[@class="introduce"]/h1/text()').extract()[0]
    novel_time = response.xpath(
        '//div[@class="introduce"]/p[2]/span[1]/text()').extract()[0]
    novel_name = response.xpath(
        '//div[@class="introduce"]/p[2]/span[2]/a/text()').extract()[0]
    novel_state = response.xpath(
        '//div[@class="introduce"]/p[2]/span[3]/text()').extract()[0]
    novel_brief_introduction = response.xpath(
        '//p[@class="jj"]/text()').extract()[0]
    novel_urls = response.xpath('//div[@class="ml_list"]/ul/li')
    try:
        novel_url = novel_urls.xpath('./a/@href').extract()
        chapter_name = novel_urls.xpath('./a/text()').extract()
    except Exception:
        # Chapters rendered as <b onclick="window.open('...')"> instead of <a>
        novel_url = [u.replace("window.open('", "").replace("')", "")
                     for u in novel_urls.xpath('./b/@onclick').extract()]
        chapter_name = novel_urls.xpath('./b/text()').extract()
    # print(novel_url)
    # print(chapter_name)
    item['novel_title'] = novel_title
    item['novel_time'] = novel_time
    item['novel_name'] = novel_name
    item['novel_state'] = novel_state
    item['novel_brief_introduction'] = novel_brief_introduction
    item['novel_url'] = response.url
    # item['chapter_name'] = chapter_name
    yield item
    # //span[@id="articlecontent"]/text()