def parse(self, response):
    # Each <h4> under the rank list holds one novel's name and link.
    for book in response.css(
            '#rank-view-list > div > ul > li > div.book-mid-info > h4'):
        item = QidianItem()
        item['name'] = book.css('a::text').extract()[0]
        item['url'] = "http:" + book.css('a::attr(href)').extract()[0]
        yield item
    # data-pagemax carries the total page count; follow every ranking page.
    next_page = response.css('#page-container::attr(data-pagemax)').extract()
    if next_page:  # .extract() returns a list, so test truthiness, not None
        last_page = int(next_page[0]) + 1
        for page in range(1, last_page):
            url = response.urljoin(
                "https://www.qidian.com/rank/yuepiao?chn=21&page=" + str(page))
            # yield scrapy.Request(url, self.parse)  # older style
            yield response.follow(url, callback=self.parse)
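# The snippets in this section all assume a QidianItem declared in the
# project's items.py. The real definition is not shown anywhere here, so the
# following is only a minimal sketch covering the two fields used above;
# the other snippets add further fields the same way.
import scrapy

class QidianItem(scrapy.Item):
    name = scrapy.Field()  # novel title
    url = scrapy.Field()   # novel detail-page URL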
def parse_content(self, response):
    title_sum = []
    for block in response.xpath('//div[@class="main-text-wrap"]'):
        # Note: these '//' paths search the whole page, not just this block;
        # './/' would scope them to the current <div>.
        title = block.xpath(
            '//div[@class="text-head"]/h3[@class="j_chapterName"]/text()'
        ).extract()
        content = block.xpath(
            '//div[@class="read-content j_readContent"]/p/text()').extract()
        # Split the title into characters to locate the space that separates
        # "第N章" from the chapter name.
        kong_list = list(''.join(title))
        if ' ' in kong_list:
            kong_ge = kong_list.index(' ')  # index of the space
            # Characters between "第" and "章" are the chapter ordinal.
            kong_ge_str = "".join(kong_list[1:kong_ge - 1])
            title_sum.append(kong_ge_str)
    # The collected titles arrive out of order and still need sorting into
    # reading order, e.g. sorted(title_sum, key=...) with a numeric key;
    # sort(reverse=False) sorts ascending (the default).
    item = QidianItem()
    item['title'] = title_sum
    # item['content'] = content
    yield item
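# The character slicing above recovers the ordinal from titles shaped like
# "第十二章 章节名". A regex states that intent more directly; this is a
# suggested alternative under the same title-shape assumption, not code
# from the original project.
import re

def chapter_ordinal(title):
    """Return the ordinal between 第 and 章, e.g. '十二', or None."""
    match = re.match(r'第(.+?)章', title)
    return match.group(1) if match else None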
def parse(self, response):
    bot = Selector(response)
    csrfToken = self.get_cookies('_csrfToken', response)
    contents = bot.xpath('//tbody/tr')
    for content in contents:
        item = QidianItem()
        item['book_type'] = content.xpath('td[1]/a[1]/text()').extract_first()
        item['book_sub_type'] = content.xpath('td[1]/a[2]/text()').extract_first()
        item['book_name'] = content.xpath('td[2]/a[1]/text()').extract_first()
        item['book_url'] = 'https:' + content.xpath('td[2]/a[1]/@href').extract_first()
        item['total_words'] = content.xpath('td[4]/span/text()').extract_first()
        item['author'] = content.xpath('td[5]/a/text()').extract_first()
        item['last_upload_date'] = content.xpath('td[6]/text()').extract_first()
        yield scrapy.Request(item['book_url'],
                             meta={'item': item, 'csrfToken': csrfToken},
                             callback=self.parse_detail)
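# get_cookies() is a helper the spider above relies on but does not define
# here. A plausible sketch, assuming the token arrives via Set-Cookie
# response headers (the actual implementation may differ):
def get_cookies(self, name, response):
    for header in response.headers.getlist('Set-Cookie'):
        for part in header.decode('utf-8').split(';'):
            key, _, value = part.strip().partition('=')
            if key == name:
                return value
    return None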
def parse(self, response):
    li_list = response.css(".book-img-text li")
    for li in li_list:
        item = QidianItem()
        item["title"] = li.css(".book-mid-info h4 a::text")[0].extract()
        item["url"] = "https:" + li.css(
            ".book-mid-info h4 a::attr(href)")[0].extract()
        item["author"] = li.css(".book-mid-info .author a::text")[0].extract()
        # Every <a> after the first is a category tag; join them with spaces.
        category = " ".join(
            a.css("a::text")[0].extract()
            for a in li.css(".book-mid-info .author a")[1:])
        item["category"] = category
        item["status"] = li.css(".book-mid-info .author span::text")[0].extract()
        item["bref"] = li.css(".book-mid-info .intro::text")[0].extract().strip()
        yield scrapy.Request(
            item['url'],
            callback=self.book_intro,
            meta={"item": item},
            headers={
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                              "AppleWebKit/537.36 (KHTML, like Gecko) "
                              "Chrome/63.0.3239.132 Safari/537.36"
            })
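# Instead of attaching the User-Agent to each Request, the same header can be
# set once for the whole crawl via the standard USER_AGENT setting in
# settings.py:
USER_AGENT = ("Mozilla/5.0 (Windows NT 10.0; WOW64) "
              "AppleWebKit/537.36 (KHTML, like Gecko) "
              "Chrome/63.0.3239.132 Safari/537.36")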
def parse_book(self, response):  # parse the book detail page
    head = response.xpath("//div[@class='book-info ']")  # info block root
    title = head.xpath(".//h1/em/text()").get()           # book title
    author = head.xpath(".//h1/span/a/text()").get()      # author
    # Serialization status and related tags, space-separated.
    status = ''
    for status1 in head.xpath(".//p/span/text()").getall():
        status = status + status1 + ' '
    # Category tags, space-separated. (The item field is named "type"; the
    # local variable avoids shadowing the builtin.)
    book_type = ''
    for ty in head.xpath(".//p[@class='tag']/a/text()").getall():
        book_type = book_type + ty + ' '
    brief = head.xpath(".//p[@class='intro']/text()").get()  # one-line blurb
    image = response.xpath(
        ".//div[@class='book-img']//img/@src").get()          # cover image
    image = response.urljoin(image)
    # Long-form introduction paragraphs, stripped and newline-joined.
    contents = response.xpath(
        ".//div[@class='book-intro']/p/text()").getall()
    contents = [content.strip() for content in contents]
    content = ''
    for content1 in contents:
        content = content + content1 + '\n'
    item = QidianItem(title=title, author=author, status=status,
                      type=book_type, brief=brief, contents=content,
                      image=image, url=response.url)
    yield item
def parse_page(self, response):
    if self.num >= 21000:
        return
    selector = Selector(response)
    item = QidianItem()
    item['url'] = response.url
    book_info = selector.xpath('//div[@class="book-info "]')
    item['link'] = book_info.xpath(
        '//a[@class="red-btn J-getJumpUrl "]/@href').extract()[0]
    item['name'] = book_info.xpath('h1/em//text()').extract()[0]
    abst = "".join(selector.xpath(
        '//div[@class="book-intro"]/p')[0].xpath('text()').extract())
    # Chain the cleanups; assigning each step to item['intro'] separately,
    # as the original did, kept only the last one.
    abst = abst.replace('\n', '').replace('\r', '').replace('\t', ' ')
    item['intro'] = abst.strip()
    book_tags = book_info.xpath('p')[0]
    book_tags_hrefs = book_tags.xpath('a/text()')
    item['major_category'] = book_tags_hrefs[0].extract()
    # Under Python 3 these are already text; the original's .encode('utf-8')
    # produced bytes that broke the string comparison and concatenation below.
    na = item['name']
    mc = item['major_category']
    intr = item['intro']
    li = item['link']
    # Cap the number of books collected per major category.
    if li == "" or mc not in self.dic or self.dic[mc] >= self.max_number:
        return
    self.dic[mc] += 1
    self.num += 1
    yield Request("http:" + li, callback=self.parse_content)
    # Append this book's fields to four parallel text files.
    with open('title2.txt', 'a') as f0, open('tag2.txt', 'a') as f1, \
            open('abs2.txt', 'a') as f2, open('link2.txt', 'a') as f3:
        f0.write(na + '\n')
        f1.write(mc + '\n')
        f2.write(intr + '\n')
        f3.write(li + '\n')
    yield item
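# Writing the four files from inside the spider works, but the Scrapy idiom
# is an item pipeline. A minimal sketch (the class name is hypothetical;
# field names match the spider above), to be registered in ITEM_PIPELINES:
class QidianFilePipeline:
    FILES = {'name': 'title2.txt', 'major_category': 'tag2.txt',
             'intro': 'abs2.txt', 'link': 'link2.txt'}

    def open_spider(self, spider):
        self.handles = {field: open(path, 'a', encoding='utf-8')
                        for field, path in self.FILES.items()}

    def close_spider(self, spider):
        for handle in self.handles.values():
            handle.close()

    def process_item(self, item, spider):
        for field, handle in self.handles.items():
            handle.write(item[field] + '\n')
        return item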
def parse_info(self, response):
    selector = etree.HTML(response.text)  # lxml.etree
    item = QidianItem()
    item['id'] = response.meta['id']
    item['name'] = selector.xpath('//div[@class="book-info "]/h1/em/text()')[0]
    item['author'] = selector.xpath('//div[@class="book-info "]/h1/span/a/text()')[0]
    item['introduce'] = selector.xpath('//p[@class="intro"]/text()')[0]
    item['yuepiao'] = selector.xpath('//p[@class="num"]/i/text()')[0]      # monthly tickets
    item['dashang'] = selector.xpath('//i[@class="rewardNum"]/text()')[0]  # reward count
    yield item
def parse_item(self, response):
    item = QidianItem()
    item['title'] = response.xpath(
        '//*[@id="divBookInfo"]/div[1]/h1/text()').extract()[0].strip()
    item['name'] = response.xpath(
        '//*[@id="divBookInfo"]/div[1]/span/a/span/text()').extract()[0].strip()
    item['week_click'] = response.xpath(
        '//*[@id="contentdiv"]/div/div[1]/table/tr/td[2]/text()').extract()[1].strip()
    return item
class SpiderSpider(scrapy.Spider):
    name = 'spider'
    allowed_domains = ['qidian.com']
    start_urls = ['http://qidian.com/']
    # One shared item: this spider follows a single novel chapter by chapter
    # and accumulates everything into it.
    item = QidianItem()

    # Collect the target novel's detail-page URL.
    def parse(self, response):
        # Adjust this XPath to pick a different novel to crawl.
        url_list = response.xpath(
            '/html/body/div[1]/div[7]/div[1]/div/ul/li[1]/strong/a/@href'
        ).extract()
        for url in url_list:
            yield scrapy.Request(url='https:' + url,
                                 meta={'item': self.item},
                                 callback=self.parse_one)

    def parse_one(self, response):
        item = response.meta['item']
        item['text'] = []
        item['chapter_name'] = []
        item['name'] = response.xpath(
            '/html/body/div/div[6]/div[1]/div[2]/h1/em/text()').extract_first()
        chapter_list = response.xpath(
            '//*[@id="j-catalogWrap"]/div[2]/div/ul/li/a/@href').extract()
        # Start from the first chapter; parse_two then follows "next" links.
        yield scrapy.Request(url='https:' + chapter_list[0],
                             meta={'item': self.item},
                             callback=self.parse_two)

    def parse_two(self, response):
        item = response.meta['item']
        item['text_list'] = response.xpath(
            '//*[@class="read-content j_readContent"]/p/text()').extract()
        item['chapter_name'].append(response.xpath(
            '//*[@class="j_chapterName"]/span/text()').extract_first())
        url = response.xpath('//*[@id="j_chapterNext"]/@href').extract_first()
        # Page-level JS carries whether the next chapter is VIP-only
        # and the id of the next chapter.
        nextChapterVip = re.findall(r'g_data.nextChapterVip = (\d);',
                                    response.text)[0]
        nextId = re.findall(r'nextId :(.*?),', response.text)[0]
        item['text'].append(' \n\n'.join(item['text_list']))
        if nextChapterVip == '0' and nextId != '-1':
            yield scrapy.Request(url='https:' + url,
                                 meta={'item': self.item},
                                 callback=self.parse_two)
        else:
            yield item
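# The two re.findall(...)[0] lookups above raise IndexError whenever the
# inline g_data blob is missing from the page. A defensive variant
# (hypothetical helper, same patterns) returns None instead:
import re

def extract_js_field(text, pattern):
    match = re.search(pattern, text)
    return match.group(1) if match else None

# e.g. extract_js_field(response.text, r'g_data.nextChapterVip = (\d);')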
def parse(self, response):
    # Names of the novel categories.
    novel_type_list = response.xpath(
        '//dl[@class ="cf"]//dd//span//i/text()').extract()
    # URLs of the novel categories.
    novel_type_url_list = response.xpath(
        '//dl[@class ="cf"]//dd//a/@href').extract()
    # Keep only the first twelve categories (xuanhuan, fantasy, wuxia,
    # xianxia, urban, realism, military, history, gaming, sports, sci-fi,
    # mystery/supernatural); the remaining ones use a different page layout.
    for novel_type, novel_type_url in zip(novel_type_list[0:-2],
                                          novel_type_url_list[0:-2]):
        item = QidianItem()
        item['novel_type'] = novel_type
        # Category listing page.
        URL = "https://www.qidian.com" + novel_type_url
        yield scrapy.Request(URL,
                             callback=self.parse_kind_parse,
                             meta={'item': item})
def parse(self, response):
    items = response.css('.all-book-list li')
    for item in items:
        # Instantiate inside the loop: reusing one QidianItem across
        # iterations (as the original did) makes every yielded item share
        # the same underlying dict.
        data = QidianItem()
        data['name'] = item.css('.book-mid-info a::text').extract_first()
        data['author'] = item.css('.author a.name::text').extract_first()
        data['img'] = item.css('.book-img-box a img::attr(src)').extract_first()
        data['url'] = item.css('.book-img-box a::attr(href)').extract_first()
        data['state'] = item.css('.author span::text').extract_first()
        data['type'] = item.css('.author a.go-sub-type::text').extract_first()
        data['intro'] = item.css('.intro::text').extract_first().strip()
        data['auturl'] = item.css('.author a.name::attr(href)').extract_first()
        yield data
def parse_content(self, response):
    for block in response.xpath('//div[@class="main-text-wrap"]'):
        # Note: these '//' paths search the whole page, not just this block;
        # './/' would scope them to the current <div>.
        title = block.xpath(
            '//div[@class="text-head"]/h3[@class="j_chapterName"]/text()'
        ).extract()
        content = block.xpath(
            '//div[@class="read-content j_readContent"]/p/text()').extract()
        # Split the title into characters to locate the separating space
        # (kept from the original, though unused in this variant).
        kong_list = list(''.join(title))
        item = QidianItem()
        item['title'] = title
        item['content'] = content
        yield item
def parse(self, response):
    """
    This function parses a property page.

    :param response: the response to parse
    :return: the loaded item

    @url http://vip.book.sina.com.cn/weibobook/cate.php?cate_id=1036&w=0&s=0&order=1&vt=4&page=3
    @returns items 1
    @scrapes book_id src title img_url state author chan_name synoptic platform platform_src
    """
    l = ItemLoader(item=QidianItem(), response=response)
    l.add_xpath(
        'book_id',
        '//div[@class="book_list"]/ul//li/div[@class="book_info"]/p[@class="book_name"]/a/@href',
        re='[0-9]+')
    l.add_xpath(
        'src',
        '//div[@class="book_list"]/ul//li/div[@class="book_info"]/p[@class="book_name"]/a/@href')
    l.add_xpath(
        'title',
        '//div[@class="book_list"]/ul//li/div[@class="book_info"]/p[@class="book_name"]/a/text()')
    l.add_xpath(
        'img_url',
        '//div[@class="book_list"]/ul//li/div[@class="img_box"]/a/img/@src')
    l.add_xpath(
        'state',
        '//div[@class="book_list"]/ul//li/div[@class="book_info"]/p[@class="book_author"]/text()',
        re='(?<=【).*?(?=】)')
    l.add_xpath(
        'author',
        '//div[@class="book_list"]/ul//li/div[@class="book_info"]/p[@class="book_author"]/span/text()')
    l.add_xpath('chan_name', '//div[@class="all-fr-title"]/text()')
    l.add_xpath(
        'synoptic',
        '//div[@class="book_list"]/ul//li/div[@class="book_info"]/p[@class="info"]/a/text()')
    l.add_value('platform', '新浪读书')  # "Sina Reading"
    l.add_value('platform_src', 'http://vip.book.sina.com.cn')
    return l.load_item()
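# The @url/@returns/@scrapes lines in the docstring above are Scrapy spider
# contracts, exercised from the command line rather than during a crawl:
#
#   scrapy check <spider_name>
#
# `scrapy check` downloads @url, runs this callback on the response, and
# asserts that at least one item is returned and that every field listed
# under @scrapes is populated.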
def get_third(self, response):
    # Every <p> of one chapter's body text.
    paragraphs = response.xpath(
        "//div[@class='read-content j_readContent']//p")
    # Join the paragraphs into a single chapter string.
    story = ''
    for p in paragraphs:
        story += p.xpath('./text()').extract()[0].strip() + '\n'
    item = QidianItem()
    item['story'] = story
    yield item
def parse_item(self, response):
    item = QidianItem()
    item['book_name'] = response.xpath(
        '//div[@class="book-info "]/h1/em/text()').extract()[0]
    item['author'] = response.xpath(
        '//div[@class="book-info "]/h1/span/a/text()').extract()[0]
    item['status'] = response.xpath(
        '//span[@class="blue"]/text()').extract()[0]
    # Qidian obfuscates the word count with a per-page web font; the span's
    # class name doubles as the .woff file name.
    classname = response.xpath(
        '//div[@class="book-info "]/p[3]/em[1]/span/@class').extract()[0]
    url = "https://qidian.gtimg.com/qd_anti_spider/{}.woff".format(classname)
    resp = response.body.decode('utf-8')
    pattern = re.compile(
        r'</style><span class="\w+">(.*?);</span></em><cite>')
    word_list = pattern.search(resp).group(1).split(';')
    word_count = parse_font(url, word_list)  # decode glyphs back to digits
    item['word_count'] = word_count + '万字'
    item['ticket'] = response.xpath(
        '//*[@id="monthCount"]/text()').extract_first()
    yield item
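# parse_font() is assumed above but not defined in this section. A plausible
# sketch of what it does: fetch the obfuscation .woff, map each character
# reference in word_list to a glyph name through the font's cmap, and turn
# glyph names back into digits. fontTools and requests are assumed; treat
# this as an illustration, not the original helper.
import io
import requests
from fontTools.ttLib import TTFont

GLYPH_TO_DIGIT = {'zero': '0', 'one': '1', 'two': '2', 'three': '3',
                  'four': '4', 'five': '5', 'six': '6', 'seven': '7',
                  'eight': '8', 'nine': '9', 'period': '.'}

def parse_font(url, word_list):
    font = TTFont(io.BytesIO(requests.get(url).content))
    cmap = font.getBestCmap()  # unicode codepoint -> glyph name
    digits = []
    for word in word_list:     # entries look like '&#100317'
        codepoint = int(word.strip('&#'))
        digits.append(GLYPH_TO_DIGIT.get(cmap.get(codepoint), ''))
    return ''.join(digits)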
def parse_page(self, response):
    selector = Selector(response)
    item = QidianItem()
    item['url'] = response.url

    # Book cover image.
    book_img = selector.xpath('//a[@id="bookImg"]/img')
    item['image'] = 'http:' + book_img[0].xpath('@src').extract()[0].strip()

    # Book info block.
    book_info = selector.xpath('//div[@class="book-info "]')
    item['name'] = book_info.xpath('h1/em//text()').extract()[0]
    item['author'] = book_info.xpath('h1/span/a/text()').extract()[0]
    intro = selector.xpath('//div[@class="book-intro"]/p/text()').extract()[0]
    intro = intro.replace('\n', '').replace('\r', '').replace('\t', ' ')
    item['intro'] = intro.strip()

    tmp = book_info.xpath('p')
    book_tags = tmp[0]
    book_statistics = tmp[2].xpath('em/text()')
    book_statistics_desc = tmp[2].xpath('cite//text()')
    assert len(book_statistics_desc) == 7

    # Book status tags: serialization progress, contract status, pay status.
    item['sign_status'] = u'未签'  # default: not contracted
    for book_tags_span in book_tags.xpath('span/text()'):
        status = book_tags_span.extract()
        if status == u'连载' or status == u'完本':    # serializing / finished
            item['progress'] = status
        elif status == u'签约':                       # contracted
            item['sign_status'] = status
        elif status == u'VIP' or status == u'免费':   # paid / free
            item['pay_status'] = status

    # Book categories.
    book_tags_hrefs = book_tags.xpath('a/text()')
    item['major_category'] = book_tags_hrefs[0].extract()
    item['minor_category'] = book_tags_hrefs[1].extract()

    # Book statistics: each <em> holds a number, each <cite> its unit/label.
    text_count_desc = book_statistics_desc[0].extract()
    click_count_desc = book_statistics_desc[1].extract()
    weekly_click_count_desc = book_statistics_desc[3].extract()
    recommend_count_desc = book_statistics_desc[4].extract()
    weekly_recommend_count_desc = book_statistics_desc[6].extract()

    total_text_count = float(book_statistics[0].extract())
    if text_count_desc[0] == u'万':  # 万 = ten thousand
        total_text_count = total_text_count * 10000.0
    total_text_count = int(total_text_count)

    total_click_count = float(book_statistics[1].extract())
    if click_count_desc[0] == u'万':
        total_click_count = total_click_count * 10000.0
    total_click_count = int(total_click_count)

    # e.g. "会员周点击3.25万" (VIP weekly clicks: 3.25万).
    beg_pos = weekly_click_count_desc.find(u'会员周点击') + len(u'会员周点击')
    end_pos = weekly_click_count_desc.find(u'万', beg_pos)
    adjust_end_pos = end_pos if end_pos >= 0 else len(weekly_click_count_desc)
    vip_weekly_click_count = float(
        weekly_click_count_desc[beg_pos:adjust_end_pos])
    if end_pos > 0:
        vip_weekly_click_count = vip_weekly_click_count * 10000.0
    vip_weekly_click_count = int(vip_weekly_click_count)

    # Variable name kept to match the field as defined in QidianItem
    # ("toal" sic).
    toal_recommend_count = float(book_statistics[2].extract())
    if recommend_count_desc[0] == u'万':
        toal_recommend_count = toal_recommend_count * 10000.0
    toal_recommend_count = int(toal_recommend_count)

    # e.g. "周13.52万" (weekly: 13.52万).
    beg_pos = weekly_recommend_count_desc.find(u'周') + len(u'周')
    end_pos = weekly_recommend_count_desc.find(u'万', beg_pos)
    adjust_end_pos = end_pos if end_pos >= 0 else len(weekly_recommend_count_desc)
    weekly_recommend_count = float(
        weekly_recommend_count_desc[beg_pos:adjust_end_pos])
    if end_pos > 0:
        weekly_recommend_count = weekly_recommend_count * 10000.0
    weekly_recommend_count = int(weekly_recommend_count)

    item['total_text_count'] = total_text_count
    item['total_click_count'] = total_click_count
    item['vip_weekly_click_count'] = vip_weekly_click_count
    item['toal_recommend_count'] = toal_recommend_count
    item['weekly_recommend_count'] = weekly_recommend_count

    try:
        item['monthly_pass_count'] = selector.xpath(
            '//i[@id="monthCount"]/text()').extract()[0]
    except IndexError:
        item['monthly_pass_count'] = 0
    item['weekly_reward_count'] = selector.xpath(
        '//i[@id="rewardNum"]/text()').extract()[0]
    # Score and evaluator count are populated via AJAX and would need a
    # browser-rendered page, as in this commented-out variant:
    # self.browser.get(response.url)
    # page = self.browser.page_source
    # selector = Selector(text=page)
    # score_lhs = selector.xpath('//cite[@id="score1"]/text()').extract()[0]
    # score_rhs = selector.xpath('//i[@id="score2"]/text()').extract()[0]
    # item['score'] = score_lhs + '.' + score_rhs
    # item['evaluate_users'] = selector.xpath(
    #     '//p[@id="j_userCount"]/span/text()').extract()[0]
    yield item
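# Both find()/slice blocks above decode labeled counts such as
# "会员周点击3.25万" and "周13.52万". A small helper (hypothetical, under the
# same "<label><number>[万]" assumption) expresses the pattern once:
import re

def parse_count(text, label):
    match = re.search(re.escape(label) + r'([\d.]+)(万?)', text)
    if not match:
        return 0
    value = float(match.group(1))
    return int(value * 10000) if match.group(2) else int(value)

# e.g. parse_count(weekly_click_count_desc, u'会员周点击')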
def catalog_item(self, response):
    l = ItemLoader(item=QidianItem(), response=response)
    l.add_xpath('title', '//div[@class="new_charpet"]/a/text()')
    l.add_xpath('src', '//div[@class="new_charpet"]/a/@href')
    return l.load_item()