def get_chapterurl(self, response):
    """Parse a novel's detail page into an item and request its chapter list.

    Expects ``meta['name']`` / ``meta['url']`` forwarded from the previous
    callback.

    BUG FIX: the original stored ``str(category).encode('UTF-8')`` (bytes)
    in ``item['category']`` while every other field is text; all fields are
    now kept as ``str`` for consistency.
    """
    item = DingdianItem()
    item['name'] = response.meta['name']
    item['novelurl'] = response.meta['url']
    soup = BeautifulSoup(response.text, 'lxml')
    # The metadata lives in <meta name="og:novel:*" content="..."> tags, e.g.
    #   <meta name="og:novel:category" content="..."/>
    # so we match on the `name` attribute — find/find_all by tag name
    # cannot locate these.
    category = soup.find(attrs={'name': 'og:novel:category'})['content']
    author = soup.find(attrs={'name': 'og:novel:author'})['content']
    bash_url = soup.find(attrs={'name': 'og:novel:read_url'})['content']
    # The numeric book id sits in the last few URL characters, possibly
    # followed by a slash.
    name_id = str(response.url)[-6:-1].replace('/', '')
    item['category'] = str(category)
    item['author'] = str(author)
    item['name_id'] = name_id
    # yield (not return) so the generator stays alive and can also emit
    # the follow-up request below.
    yield item
    yield Request(url=bash_url, callback=self.get_chapter,
                  meta={'name_id': name_id})
def get_chapterurl(self, response):
    """Parse the novel info page into a DingdianItem, yield it, then request
    the latest-chapter page.

    Expects ``meta['name']`` / ``meta['url']`` from the upstream callback.
    """
    print("==========get_chapterurl===============")
    item = DingdianItem()
    chaptSoup = BeautifulSoup(response.text, 'lxml')
    # chr(0xa0) is the NBSP that &nbsp; decodes to — strip it from all text.
    item['name'] = response.meta['name'].replace(chr(0xa0), '')
    item['author'] = chaptSoup.find('table').find_all(
        'td')[1].get_text().replace(chr(0xa0), '')
    item['novelurl'] = response.meta['url']
    item['serialnumber'] = chaptSoup.find('table').find_all(
        'tr')[1].find_all('td')[1].get_text().replace(chr(0xa0), '')
    item['serialstatus'] = chaptSoup.find('table').find_all(
        'tr')[0].find_all('td')[2].get_text().replace(chr(0xa0), '')
    item['category'] = chaptSoup.find('table').find('a').get_text()
    #latestChapterUrl = chaptSoup.find('p', class_='btnlinks').find('a', class_="read")['href']
    #item['novel_id'] = latestChapterUrl[-16:-11]  # broke when the id had a different digit count
    # Pull the numeric id out of the novel URL instead of slicing by position.
    item['novel_id'] = int(
        re.findall(r"xiaoshuo/(.+?).html", item['novelurl'])[0])
    # If only the item were needed, `return item` would be enough to reach
    # pipelines.py; yield keeps the generator running so the follow-up
    # request below is also emitted.
    # return item
    yield item
    latestChapterUrl = chaptSoup.find('p', class_='btnlinks').find(
        'a', class_="read")['href']
    yield Request(url=latestChapterUrl,
                  callback=self.get_chapter,
                  meta={"novel_id": item['novel_id']})
def parse_books(self, response):  # parse the book overview / landing page
    """Fill every item field from the overview page, dedupe on the MD5 of
    the book URL via a Redis set, and follow the catalog link."""
    item = DingdianItem()
    item['category'] = response.xpath(
        '//tr/td[1]/a/text()').extract_first()  # novel category
    item['book_author'] = response.xpath(
        '//tr[1]/td[2]/text()').extract_first().strip()  # author, whitespace trimmed
    # First run of digits in the URL path is the book id.
    item['book_id'] = re.findall(r'/[0-9]+', response.url)[0][1:]
    item['book_name'] = response.xpath(
        '//*[@id="content"]/dd[1]/h1/text()').extract_first()[:-4]  # title; [:-4] drops a fixed suffix
    item['book_status'] = response.xpath(
        '//tr[1]/td[3]/text()').extract_first().strip()  # serialization status
    item['book_url'] = response.url
    item['clicks'] = response.xpath(
        '//tr[3]/td[1]/text()').extract_first().strip()  # total clicks
    item['recommend'] = response.xpath(
        '//tr[4]/td[1]/text()').extract_first().strip()  # total recommendations
    item['book_img_url'] = response.xpath(
        '//*[@id="content"]/dd[2]/div[1]/a/img/@src').extract_first()
    item['summary'] = response.xpath('//dd/p[2]').extract_first()  # synopsis (raw HTML node)
    item['length'] = response.xpath(
        '//tr[2]/td[2]/text()').extract_first().strip()  # total word count
    item['latest_update_time'] = response.xpath(
        '//tr[2]/td[3]/text()').extract_first().strip()  # last update time
    item['flag'] = 1
    catalog = response.xpath(
        '//div/p[2]/a[1]/@href').extract_first()  # link to the chapter catalog
    url_md5 = hashlib.md5(item['book_url'].encode('gb2312')).hexdigest()
    # sadd returns 1 only for unseen members, so each book is emitted once.
    if self.r.sadd('books', url_md5):
        yield item
        yield scrapy.Request(url=catalog, callback=self.parse_catalog)
def parse_details(self, response):
    """Extract the book metadata from the detail-page table and yield one item."""
    item = DingdianItem()
    item['book_name'] = response.xpath('//h1/text()').extract_first()
    # The remaining fields all live in the <table id="at"> grid; map each
    # item field to its cell xpath and fill them in one pass.
    cell_xpaths = {
        'book_anthor': '//table[@id="at"]/tr[1]/td[2]/text()',
        'book_type': '//table[@id="at"]/tr[1]/td[1]/a/text()',
        'book_status': '//table[@id="at"]/tr[1]/td[3]/text()',
        'book_words': '//table[@id="at"]/tr[2]/td[2]/text()',
        'book_time': '//table[@id="at"]/tr[2]/td[3]/text()',
        'book_click_nums': '//table[@id="at"]/tr[3]/td[1]/text()',
    }
    for field, xp in cell_xpaths.items():
        item[field] = response.xpath(xp).extract_first()
    yield item
def get_chapterurl(self, response):
    """Fill a DingdianItem from the novel info page, then follow its chapter list."""
    item = DingdianItem()
    # NBSP characters (\xa0) leak in from the HTML; strip them from the name.
    item['name'] = str(response.meta['name']).replace('\xa0', '')
    item['novelurl'] = response.meta['url']
    info_table = BeautifulSoup(response.text, 'lxml').find('table', id='at')
    chapter_list_url = BeautifulSoup(response.text,
                                     'lxml').find('a', class_='read')['href']
    # The prefix "http://www.23us.com/book/" is 25 characters long;
    # everything after it is the numeric book id.
    tail = len(item['novelurl']) - 25
    name_id = item['novelurl'][-tail:]
    item['category'] = info_table.find('a').get_text()
    item['author'] = info_table.find_all('td')[1].get_text().replace('\xa0', '')
    item['name_id'] = name_id
    yield item
    yield Request(chapter_list_url,
                  callback=self.get_chapter,
                  meta={'name_id': name_id,
                        'chapter_list_url': chapter_list_url})
def get_name(self, response):
    """Yield one item per novel row (bgcolor #FFFFFF) on the listing page."""
    for row in response.xpath("//tr[@bgcolor='#FFFFFF']"):
        item = DingdianItem()
        # The row's first <a>'s title attribute is the novel name; the
        # second <a>'s href points at the novel's own page.
        item['novel_name'] = row.xpath(".//a/@title").extract()[0]
        item['novel_url'] = row.xpath(".//a/@href").extract()[1]
        print(item)
        yield item
def get_novelcontent(self, response):
    """Scrape the novel introduction page into a DingdianItem and forward it
    via meta to the chapter-list callback (the item is NOT yielded here)."""
    #targentcontent=response.meta['targentcontent']
    #print targentcontent['novelurl'],targentcontent['name']
    #title = response.xpath('//dd[1]/h1/text()').extract_first()
    novel_instroduce_url = response.url  # novel introduction page URL
    novel_name = response.meta['name']  # novel title, forwarded by the caller
    chapterlisturl = response.meta['chapterlisturl']  # chapter-list page URL
    author = response.xpath(
        '//table/tr[1]/td[2]/text()').extract_first()  # author
    serialstatus = response.xpath(
        '//table/tr[1]/td[3]/text()').extract_first()  # serialization status
    serialnumber = response.xpath(
        '//table/tr[2]/td[2]/text()').extract_first()  # serialized word count
    category = response.xpath(
        '//table/tr[1]/td[1]/a/text()').extract_first()  # novel category
    name_id = chapterlisturl.split('/')[-1]  # novel id = last URL segment
    collect_num_total = response.xpath(
        '//table/tr[2]/td[1]/text()').extract_first()  # total bookmarks
    click_num_total = response.xpath(
        '//table/tr[3]/td[1]/text()').extract_first()  # total clicks
    #chapterlistul=response.xpath('//dd[2]/div[2]/p[2]/a/text()').extract_first()
    #chapterlisturl=response.xpath('//dd[2]/div[2]/p[2]/a/@href').extract_first()
    novel_breif = response.xpath('//dd[2]/p[2]').extract_first()
    '''
    print('novel_instroduce_url = %s' % novel_instroduce_url)
    print('chapterlisturl = %s' % chapterlisturl)
    print('author = %s' % len(author))
    print('serialstatus = %s' % len(serialstatus))
    print('serialnumber = %s' % len(serialnumber))
    print('category = %s' % len(category))
    print('name_id = %s' % name_id)
    print('collect_num_total = %s' % int(collect_num_total))
    print('click_num_total = %s' % int(click_num_total))
    '''
    targentcontent = DingdianItem()
    targentcontent['novel_name'] = novel_name
    targentcontent['author'] = author
    targentcontent['novel_instroduce_url'] = novel_instroduce_url
    targentcontent['novelurl'] = chapterlisturl
    targentcontent['serialstatus'] = serialstatus
    targentcontent['serialnumber'] = serialnumber
    targentcontent['category'] = category
    targentcontent['name_id'] = name_id
    targentcontent['collect_num_total'] = collect_num_total
    targentcontent['click_num_total'] = click_num_total
    targentcontent['novel_breif'] = novel_breif
    #print(u'novel_name=%s,author=%s,novel_instroduce=%s,serialstatus=%s,serialnumber=%s,category=%s,name_id=%s,collect_num_total=%s,click_num_total=%s,chapterlisturl=%s' % (novel_name,author,novel_instroduce,serialstatus,serialnumber,category,name_id,collect_num_total,click_num_total,chapterlisturl))
    #yield targentcontent
    # The half-built item rides along in meta so get_charaterurl can finish
    # it and emit it downstream.
    yield Request(chapterlisturl,
                  dont_filter=True,
                  callback=self.get_charaterurl,
                  meta={'targentcontent': targentcontent})
def get_chapterurl(self, response):
    """Parse the novel info page and return the populated item.

    BUG FIXES over the original:
      * '\xa0' (NBSP) is stripped from the name — the original replaced it
        with the literal string 'td';
      * the author cell uses find_all('td')[1] — find('td') returns a single
        Tag which cannot be indexed with an int;
      * the read link uses .find() — the original called the non-existent
        .fine(), which raised AttributeError.
    """
    item = DingdianItem()
    item['name'] = str(response.meta['name']).replace('\xa0', '')
    item['novelurl'] = response.meta['url']
    soup = BeautifulSoup(response.text, 'lxml')
    category = soup.find('table').find('a').get_text()
    author = soup.find('table').find_all('td')[1].get_text()
    bash_url = soup.find('p', class_='btnlinks').find('a',
                                                      class_='read')['href']
    # The numeric id sits in the last URL characters before a trailing slash.
    name_id = str(bash_url)[-6:-1].replace('/', '')
    item['category'] = str(category).replace('/', '')
    item['author'] = str(author).replace('/', '')
    item['name_id'] = name_id
    return item
def chapterurl(self, response):
    """Finish the item with page-derived fields and follow the reader link."""
    soup = BeautifulSoup(response.text, 'lxml')
    item = DingdianItem()
    item['name'] = response.meta['name']
    item['novelurl'] = response.meta['url']
    item['author'] = response.meta['author']
    # The category is the first link inside the grey (#E4E4E4) info table.
    item['category'] = soup.find('table',
                                 bgcolor='#E4E4E4').find('a').get_text()
    read_link = soup.find('p', class_='btnlinks').find('a', class_='read')
    bash_url = read_link['href']
    # The numeric book id sits just before the trailing slash of the read URL.
    item['name_id'] = str(bash_url)[-6:-1].replace('/', '')
    yield item
    yield Request(bash_url, callback=self.get_chapter,
                  meta={'name_id': item['name_id']})
def get_chapturl(self, response):
    """Derive category/author/id for a novel and return the finished item."""
    soup = BeautifulSoup(response.text, 'lxml')
    item = DingdianItem()
    # NBSPs (\xa0) leak in from the HTML; strip them from the name.
    item['name'] = str(response.meta['name']).replace('\xa0', '')
    item['novelurl'] = response.meta['url']
    # The sixth link under <dt> carries the category label.
    item['category'] = soup.find('dt').find_all('a')[5].get_text()
    # The <h3> header mixes author and other info; split on the
    # colon/NBSP/semicolon separators and keep the second piece.
    author_info = soup.find('h3').get_text()
    item['author'] = re.split(r'[:\xa0;]', author_info)[1]
    # Join the last two meaningful URL path segments into the id.
    item['name_id'] = ''.join(response.meta['url'].split('/')[-3:-1])
    return item
def get_chapterurl(self, response):
    """Build an item from the cells of the first table on the info page."""
    item = DingdianItem()
    item['name'] = str(response.meta['name']).replace('\xa0', '')
    item['novelurl'] = response.meta['url']
    cells = BeautifulSoup(response.text, 'lxml').find('table').find_all('td')

    def cell_text(idx):
        # Table cells are padded with NBSPs; strip them out.
        return str(cells[idx].get_text()).replace('\xa0', '')

    item['category'] = cell_text(0)
    item['author'] = cell_text(1)
    item['serialstatus'] = cell_text(2)
    item['serialnumber'] = cell_text(4)
    # The book id is embedded near the tail of the page URL.
    item['name_id'] = str(response.url)[-10:-5].replace('/', '')
    return item
def get_chapterurl(self, response):
    """Populate item fields from the info table plus forwarded meta values."""
    page = BeautifulSoup(response.text, 'lxml')
    tds = page.find('table').find_all('td')
    item = DingdianItem()
    item['name'] = response.meta['name']
    item['novelurl'] = response.meta['url']
    # The last 6 URL characters hold the id, possibly with a trailing slash.
    item['name_id'] = response.url[-6:].replace('/', '')
    # Each item field maps to a fixed cell position in the info table.
    for field, idx in (('category', 0), ('author', 1), ('serialstatus', 2),
                       ('serialnumber', 4), ('lastUpdate', 5)):
        item[field] = tds[idx].get_text()
    return item
def get_chapterurl(self, response):
    """Extract category/author from the <meta og:novel:*> tags, yield the
    item, then re-request the same URL with the chapter callback."""
    item = DingdianItem()
    # Strip NBSPs (\xa0) leaking in from the HTML.
    item['name'] = str(response.meta['name']).replace('\xa0', '')
    item['novelurl'] = response.meta['url']
    soup = BeautifulSoup(response.text, 'lxml')
    # The og:novel:* data lives in <meta> tag attributes, so match on the
    # `name` attribute — find/find_all by tag name cannot locate these.
    category = soup.find(attrs={'name':'og:novel:category'})['content']
    author = soup.find(attrs={'name':'og:novel:author'})['content']
    # Assumes the URL ends like ".../12345/" — slice drops the trailing slash.
    name_id = str(response.url)[-6:-1].replace('/', '')
    item['category'] = str(category)
    item['author'] = str(author)
    item['name_id'] = name_id
    yield item
    # NOTE(review): this re-fetches response.url itself (not a chapter-list
    # href) and passes the id under the meta key 'name' — confirm that
    # get_chapter really expects this URL and key.
    yield Request(
        response.url,
        callback=self.get_chapter,
        meta={'name':name_id}
    )
def parse(self, response):
    """Yield a title/hot item for each entry, then follow the next-page link."""
    entries = response.xpath("//div[contains(@class,'i_w')]")
    for entry in entries:
        item = DingdianItem()
        item["title"] = entry.xpath("div/div/strong/text()").extract_first()
        item["hot"] = entry.xpath("div/div/span/text()").extract_first()
        yield item
    # Pagination: keep crawling while a "next" anchor exists.
    next_page = response.xpath("//a[@class='next']/@href").extract_first()
    print("下一页:", next_page)
    if next_page:
        yield scrapy.Request(response.urljoin(next_page), callback=self.parse)
def get_chapterurl(self, response):
    """Yield a single item describing the novel on this page.

    Yielding (rather than returning) makes this a generator, so a caller
    can also step it manually with next().
    """
    item = DingdianItem()
    # NBSPs (\xa0) leak in from the HTML; strip them from the name.
    item['name'] = str(response.meta['name']).replace('\xa0', '')
    item['novelurl'] = response.meta['url']
    page = BeautifulSoup(response.text, 'lxml')
    info_table = page.find('table')
    read_url = page.find('p', class_='btnlinks').find('a',
                                                      class_='read')['href']
    item['category'] = str(info_table.find('a').get_text()).replace('/', '')
    item['author'] = str(
        info_table.find_all('td')[1].get_text()).replace('/', '')
    # The numeric id is sliced off the tail of the reader link.
    raw_id = str(read_url)[-6:-1].replace('/', '')
    item['name_id'] = str(raw_id).replace('/', '')
    yield item
def content_html(self, response):
    """Assemble one chapter item: title from the page, book/author from meta,
    and the body text flattened to a single string (MySQL cannot store a
    Python list directly).

    IMPROVEMENT: the chapter text is now built with a single ''.join over
    the stripped text nodes instead of repeated string concatenation in a
    loop (which is quadratic in the worst case).
    """
    item = DingdianItem()
    # Chapter title heading.
    title1 = response.xpath(
        '//*[@id="amain"]/dl/dd[1]/h1/text()').extract()[0]
    item['book'] = response.meta['title']      # book name
    item['article_title'] = title1             # chapter title
    item['author'] = response.meta['author']   # author
    # xpath returns a list of text nodes; strip each and join them.
    content_con = response.xpath('//*[@id="contents"]/text()').extract()
    item['content'] = ''.join(part.strip() for part in content_con)
    yield item
def get_chapter(self, response):
    """Extract author/category from the block_txt2 links and yield the item."""
    item = DingdianItem()
    item['name'] = response.meta['novel_name']
    item['novelurl'] = response.meta['novel_url']
    print(item['name'], item['novelurl'])
    # The raw <a> markup is extracted, then the link text is pulled out
    # with a regex (author is the 2nd link, category the 3rd).
    links = response.xpath('//div[@class="block_txt2"]/p/a').extract()
    author = re.search(r'>(.*?)</a>', links[1]).group(1)
    print(author)
    category = re.search(r'>(.*?)</a>', links[2]).group(1)
    print(category)
    # The id is the last path segment of the page URL.
    name_id = str(response.url.split(r'/')[-1])
    print(name_id)
    item['author'] = str(author)
    item['category'] = str(category)
    item['name_id'] = str(name_id)
    yield item
def get_chapterurl(self, response):
    """Parse the info table into the item, yield it, then request the
    chapter list via the "read" button link.

    BUG FIXES over the original:
      * ``sn = td[4].get_text()`` referenced the undefined name ``td``
        (the td list was bound to another name) — NameError;
      * the final ``yield Request`` used ``bash_url``/``name_id`` that were
        never defined in this scope — NameError. The read-link href and the
        item's name_id (from meta) are used instead, matching the sibling
        implementations in this file.
    """
    item = DingdianItem()
    # Strip NBSPs (\xa0) leaking in from the HTML.
    item['name'] = str(response.meta['name']).replace('\xa0', '')
    item['novelurl'] = response.meta['url']
    item['name_id'] = response.meta['name_id']
    soup = BeautifulSoup(response.text, 'lxml')
    tds = soup.find('table', id='at').find_all('td')
    category = tds[0].get_text()
    author = tds[1].get_text()
    serialstatus = tds[2].get_text()
    sn = tds[4].get_text()
    # The cell mixes text and digits; keep only the first run of digits.
    serialnumber = re.split(r'\D+', str(sn))[1]
    item['category'] = str(category).replace('/', '')
    item['author'] = str(author).replace('/', '')
    item['serialstatus'] = str(serialstatus).replace('/', '')
    item['serialnumber'] = str(serialnumber)
    yield item
    # Follow the "read" button to the chapter list.
    bash_url = soup.find('p', class_='btnlinks').find('a',
                                                      class_='read')['href']
    yield Request(url=bash_url, callback=self.get_chapter,
                  meta={'name_id': item['name_id']})
def get_chapterurl(self, response):
    """Return the finished item; all scraped fields come from the 550px info div."""
    page = BeautifulSoup(response.text, 'lxml')
    info_div = page.find('div', style='width:550px;')
    item = DingdianItem()
    # Strip NBSPs (\xa0) leaking in from the HTML.
    item['name'] = str(response.meta['name']).replace('\xa0', '')
    item['novel_url'] = response.meta['url']
    # First link in the info div is the category; its second cell is the author.
    item['category'] = str(info_div.find('a').get_text().strip())
    item['author'] = str(info_div.find_all('td')[1].get_text().strip())
    # Numeric id sliced from the tail of the "read" button's href.
    read_href = page.find('p', class_='btnlinks').find('a',
                                                       class_='read')['href']
    item['name_id'] = str(read_href[-6:-1].replace('/', ''))
    return item
def parse(self, response):
    """Collect name / poster image / detail URL for every movie on the page.

    Returns the full list (a downstream exporter needs it to persist the
    results — without the return nothing gets saved).

    FIX: the per-movie variable was named ``list``, shadowing the builtin;
    renamed to ``entry``.
    """
    results = []  # accumulates one item per movie on this page
    # Every film lives in an <li> under the ul with this exact class list.
    movies = response.xpath(
        '//ul[@class="list-unstyled vod-item-img ff-img-215"]/li')
    for movie in movies:
        entry = DingdianItem()
        # .xpath() returns a selector list; .extract()[0] takes the first
        # match's content.
        entry['name'] = movie.xpath(
            './/p[@class="image"]//img/@alt').extract()[0]
        entry['img'] = movie.xpath(
            './/p[@class="image"]//img/@data-original').extract()[0]
        entry['movie'] = 'http://nlook1.cn' + movie.xpath(
            './/p[@class="image"]/a/@href').extract()[0]
        results.append(entry)
    # The caller depends on this return value to download/save the data.
    return results
def get_charpter_url(self, response):
    """Build the novel item and, unless it is already stored, emit it plus
    a request for its chapter list."""
    soup = BeautifulSoup(response.text, 'lxml')
    table = soup.find('table')
    read_href = soup.find('p', class_='btnlinks').find('a',
                                                       class_='read')['href']
    item = DingdianItem()
    item['name'] = response.meta['name']
    item['novel_url'] = response.meta['url']
    item['category'] = table.find('a').get_text()
    # NBSPs (\xa0) pad the table cells; strip them from every text value.
    item['author'] = table.find_all('td')[1].get_text().replace('\xa0', '')
    # The numeric id sits just before the trailing slash of the read URL.
    item['novel_id'] = read_href[-6:-1].replace('/', '')
    item['serial_status'] = table.find('tr').find_all(
        'td')[-1].get_text().replace('\xa0', '')
    item['serial_length'] = table.find_all('tr')[1].find_all(
        'td')[1].get_text().replace('\xa0', '')
    # Guard clause: skip novels that are already in the database.
    if Sql.select_name(item['novel_id']) == 1:
        print("该小说已存在")
        return
    yield item
    yield Request(read_href, callback=self.get_chapter,
                  meta={'novel_id': item['novel_id']})
def get_chapterurl(self, response):
    """Parse the novel page, yield the item, then follow the reader link.

    BUG FIX: the category/author cleanup is meant to strip NBSP characters
    ('\\xa0'); the original wrote "\\a0" ('\\a' is the BEL control character
    plus a literal '0'), so the replace never matched anything.
    """
    Item = DingdianItem()
    Item["name"] = str(response.meta["name"]).replace("\xa0", "")
    Item["novelUrl"] = response.meta["url"]
    htmlsoup = BeautifulSoup(response.text, "lxml")
    categroy = htmlsoup.find("table").find("a").get_text()
    author = htmlsoup.find("tr").find_all("td")[1].get_text()
    bash_url = htmlsoup.find("p", class_="btnlinks").find(
        "a", class_="read")["href"]
    # The id is the second-to-last path segment of the reader URL.
    name_id = bash_url.split("/")[-2]
    Item["category"] = str(categroy).replace("\xa0", "")
    Item["author"] = str(author).replace("\xa0", "")
    Item["name_id"] = name_id
    yield Item
    yield Request(bash_url, callback=self.get_chapter, meta={"name": name_id})
def get_chapterurl(self, response):
    """Parse the novel page, yield the item, then request its chapter list.

    BUG FIX: name_id and category stripped the two-character sequence '//',
    but those values can only contain single '/' characters (e.g. the
    trailing slash caught by the URL slice), so the cleanup never fired.
    They now strip '/', matching the sibling implementations in this file.
    """
    item = DingdianItem()
    # Strip NBSPs (\xa0) leaking in from the HTML.
    item['name'] = str(response.meta['name']).replace('\xa0', '')
    item['novelurl'] = response.meta['url']
    soup = BeautifulSoup(response.text, 'lxml')
    category = soup.find('table').find('a').get_text()
    author = soup.find('table').find_all('td')[1].get_text()
    bash_url = soup.find('p', class_='btnlinks').find('a',
                                                      class_='read')['href']
    # Numeric id sliced from the tail of the reader link.
    name_id = str(bash_url)[-6:-1].replace('/', '')
    item['category'] = str(category).replace('/', '')
    item['author'] = str(author).replace('\xa0', '')
    item['name_id'] = name_id
    yield item
    yield Request(url=bash_url, callback=self.get_chapter,
                  meta={'name_id': name_id})
def get_chapterurl(self, response):
    """Return one item: name/url from meta, category/author/latest-chapter
    link scraped from the page."""
    soup = BeautifulSoup(response.text, 'lxml')
    table = soup.find('table')
    item = DingdianItem()
    item['name'] = response.meta['name']       # novel title
    item['novelurl'] = response.meta['url']    # novel detail page
    item['category'] = table.find('a').get_text()  # category label
    # The author cell is padded with NBSPs; strip them.
    item['author'] = table.find_all('td')[1].get_text().replace("\xa0", '')
    # Href of the "read" button = latest-chapter entry point.
    item['new'] = soup.find('p', class_='btnlinks').find(
        'a', class_='read')['href']
    return item
def get_chapterurl(self, response):
    """Fill the item from the info-table xpaths and count processed novels."""
    item = DingdianItem()
    item['name'] = response.meta['name']
    item['novelurl'] = response.meta['url']

    def cell(xp):
        # Every table field needs the same strip-after-extract treatment.
        return response.xpath(xp).extract_first().strip()

    item['author'] = cell('//tr[1]/td[2]/text()')
    item['serialstatus'] = cell('//tr[1]/td[3]/text()')
    item['serialnum'] = cell('//tr[2]/td[2]/text()')
    item['category'] = cell('//tr[1]/td[1]/a/text()')
    # The numeric id sits between '/' and '.' in the detail URL.
    item['name_id'] = re.findall(r'/(\d+)\.', response.meta['url'], re.S)[0]
    # Running total of novels processed by this spider instance.
    self.count += 1
    print(self.count)
    return item
def parse_detail(self, response):
    """Parse the detail table (author / status / word count / update time)
    and merge with the fields forwarded in meta.

    FIX: pass an explicit parser to BeautifulSoup — the original omitted it,
    which emits a "no parser specified" warning and makes the chosen parser
    (and thus the parse tree) depend on what happens to be installed.
    'lxml' matches every other parser call in this file.
    """
    soup = BeautifulSoup(response.text, 'lxml')
    table = soup.find('table')
    trs = table.find_all('tr')
    # \xa0 is the decoded &nbsp; entity — strip it from every cell.
    auth = trs[0].find_all('td')[1].get_text().replace('\xa0', '')
    status = trs[0].find_all('td')[2].get_text().replace('\xa0', '')
    words = trs[1].find_all('td')[1].get_text().replace('\xa0', '')
    last_time = trs[1].find_all('td')[2].get_text().replace('\xa0', '')
    item = DingdianItem()
    item['book_name'] = response.meta['book_name']
    item['new_chapter'] = response.meta['new_chapter']
    item['auth'] = auth
    item['status'] = status
    item['words'] = words
    item['last_time'] = last_time
    return item
def get_chapterurl(self, response):
    """Create the item for one novel, yield it, then chase its chapter list.

    The NBSP replacement happens up front because the '\\xa0' character
    breaks a later gbk encode.
    """
    item = DingdianItem()
    item['name'] = str(response.meta['name']).replace(u'\xa0', u'')
    item['novelurl'] = response.meta['url']
    soup = BeautifulSoup(response.text, 'lxml')
    info_table = soup.find('table')
    category = info_table.find('a').get_text()
    author = info_table.find_all('td')[1].get_text()
    bash_url = soup.find('p', class_='btnlinks').find('a',
                                                      class_='read')['href']
    # Numeric id sliced from the tail of the reader link.
    name_id = str(bash_url)[-6:-1].replace(u'/', u'')
    item['category'] = str(category).replace(u'/', u'')
    item['author'] = str(author).replace(u'/', u'')
    item['name_id'] = name_id
    yield item
    yield Request(bash_url, callback=self.get_chapter,
                  meta={'name_id': name_id})
def get_chapterurl(self, response):
    """Yield one item per novel page; the id is the second number in the URL."""
    soup = BeautifulSoup(response.text, 'lxml')
    item = DingdianItem()
    item['name'] = response.meta['name'].replace('\xa0', '')
    # novelurl is stored as utf-8 bytes (kept as in the original pipeline).
    item['novelurl'] = response.meta['url'].encode('utf-8')
    category = soup.find('table').find('a').get_text()
    author = soup.find('table').find_all('td')[1].get_text()
    # Parsed but currently unused — retained for parity with the
    # commented-out id-extraction path in the original.
    base_url = soup.find('p', class_='btnlinks').find('a',
                                                      class_='read')['href']
    digits = re.compile(r'\d+')
    # Second run of digits in the novel URL is the book id.
    item['name_id'] = digits.findall(str(response.meta['url']))[1]
    item['category'] = category.replace('/', '')
    item['author'] = str(author).replace('\xa0', '')
    yield item
def get_all(self, response):
    '''
    Parse the page and return the populated item.
    :param response: downloaded page response
    :return: item with name/author/novelurl/status/number/category/name_id
    '''
    page = BeautifulSoup(response.text, 'lxml')
    cells = page.find('table').find_all('td')
    item = DingdianItem()
    # .split()[0] trims surrounding whitespace/NBSP noise from each value.
    item['name'] = page.find('h1').get_text().split()[0]
    item['novelurl'] = page.find('a', class_='read')['href']
    item['author'] = cells[1].get_text().split()[0]
    item['status'] = cells[2].get_text().split()[0]
    # The word-count cell ends with a unit character, dropped by [:-1].
    item['number'] = cells[4].get_text().split()[0][:-1]
    item['category'] = cells[0].get_text().split()[0]
    # The id appears in the raw HTML as "down/<digits>".
    item['name_id'] = re.findall('down/(\d+)', response.text)[0]
    return item
def get_chapterurl(self, response):
    """Yield the novel item, then walk every <dd> chapter entry and request
    only the chapters not already recorded in the database."""
    item = DingdianItem()
    # Strip NBSPs (\xa0) leaking in from the HTML.
    item['name'] = str(response.meta['name']).replace('\xa0', '')
    item['novelurl'] = response.meta['url']
    item['category'] = response.meta['cat']
    # Author text is the first <p> inside <div id="info">; the value after
    # the colon is kept.
    author = BeautifulSoup(response.text, 'lxml').find(
        id='info').find('p')
    author = str(author.get_text()).split(':')[1]
    item['author'] = author
    # Numeric id sliced off the page URL (drops a trailing slash).
    name_id = str(response.url)[-6:-1].replace('/', '')
    item['name_id'] = name_id
    yield item
    dds = BeautifulSoup(response.text, 'lxml').find_all('dd')
    num = 0  # running chapter ordinal, forwarded in meta
    for novel in dds:
        num = num + 1
        # Assumes each <a> href is a relative fragment appended to this
        # page's URL — TODO confirm against the site markup.
        url = response.url + novel.find('a')['href']
        chapter_title = novel.find('a').get_text()
        # Skip chapters already recorded; per this code path,
        # Sql.select_chapter returns a row whose first element is 1 when
        # the URL already exists.
        rets = Sql.select_chapter(url)
        if rets[0] == 1:
            print('the chapter is exsits!')
            pass
        else:
            yield Request(url, callback=self.get_chapter,
                          meta={'num': num,
                                'name_id': name_id,
                                'chaptername': chapter_title,
                                'chapterurl': url})