Пример #1
0
 def parse_mark(self, response):
     """Collect the tag ("mark") link texts of a picture detail page.

     The tags live in a <p> under either .../div[6] or .../div[5]; the
     first container holding at least one <a> wins.  Returns the texts
     joined by commas, or None when neither container has any tag.

     The original duplicated the whole collection loop for each div
     index; the two copies only differed in that index, and the initial
     probe query was identical to the loop's first iteration, so both
     are folded into one loop here.
     """
     for div_index in (6, 5):
         marks = []
         index = 1
         while True:
             r = get_select_first_str(
                 response,
                 "/html/body/div[5]/div[1]/div[" + str(div_index) +
                 "]/p/a[position()=" + str(index) + "]/text()",
                 None)
             if r is None:
                 break
             marks.append(r)
             index += 1
         if marks:
             return ','.join(marks)
     return None
Пример #2
0
 def parse(self, response):
     """Parse a listing page: follow each entry's first link to its
     content page, then follow the '下一页' (next page) pager link."""
     entries = response.xpath(
         "//div[@class='mBox']//div[@class='bd']//ul[@class='clearfix wenList']//li"
     )
     if not entries:
         # Nothing matched -- dump the raw page for debugging.
         print('============================')
         print(response.text)
         return
     sel = entries[0]
     for pos in range(1, len(entries) + 1):
         print(str(pos))
         head = ("//div[@class='mBox']//div[@class='bd']"
                 "//ul[@class='clearfix wenList']//li[position()=" +
                 str(pos) + "]")
         href = get_select_first_str(sel, head + "//a[position()=1]//@href",
                                     None)
         if href is not None:
             yield response.follow("http:" + href, callback=self.content)
     next_url = get_select_first_str(
         response,
         "//div[@class='mBox']//div[@class='bd']//div[@class='pager']//a[@class='page' and text()='下一页']/@href",
         None)
     if next_url is not None:
         yield response.follow("http:" + next_url, callback=self.parse)
Пример #3
0
 def parse(self, response):
     """Parse a joke list page: follow every entry to its content page,
     then request the '下一页' (next page) link."""
     # Reuse the type/page_url_head carried in meta; derive them from
     # the URL on the very first request (meta values are None then).
     joke_type = response.request.meta['type']
     if joke_type is None:
         joke_type = get_joke_type(response.url)
     page_url_head = response.request.meta['page_url_head']
     if page_url_head is None:
         page_url_head = get_joke_page_url_head(response.url)
     if page_url_head is None:
         # Unknown URL shape -- dump the page and give up.
         print(response.text)
         return
     entries = response.xpath("//*[@id='main']/div/div[1]/div[2]/ul/li")
     if not entries:
         return
     sel = entries[0]
     for pos in range(1, len(entries) + 1):
         print(str(pos))
         head = ("//*[@id='main']/div/div[1]/div[2]/ul/li[position()=" +
                 str(pos) + "]")
         href = get_select_first_str(sel, head + "/div/h2/a/@href", None)
         title = get_select_first_str(sel, head + "/div/h2/a/text()", None)
         if href is not None:
             yield response.follow("http://www.jdjhj.com" + href,
                                   callback=self.content,
                                   meta={
                                       'type': joke_type,
                                       'title': title
                                   })
     next_url = get_select_first_str(
         response, "//*[@id='pager']/ul/li/a[text()='下一页']/@href", None)
     if next_url is not None:
         next_url = page_url_head + "/" + next_url
         print(next_url)
         yield scrapy.Request(next_url,
                              callback=self.parse,
                              meta={
                                  'type': joke_type,
                                  'page_url_head': page_url_head
                              })
Пример #4
0
 def parse(self, response):
     """Parse an emoticon-pack list page: follow every pack link, then
     the pager's '下一页' (next page) link."""
     # Prefer the type/page_url_head carried in meta; fall back to
     # deriving them from the URL (first request has None in meta).
     source_type = response.request.meta['type']
     if source_type is None:
         source_type = get_source_type(response.url)
     page_url_head = response.request.meta['page_url_head']
     if page_url_head is None:
         page_url_head = get_picture_page_url_head(response.url)
     packs = response.xpath("//*[@id='bqblist']/a")
     if not packs:
         return
     sel = packs[0]
     for pos in range(1, len(packs) + 1):
         print(str(pos))
         head = "//*[@id='bqblist']/a[position()=" + str(pos) + "]"
         href = get_select_first_str(sel, head + "/@href", None)
         title = get_select_first_str(sel, head + "/@title", None)
         if href is not None:
             content_url = "https://www.fabiaoqing.com" + href
             yield response.follow(content_url,
                                   callback=self.content,
                                   meta={
                                       'type': source_type,
                                       'group_url': content_url,
                                       'stage': 'content',
                                       'pictures': [],
                                       'has_error': 'false'
                                   })
     next_url = get_select_first_str(
         response,
         "//*[@id='bqblist']/div[@class='ui pagination menu']/a[contains(text(), '下一页')]/@href",
         None)
     if next_url is not None:
         yield response.follow(page_url_head + next_url,
                               callback=self.parse,
                               meta={
                                   'type': source_type,
                                   'page_url_head': page_url_head,
                                   'stage': 'page'
                               })
Пример #5
0
 def parse(self, response):
     """Parse a picture list page: follow each entry's first link to the
     content page, then follow the '下一页' (next page) pager link."""
     source_type = response.request.meta['type']
     if source_type is None:
         source_type = get_source_type(response.url)
     entries = response.xpath("/html/body/div[5]/div[1]/ul/li")
     if not entries:
         return
     sel = entries[0]
     for pos in range(1, len(entries) + 1):
         print(str(pos))
         head = "/html/body/div[5]/div[1]/ul/li[position()=" + str(pos) + "]"
         href = get_select_first_str(sel, head + "//a[position()=1]//@href", None)
         if href is not None:
             yield response.follow(href,
                                   callback=self.content,
                                   meta={
                                       'type': source_type,
                                       'group_url': href,
                                       'stage': 'content',
                                       'picture_urls': [],
                                       'has_error': 'false'
                                   })
     next_url = get_select_first_str(
         response, "/html/body/div[5]/div[1]/div/a[text()='下一页']/@href", None)
     if next_url is not None:
         yield response.follow(next_url,
                               callback=self.parse,
                               meta={'type': source_type, 'stage': 'page'})
Пример #6
0
 def parse(self, response):
     """Parse a joke list page whose rows alternate class line1/line2:
     follow each row's link, then the pager's '下一页' link."""
     joke_type = response.request.meta['type']
     if joke_type is None:
         joke_type = get_joke_type(response.url)
     rows = response.xpath(
         "//*[@id='main']/div/div[@class='line1' or @class='line2']")
     if not rows:
         return
     sel = rows[0]
     for pos in range(1, len(rows) + 1):
         print(str(pos))
         head = ("//*[@id='main']/div/div[(@class='line1' or @class='line2')"
                 " and position()=" + str(pos) + "]")
         href = get_select_first_str(sel, head + "/a/@href", None)
         title = get_select_first_str(sel, head + "/a/text()", None)
         if href is not None:
             yield response.follow(href,
                                   callback=self.content,
                                   meta={
                                       'type': joke_type,
                                       'title': title
                                   })
     next_url = get_select_first_str(
         response,
         "//*[@id='main']/div/div[@class='pgs cl']/div[@class='pg']/a[text()='下一页']/@href",
         None)
     if next_url is not None:
         print('###########################')
         print(next_url)
         yield scrapy.Request(next_url,
                              callback=self.parse,
                              meta={'type': joke_type})
Пример #7
0
 def content(self, response):
     """Build a JokeItem from a joke detail page.

     Title and type come from the request meta; the joke text is read
     from the page, with a fallback XPath when the first one misses.
     """
     item = JokeItem()
     title = response.request.meta['title']
     if title is not None:
         item['title'] = title.strip()
     else:
         item['title'] = None
     # BUG FIX: the original read `joke_type = title = meta['type']`,
     # which clobbered `title`, so the debug print at the bottom showed
     # the joke type instead of the title.
     joke_type = response.request.meta['type']
     item['type'] = joke_type
     text = get_select_first_str(
         response, "//*[@id='main']/div/div[1]/div[2]/div[1]/p/span/span",
         None)
     if text is None:
         # Fallback: some pages keep the text directly in the <p>.
         text = get_select_first_str(
             response, "//*[@id='main']/div/div[1]/div[2]/div[1]/p", None)
     item['text'] = text
     item['crawl_origin'] = '天天搞笑网'
     item['crawl_url'] = response.url
     print(concat_str('笑话标题:', title))
     print(concat_str('内容', text))
     return item
Пример #8
0
 def parse(self, response):
     """Build a PictureItem from a 发表情 (fabiaoqing) picture-group page.

     type / group_url / pictures / has_error are threaded through the
     request meta so a group crawled in stages accumulates state.
     """
     type = 0
     if response.request.meta['type'] is None:
         type = get_source_type(response.url)
     else:
         type = response.request.meta['type']
     group_url = None
     if response.request.meta['group_url'] is not None:
         group_url = response.request.meta['group_url']
     pictures = []
     if response.request.meta['pictures'] is not None:
         pictures = response.request.meta['pictures']
     has_error = 'false'
     if response.request.meta['has_error'] is not None:
         has_error = response.request.meta['has_error']
     title = get_select_first_str(response,
                                  "//*[@id='bqb']/div[1]/h1/text()", None)
     if title is not None:
         title = title.strip()
     images = response.xpath(
         "//*[@id='bqb']//div[1]//div[1]//div//div//div[@class='bqppdiv1']")
     if images is None or len(images) == 0:
         # No image nodes at all: flag the group so it can be retried.
         has_error = 'true'
     else:
         for i in range(len(images)):
             j = i + 1
             print(str(j))
             head = "//*[@id='bqb']//div[1]//div[1]//div//div//div[position()=" + str(
                 j) + "]"
             src = get_select_first_str(response,
                                        head + "//img//@data-original",
                                        None)
             description = get_select_first_str(response,
                                                head + "//img//@title",
                                                None)
             picture = {}
             picture['url'] = src
             picture['description'] = description
             pictures.append(picture)
     marks = response.xpath('//*[@id="bqb"]/div[1]/div[2]/a')
     mark = ''
     if marks is None or len(marks) == 0:
         mark = None
     else:
         for i in range(len(marks)):
             j = i + 1
             mark_str = get_select_first_str(
                 response, '//*[@id="bqb"]/div[1]/div[2]/a[position()=' +
                 str(j) + "]/@title", None)
             # BUG FIX: mark_str is None when the <a> has no @title; the
             # original called .replace() on it and crashed.
             if mark_str is not None:
                 mark = mark + mark_str.replace("表情包", "") + ','
         if mark != '':
             # Drop the trailing comma.
             mark = mark[:-1]
     item = PictureItem()
     item['type'] = type
     item['title'] = title
     item['mark'] = mark
     item['thumbs_up_times'] = None
     item['crawl_origin'] = '发表情'
     item['crawl_url'] = response.url
     item['group_url'] = group_url
     item['pictures'] = pictures
     item['has_error'] = has_error
     print(concat_str('图片类型:', type))
     print(concat_str('图片标题:', title))
     print(concat_str('group_url:', group_url))
     print('picture_urls:')
     print('has_error:', has_error)
     print(pictures)
     yield item
Пример #9
0
 def content(self, response):
     """Build a PictureItem from a 微茶 picture-group detail page.

     type / group_url / picture_urls / has_error are threaded through
     the request meta across crawl stages; tags come from parse_mark.
     """
     type = 0
     if response.request.meta['type'] is None:
         type = get_source_type(response.url)
     else:
         type = response.request.meta['type']
     group_url = None
     if response.request.meta['group_url'] is not None:
         group_url = response.request.meta['group_url']
     picture_urls = []
     if response.request.meta['picture_urls'] is not None:
         picture_urls = response.request.meta['picture_urls']
     has_error = 'false'
     if response.request.meta['has_error'] is not None:
         has_error = response.request.meta['has_error']
     title = get_select_first_str(response, "/html/body/div[5]/div[1]/div[1]/h1/text()", None)
     if title is not None:
         title = title.strip()
     mark = self.parse_mark(response)
     thumbs_up = get_select_first_str(response, "/html/body/div[5]/div[1]/div[4]/a[1]/i/text()", None)
     thumbs_up_times = None
     if thumbs_up is not None:
         # BUG FIX: use a raw string -- '[0-9]\d*' relies on an invalid
         # escape sequence (SyntaxWarning on Python >= 3.12).
         r = re.findall(r'[0-9]\d*', thumbs_up)
         if r:
             thumbs_up_times = int(r[0])
             if '万' in thumbs_up:
                 # '万' means "ten thousand".
                 thumbs_up_times = thumbs_up_times * 10000
     images = response.xpath("//*[@id='txtabbox']/div[2]/ul/li")
     if images is None or len(images) == 0:
         # Fallback layout: the image list sits directly under the body.
         images = response.xpath("/html/body/div[5]/div[1]/ul/li")
         if images is None or len(images) == 0:
             has_error = 'true'
         else:
             for i in range(len(images)):
                 j = i + 1
                 print(str(j))
                 head = "/html/body/div[5]/div[1]/ul/li[position()=" + str(j) + "]"
                 src = get_select_first_str(response, head + "/img/@data-original", None)
                 picture_urls.append(src)
     else:
         for i in range(len(images)):
             j = i + 1
             print(str(j))
             head = "//*[@id='txtabbox']/div[2]/ul/li[position()=" + str(j) + "]"
             src = get_select_first_str(response, head + "//a//img/@data-original", None)
             picture_urls.append(src)
     item = PictureItem()
     item['type'] = type
     item['title'] = title
     item['mark'] = mark
     item['thumbs_up_times'] = thumbs_up_times
     item['crawl_origin'] = '微茶'
     item['crawl_url'] = response.url
     item['group_url'] = group_url
     item['picture_urls'] = picture_urls
     item['has_error'] = has_error
     print(concat_str('图片类型:', type))
     print(concat_str('图片标题:', title))
     print(concat_str('图片标签:', mark))
     print(concat_str('点赞数量:', str(thumbs_up_times)))
     print(concat_str('group_url:', group_url))
     print('picture_urls:')
     print('has_error:', has_error)
     print(picture_urls)
     yield item
Пример #10
0
 def parse(self, response):
     """Parse a Toutiao feed page into one NewsItem per feed entry."""
     elements = response.xpath(
         "//div[@class='y-wrap']//div[@class='y-box container']//div[@class='y-left index-content']//div[@riot-tag='feedBox']//div[@class='feedBox']//div[@riot-tag='wcommonFeed']//div[@class='wcommonFeed']//ul//li[@ga_event='article_item_click']"
     )
     if len(elements) <= 0:
         # BUG FIX: every sibling parser guards against an empty result,
         # but this one indexed elements[0] unconditionally and raised
         # IndexError on an empty (e.g. JS-rendered) page.
         return
     e = elements[0]
     for i in range(len(elements)):
         item = NewsItem()
         j = i + 1
         print(str(j))
         item['type'] = get_news_type(response.url)
         head = "//div[@class='y-wrap']//div[@class='y-box container']//div[@class='y-left index-content']//div[@riot-tag='feedBox']//div[@class='feedBox']//div[@riot-tag='wcommonFeed']//div[@class='wcommonFeed']//ul//li[@ga_event='article_item_click' and position()=" + str(
             j) + "]"
         title = get_select_first_str(
             e, head +
             "//div[@class='item-inner y-box']//div[@class='rbox-inner']//div[@class='title-box']//a/text()",
             None)
         if title is not None:
             item['title'] = title.strip()
         else:
             item['title'] = None
         media_url = get_select_first_str(
             e, head +
             "//div[@class='item-inner y-box']//a[@class='lbtn media-avatar']/@href",
             None)
         if media_url is not None:
             media_url = 'https://www.toutiao.com' + media_url
             item['media_url'] = media_url
         else:
             item['media_url'] = None
         media_avatar_img = get_select_first_str(
             e, head +
             "//div[@class='item-inner y-box']//a[@class='lbtn media-avatar']//img/@src",
             None)
         if media_avatar_img is not None:
             media_avatar_img = 'https:' + media_avatar_img
             item['media_avatar_img'] = media_avatar_img
         else:
             item['media_avatar_img'] = None
         media_name = get_select_first_str(
             e, head +
             "//div[@class='item-inner y-box']//a[@class='lbtn source']/text()",
             None)
         if media_name is not None:
             # Strip non-breaking spaces and the separator dot.
             media_name = media_name.replace('\xa0', '')
             media_name = media_name.replace('⋅', '')
         item['media_name'] = media_name
         comment = get_select_first_str(
             e, head +
             "//div[@class='item-inner y-box']//a[@class='lbtn comment']/text()",
             None)
         if comment is None:
             item['comment_count'] = None
         else:
             # BUG FIX: raw string -- '\d' is an invalid escape in a
             # plain string literal (SyntaxWarning on Python >= 3.12).
             r = re.findall(r'[1-9]\d*', comment)
             if r:
                 comment_count = int(r[0])
                 if '万' in comment:
                     # '万' means "ten thousand".
                     comment_count = comment_count * 10000
                 item['comment_count'] = comment_count
             else:
                 item['comment_count'] = None
         article_img = get_select_first_str(
             e, head +
             "//div[@class='item-inner y-box']//a[@class='img-wrap']//img/@src",
             None)
         if article_img is not None:
             article_img = 'https:' + article_img
             item['article_img'] = article_img
         else:
             item['article_img'] = None
         article_url = get_select_first_str(
             e, head +
             "//div[@class='item-inner y-box']//a[@class='link title']/@href",
             None)
         if article_url is not None:
             article_url = 'https://www.toutiao.com' + article_url
             item['article_url'] = article_url
         else:
             item['article_url'] = None
         item['mark'] = None
         item['crawl_origin'] = '今日头条'
         item['crawl_url'] = response.url
         print(concat_str('文章标题:', title))
         print(concat_str('源媒体:', media_url))
         print(concat_str('源媒体头像:', media_avatar_img))
         print(concat_str('源媒体名称:', media_name))
         print(concat_str('评论数:', comment))
         print(concat_str('文章图片:', article_img))
         print(concat_str('文章url:', article_url))
         yield item
Пример #11
0
 def parse(self, response):
     """Build a PictureItem from a 美头网 picture page.

     Images are looked up directly under .../center first; if that
     misses, falls back to scanning //div containers under center.
     """
     type = 0
     if response.request.meta['type'] is None:
         type = get_source_type(response.url)
     else:
         type = response.request.meta['type']
     group_url = None
     if response.request.meta['group_url'] is not None:
         group_url = response.request.meta['group_url']
     picture_urls = []
     if response.request.meta['picture_urls'] is not None:
         picture_urls = response.request.meta['picture_urls']
     has_error = 'false'
     if response.request.meta['has_error'] is not None:
         has_error = response.request.meta['has_error']
     title = get_select_first_str(
         response,
         "/html/body/div[3]/div[3]/div[1]/div[1]/div[1]/h1/text()", None)
     if title is not None:
         title = title.strip()
     images = response.xpath("//*[@id='content']/ul/center/img")
     if images is None or len(images) == 0:
         # Fallback: images nested anywhere under center.
         images = response.xpath("//*[@id='content']/ul/center//img")
         if images is None or len(images) == 0:
             has_error = 'true'
         else:
             divs = response.xpath("//*[@id='content']/ul/center//div")
             for d in range(len(divs)):
                 # BUG FIX: XPath position() is 1-based; the original
                 # used str(d) (0-based), so position()=0 matched nothing
                 # and the last div was never visited.
                 dj = d + 1
                 images = response.xpath(
                     "//*[@id='content']/ul/center//div[position()=" +
                     str(dj) + "]//img")
                 for i in range(len(images)):
                     j = i + 1
                     print(str(j))
                     head = "//*[@id='content']/ul/center//div[position()=" + str(
                         dj) + "]//img[position()=" + str(j) + "]"
                     src = get_select_first_str(response, head + "/@src",
                                                None)
                     picture_urls.append(src)
     else:
         for i in range(len(images)):
             j = i + 1
             print(str(j))
             head = "//*[@id='content']/ul/center/img[position()=" + str(
                 j) + "]"
             src = get_select_first_str(response, head + "/@src", None)
             picture_urls.append(src)
     item = PictureItem()
     item['type'] = type
     item['title'] = title
     item['mark'] = None
     item['thumbs_up_times'] = None
     item['crawl_origin'] = '美头网'
     item['crawl_url'] = response.url
     item['group_url'] = group_url
     item['picture_urls'] = picture_urls
     item['has_error'] = has_error
     print(concat_str('图片类型:', type))
     print(concat_str('图片标题:', title))
     print(concat_str('group_url:', group_url))
     print('picture_urls:')
     print('has_error:', has_error)
     print(picture_urls)
     yield item
Пример #12
0
 def content(self, response):
     """Build a JokeItem from a 笑话巴士 (xiaohuabus) detail page.

     Extracts title, tag list ('mark'), publisher info, thumbs-up count
     and the joke body from the news_info block.
     """
     item = JokeItem()
     item['type'] = '1'
     head = "//div[@class='th']//div[@class='t2']//div[@class='main']//div[@class='news_info']"
     title = get_select_first_str(
         response, head +
         "//div[@class='main_info_top']//div[@id='main_info_top_right']//div[@class='head_title_2']//span//a/@title",
         None)
     if title is not None:
         item['title'] = title.strip()
     else:
         item['title'] = None
     # Collect tag anchors one position at a time until one is missing.
     mark = None
     index = 1
     while True:
         r = get_select_first_str(
             response, head +
             "//div[@class='main_info_top']//div[@id='main_info_top_right']//div[@class='publish_info']//span//a[position()="
             + str(index) + "]/text()", None)
         if r is None:
             break
         index = index + 1
         if mark is None:
             mark = r
         else:
             mark = mark + ',' + r
     item['mark'] = mark
     media_url = get_select_first_str(
         response, head +
         "//div[@class='main_info_top']//div[@class='main_info_top_left']//div[@id='head_photo']//a/@href",
         None)
     if media_url is not None:
         media_url = 'http://www.xiaohuabus.com' + media_url
         item['media_url'] = media_url
     else:
         item['media_url'] = None
     media_avatar_img = get_select_first_str(
         response, head +
         "//div[@class='main_info_top']//div[@class='main_info_top_left']//div[@id='head_photo']//a//img/@src",
         None)
     if media_avatar_img is not None:
         media_avatar_img = 'http://www.xiaohuabus.com' + media_avatar_img
         item['media_avatar_img'] = media_avatar_img
     else:
         item['media_avatar_img'] = None
     media_name = get_select_first_str(
         response, head +
         "//div[@class='main_info_top']//div[@id='main_info_top_right']//div[@id='head_title']//div[@class='user_info']//span[@id='yonghuming']//a/text()",
         None)
     if media_name is not None:
         # Strip non-breaking spaces and the separator dot.
         media_name = media_name.replace('\xa0', '')
         media_name = media_name.replace('⋅', '')
     item['media_name'] = media_name
     thumbs_up = get_select_first_str(
         response, head +
         "//div[@class='feix']//div[@class='feix_right']//a[position()=1]//span[position()=2]/text()",
         None)
     thumbs_up_times = None
     if thumbs_up is None:
         item['thumbs_up_times'] = None
     else:
         # BUG FIX: raw string -- '\d' is an invalid escape in a plain
         # string literal (SyntaxWarning on Python >= 3.12).
         r = re.findall(r'[1-9]\d*', thumbs_up)
         if r:
             thumbs_up_times = int(r[0])
             if '万' in thumbs_up:
                 # '万' means "ten thousand".
                 thumbs_up_times = thumbs_up_times * 10000
             item['thumbs_up_times'] = thumbs_up_times
         else:
             item['thumbs_up_times'] = None
     text = get_select_first_str(response,
                                 head + "//div[@class='main_info_bottom']",
                                 None)
     item['text'] = text
     item['crawl_origin'] = '笑话巴士'
     item['crawl_url'] = response.url
     print(concat_str('图片标题:', title))
     print(concat_str('源媒体:', media_url))
     print(concat_str('源媒体头像:', media_avatar_img))
     print(concat_str('源媒体名称:', media_name))
     print(concat_str('获赞数:', str(thumbs_up_times)))
     print(concat_str('标签:', mark))
     print(concat_str('内容', text))
     return item
Пример #13
0
 def parse(self, response):
     """Parse a 笑话巴士 (xiaohuabus) list page into TextItems and follow
     the '下一页' (next page) pager link."""
     elements = response.xpath(
         "//div[@class='th']//div[@class='main']//div[@class='main_info']")
     if len(elements) <= 0:
         return
     e = elements[0]
     for i in range(len(elements)):
         item = TextItem()
         j = i + 1
         print(str(j))
         head = "//div[@class='th']//div[@class='main']//div[@class='main_info' and position()=" + str(
             j) + "]"
         title = get_select_first_str(
             e, head +
             "//div[@class='main_info_top']//div[@id='main_info_top_right']//div[@class='head_title_2']//span//a//h1/text()",
             None)
         if title is not None:
             item['title'] = title.strip()
         else:
             item['title'] = None
         # Collect tag anchors one position at a time until one misses.
         mark = None
         index = 1
         while True:
             r = get_select_first_str(
                 e, head +
                 "//div[@class='main_info_top']//div[@id='main_info_top_right']//div[@id='head_title']//div[@class='publish_info']//span//a[position()="
                 + str(index) + "]/text()", None)
             if r is None:
                 break
             index = index + 1
             if mark is None:
                 mark = r
             else:
                 mark = mark + ',' + r
         item['mark'] = mark
         media_url = get_select_first_str(
             e, head +
             "//div[@class='main_info_top']//div[@class='main_info_top_left']//div[@id='head_photo']//a/@href",
             None)
         if media_url is not None:
             media_url = 'http://www.xiaohuabus.com' + media_url
             item['media_url'] = media_url
         else:
             item['media_url'] = None
         media_avatar_img = get_select_first_str(
             e, head +
             "//div[@class='main_info_top']//div[@class='main_info_top_left']//div[@id='head_photo']//a//img/@src",
             None)
         if media_avatar_img is not None:
             media_avatar_img = 'http://www.xiaohuabus.com' + media_avatar_img
             item['media_avatar_img'] = media_avatar_img
         else:
             item['media_avatar_img'] = None
         media_name = get_select_first_str(
             e, head +
             "//div[@class='main_info_top']//div[@id='main_info_top_right']//div[@id='head_title']//div[@class='user_info']//span[@id='yonghuming']//a/text()",
             None)
         if media_name is not None:
             # Strip non-breaking spaces and the separator dot.
             media_name = media_name.replace('\xa0', '')
             media_name = media_name.replace('⋅', '')
         item['media_name'] = media_name
         thumbs_up = get_select_first_str(
             e, head +
             "//div[@class='feix']//div[@class='feix_right']//a[position()=1]//span[position()=2]/text()",
             None)
         thumbs_up_times = None
         if thumbs_up is None:
             item['thumbs_up_times'] = None
         else:
             # BUG FIX: raw string -- '\d' is an invalid escape in a
             # plain string literal (SyntaxWarning on Python >= 3.12).
             r = re.findall(r'[1-9]\d*', thumbs_up)
             if r:
                 thumbs_up_times = int(r[0])
                 if '万' in thumbs_up:
                     # '万' means "ten thousand".
                     thumbs_up_times = thumbs_up_times * 10000
                 item['thumbs_up_times'] = thumbs_up_times
             else:
                 item['thumbs_up_times'] = None
         text = get_select_first_str(
             e, head + "//div[@class='main_info_bottom']", None)
         item['text'] = text
         item['crawl_origin'] = '笑话巴士'
         item['crawl_url'] = response.url
         print(concat_str('图片标题:', title))
         print(concat_str('源媒体:', media_url))
         print(concat_str('源媒体头像:', media_avatar_img))
         print(concat_str('源媒体名称:', media_name))
         print(concat_str('获赞数:', str(thumbs_up_times)))
         print(concat_str('标签:', mark))
         print(concat_str('文本内容:', text))
         yield item
     next_url = get_select_first_str(
         response,
         "//div[@class='th']//div[@class='main']//div[@class='pager']//a[@class='page' and text()='下一页']/@href",
         None)
     if next_url is not None:
         next_url = "http:" + next_url
         yield response.follow(next_url, callback=self.parse)