Example #1
    def parse(self, response):

        contentLeft = response.xpath(
            "//div[@id='content-left']/div"
        )  # the returned data is a SelectorList, which itself inherits from Selector
        # items = []
        for duanzidiv in contentLeft:
            # print(duanzidiv)
            author = duanzidiv.xpath(".//h2/text()").get().strip()
            content = duanzidiv.xpath(
                ".//div[@class='content']//text()").getall()
            content = "".join(content).strip()
            # duanzi = {'author':author, 'content':content}
            item = QsbkItem(author=author, content=content)  # return an item instead of a dict
            yield item
            # items.append(item)
        # return items
        next_url = response.xpath(
            "//ul[@class='pagination']/li[last()]/a/@href").get()
        if not next_url:
            return
        else:
            print('page: ' + next_url)
            print("=" * 50)
            # send another request; callback names the function (self.parse)
            # to run once the response comes back
            yield scrapy.Request(self.base_domain + next_url,
                                 callback=self.parse)
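All of these examples construct a QsbkItem, so its fields must be declared in items.py. A minimal sketch, assuming the item carries only the fields the spiders below actually set (field names taken from the examples; stats_vote appears only in Examples #3 and #12):

    import scrapy

    class QsbkItem(scrapy.Item):
        # declare the allowed fields; passing any other keyword
        # to QsbkItem(...) raises a KeyError
        author = scrapy.Field()
        content = scrapy.Field()
        stats_vote = scrapy.Field()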
Example #2
 def parse(self, response):
     # the result is a SelectorList
     duanzidivs = response.xpath("//div[@id='content-left']/div")
     items = []
     for duanzidiv in duanzidivs:
         # iterate over each Selector; get() returns a unicode string
         author = duanzidiv.xpath(".//h2/text()").get().strip()
         # getall() fetches every match and returns a list
         content = duanzidiv.xpath(
             ".//div[@class='content']//text()").getall()
         # "".join turns the list into a single string
         content = "".join(content).strip()
         # the item class constrains which fields can be passed
         item = QsbkItem(author=author, content=content)
         # option 1: yield each item, turning parse into a generator
         yield item
         # option 2: collect the items in a list and return the list
     #     items.append(item)
     # return items
     next_url = response.xpath(
         "//ul[@class='pagination']/li[last()]/a/@href").get()
     if not next_url:
         return
     else:
         # request the next page's url; the callback parses it with self.parse again
         yield scrapy.Request(self.base_domain + next_url,
                              callback=self.parse)
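Many examples reference spider attributes such as base_domain and start_urls without showing them. A hypothetical skeleton these parse methods could sit on (class name, start url, and attribute names are assumptions, not taken from any single example):

    import scrapy

    class QsbkSpider(scrapy.Spider):
        name = 'qsbk'
        allowed_domains = ['qiushibaike.com']
        start_urls = ['https://www.qiushibaike.com/text/page/1/']
        # prefixed onto the relative next-page hrefs extracted above
        base_domain = 'https://www.qiushibaike.com'

        def parse(self, response):
            ...  # any of the parse bodies shown in these examples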
Example #3
 def parse(self, response):
     content_left = response.xpath('//div[@id="content-left"]/div')
     for duanzidiv in content_left:
         # check whether a "read full text" button exists
         content_for_all = duanzidiv.xpath(
             './/span[@class="contentForAll"]')
         if content_for_all:
             content_url = duanzidiv.xpath(
                 './/a[@class="contentHerf"]/@href').get()
             # request the detail page to get the full joke
             yield scrapy.Request(self.base_domain + content_url,
                                  callback=self.parse_content)
             continue
         # author
         author = duanzidiv.xpath('.//h2/text()').get().strip()
         # funny-vote count
         stats_vote = duanzidiv.xpath(
             './/span[@class="stats-vote"]/i/text()').get()
         # content
         contents = duanzidiv.xpath(
             './/div[@class="content"]//span/text()').getall()
         content = "".join(contents).strip().replace('\"', ' ')
         item = QsbkItem(author=author,
                         stats_vote=stats_vote,
                         content=content)
         yield item
     next_url = response.xpath(
         '//ul[@class="pagination"]/li[last()]/a/@href').get()
     if not next_url:
         return
     else:
         # request the next page
         yield scrapy.Request(self.base_domain + next_url,
                              callback=self.parse)
Example #4
    def parse(self, response):
        """
        提取数据
        :param response: 下载完成数据;类型为HtmlResponse
        """
        # response.xpath()  提取出来的数据类型为SelectorList
        liList = response.xpath('//div[@class="j-r-list"]/ul/li')
        for li in liList:
            # Selector: get() converts a Selector into a unicode string
            author = li.xpath('.//div[@class="u-txt"]/a/text()').get()
            content = li.xpath(
                './/div[@class="j-r-list-c"]/div[@class="j-r-list-c-desc"]/a/text()'
            ).getall()
            content = "".join(content).strip()

            # by convention, pass only the fields declared on the item
            item = QsbkItem(author=author, content=content)
            yield item  # equivalent to items.append(item)
        '''
        Crawl the next page: grab the "next page" link and call back into the
        parse method; once there is no "next page" link, return.
        '''
        next_url = response.xpath(
            '//div[@class="j-page"]//a[@class="pagenxt"]/@href').get()
        if not next_url:
            return
        elif int(next_url) == 51:
            # page 51 has no data
            print("last page: %s" % next_url)
            return
        else:
            # yield the request back to parse(); a return would stop the parse method
            yield scrapy.Request(self.base_domains + next_url,
                                 callback=self.parse)
Example #5
    def parse(self, response):

        base_url = "https://www.qiushibaike.com"

        # SelectorList
        duanziDivs = response.xpath("//div[@id='content-left']/div")

        # iterate over the SelectorList to get individual Selector objects
        for duanziDiv in duanziDivs:
            # extract() pulls the text of every match into a list; equivalent to getall()
            # extract_first() pulls the first match; equivalent to get()
            author = duanziDiv.xpath(".//h2/text()").get().strip()
            content = duanziDiv.xpath(
                ".//div[@class='content']//text()").getall()
            content = "".join(content).strip()

            item = QsbkItem(author=author, content=content)

            yield item

        next_url = response.xpath(
            "//ul[@class='pagination']/li[last()]/a/@href").get()

        if not next_url:
            return
        else:
            next_url = base_url + next_url
            yield scrapy.Request(next_url, callback=self.parse)
Example #6
 def parse(self, response):
     duanzidivs = response.xpath('//div[@id="content-left"]/div')
     for duanzi in duanzidivs:
         author = duanzi.xpath(".//h2/text()").get().strip()
         content = duanzi.xpath(".//div[@class='content']//text()").getall()
         content = "".join(content).strip()
         item = QsbkItem(author=author, content=content)
         yield item
Example #7
 def parse(self, response):
     i = QsbkItem()
     i['content'] = response.xpath(
         '//div[@class="content"]/span/text()').extract()
     # i['link'] = response.xpath('//div[@id="qiushi_tag_120320318"]/@href').extract()
     # i['data'] = response.xpath('//i[@class="number"]').extract()
     print(i['content'])
     return i
Example #8
 def parse(self, response):
     container = response.xpath('//div[@id="footzoon"]')
     contents = container.xpath('.//div[@id="endtext"]/text()').getall()
     for content in contents:
         content = content.strip()
         item = QsbkItem(content=content)
         print(content)
         yield item
Example #9
 def parse_item(self, response):
     i = QsbkItem()
     selector = Selector(response)
     i['content'] = selector.xpath('//*[@id="single-next-link"]/div/text()').extract()
     # i['link'] = response.xpath('//div[@id="qiushi_tag_120320318"]/@href').extract()
     # i['data'] = response.xpath('//i[@class="number"]').extract()
     print(i['content'])
     return i
Example #10
 def parse(self, response):
     print('*' * 40)
     # node_list = response.xpath("//div[@id='big_pic']")
     node_list = response.xpath("//a[@href='/weimei/48095.html']")
     for node in node_list:
         item = QsbkItem()
         name = node.xpath("img/@src").extract()
         item['url'] = name[0]
         yield item
     print('*' * 40)
Example #11
 def parse(self, response):
     duanzidiv = response.xpath("//div[@id='content-left']/div")
     for dz in duanzidiv:
         author = dz.xpath(".//h2/text()").get().strip()
         content = dz.xpath(".//div[@class='content']//text()").getall()
         content = ''.join(content).strip()
         # print(author)
         # print(content)
         # duanzi2_ = {'author': author, 'content': content}
         item = QsbkItem(author=author, content=content)  # this shorthand constructor works too
         yield item  # hand the data off to the pipelines
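The yielded items are handed to whatever item pipelines are enabled via ITEM_PIPELINES in settings.py. A minimal sketch of such a pipeline, assuming it simply writes each item to a JSON-lines file (class and file names are illustrative):

    import json

    class QsbkPipeline:
        def open_spider(self, spider):
            self.fp = open('duanzi.json', 'w', encoding='utf-8')

        def process_item(self, item, spider):
            # dict(item) works because scrapy.Item implements the mapping protocol
            self.fp.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
            return item

        def close_spider(self, spider):
            self.fp.close()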
Example #12
 def parse_content(self, response):
     author = response.xpath(
         './/div[@class="detail-col0"]/a/img/@alt').get()
     stats_vote = response.xpath(
         './/span[@class="stats-vote"]/i/text()').get()
     content_page = response.xpath('.//div[@class="col1 new-style-col1"]')
     contents = content_page.xpath(
         './/div[@class="content"]/text()').getall()
     content = "".join(contents).strip().replace('\"', ' ')
     item = QsbkItem(author=author, stats_vote=stats_vote, content=content)
     yield item
Example #13
    def parse(self, response):
        duanzi_div = response.xpath('//div[@id="content-left"]/div')
        for duanzi in duanzi_div:
            author = duanzi.xpath('.//h2/text()').get().strip()
            content = duanzi.xpath('.//div[@class="content"]//text()').getall()
            content = ''.join(content).strip()
            item = QsbkItem(author=author, content=content)
            yield item

        next_url = response.xpath(
            '//ul[@class="pagination"]/li[last()]/a/@href').get()
        if next_url:
            yield scrapy.Request(self.base_url + next_url, self.parse)
Example #14
 def parse(self, response):
     duanzidiv = response.xpath("//div[@id = 'content-left']/div")
     for duanzi in duanzidiv:
         author = duanzi.xpath(".//h2/text()").get().strip()
         content = duanzi.xpath(".//div[@class= 'content']//text()").getall()
         content = ''.join(content)
         item = QsbkItem(author=author, content=content)
         yield item
     next_url = response.xpath("//ul[@class =[pagination']/li[last()]/a/@href").get()
     if not next_url:
         return
     else:
         yield scrapy.Request(self.base_domain+next_url, callback=self.parse)
Example #15
 def parse(self, response):
     duanzidivs = response.xpath('//div[@id="content-left"]/div')
     for duanzidiv in duanzidivs:
         author = duanzidiv.xpath(".//h2/text()").get().strip()
         content = duanzidiv.xpath('.//div[@class="content"]//text()'
                                   ).getall()  # extract = getall
         content = ''.join(content).strip()
         # option 1: yield a plain dict (works, but not ideal)
         # duanzi = {'author':author,'content':content}
         # yield duanzi
         # option 2: use items
         item = QsbkItem(author=author, content=content)
         yield item
Example #16
 def parse2(self,response):
     try:
         text = json.loads(response.text[20:-1])
         comment_lists = text['result']['cmntlist']
         # print(comment_lists)
         for comment in comment_lists:
             item = QsbkItem()
             item['mid'] = comment['mid']
             item['content'] = comment['content']
             item['nick'] = comment['nick']
             # print(item)
             yield item
     except (ValueError, KeyError):
         # malformed JSONP payload or a missing key; skip this response
         pass
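Example #16 strips the JSONP wrapper by slicing off a fixed number of characters (response.text[20:-1]), which silently breaks if the callback name changes length. A more robust sketch, assuming payloads of the form callback({...});:

    import json
    import re

    def unwrap_jsonp(text):
        # capture everything between the outermost parentheses
        match = re.match(r'\s*[\w$.]+\s*\((.*)\)\s*;?\s*$', text, re.S)
        if match is None:
            raise ValueError('not a JSONP payload')
        return json.loads(match.group(1))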
Example #17
 def parse(self, response):
     # SelectorList
     # parse the page
     content_left = response.xpath('//div[@id="content-left"]/div')
     # extract the data
     for dz_div in content_left:
         # Selector
         author = dz_div.xpath(".//h2/text()").get().strip()
         content_tmp = dz_div.xpath(
             ".//div[@class='content']//text()").getall()
         content = ''.join(content_tmp).strip()
         item = QsbkItem(author=author, content=content)
         # yield the item to the pipeline
         yield item
Example #18
 def parse(self, response):
     contents = response.xpath("//div[@class='content']")
     for content in contents:
         text = content.xpath("text()").get().strip()
         if text != "":
             item = QsbkItem(content=text)
             yield item
     
     print("page {} finished...".format(response.url))
     page_next = response.xpath("//div[@class='pagebar']/a[text()='下一页']/@href").get()
     if not page_next:
         return
     else:
         page_next_url = self.url_head + page_next
         yield scrapy.Request(page_next_url,callback=self.parse)
Example #19
    def parse(self, response):
        # SelectorList
        duanziDivs = response.xpath("//div[@class='col1 old-style-col1']/div")

        i = 0

        for div in duanziDivs:
            # div is a Selector; div.xpath() returns a SelectorList
            author = div.xpath(".//h2/text()").get().strip()
            # get() takes the first result as a str; get() <=> extract_first()
            # extract() pulls all results into a list of str
            # strip() removes leading and trailing whitespace

            href = div.xpath('./a/@href').get()  # the href attribute inside the a tag
            url = urljoin(self.start_urls[0], href)  # absolute url of the detail page

            content = div.xpath(".//div[@class='content']//text()").extract()
            # the content sits in span tags under this div; //text() extracts it directly

            content = ''.join(content).strip()

            # tplt = "{0:{2}<20}\t{1:^50}"
            # print(tplt.format(author,url,chr(12288)))
            # print(content)
            # print('=' * 100)

            # duanzi = {'author':author,'content':content}  # each dict is one stored item; the Item form is below ⬇
            item = QsbkItem(author=author, content=content)

            i += 1
            print('generator call:', i)

            # yield duanzi
            yield item

            # if you don't want a generator (i.e. the yield item above is
            # commented out), the following works instead:
            # items = []
            # items.append(item)
            # return items
            # returning all the items also lets the pipelines process them
        next_path = response.xpath("//ul[@class='pagination']/li[last()]/a/@href").get()  # the last li in the bottom pager is the "next page" link

        if not next_path:  # no next page
            return
        else:
            next_url = urljoin(self.start_urls[0], next_path)
            yield scrapy.Request(next_url, callback=self.parse)  # request the next page and parse it with the same parse method
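Example #19 builds absolute links with urljoin, presumably imported from urllib.parse. For reference, an href with an absolute path replaces the path of the base url:

    from urllib.parse import urljoin

    print(urljoin('https://www.qiushibaike.com/text/', '/text/page/2/'))
    # https://www.qiushibaike.com/text/page/2/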
Example #20
 def parse(self, response):
     duanzidivs = response.xpath('//div[@class="col1 old-style-col1"]/div')
     for duanzidiv in duanzidivs:
         author = duanzidiv.xpath('.//h2/text()').get().strip()
         content = duanzidiv.xpath(
             './/div[@class="content"]//text()').getall()
         content = ''.join(content).strip()
         item = QsbkItem(author=author, content=content)
         yield item
     # request the next page once all items on this page have been yielded
     nextUrl = response.xpath(
         '//ul[@class="pagination"]/li[last()]/a/@href').get()
     if not nextUrl:
         return
     else:
         yield scrapy.Request(self.baseDomain + nextUrl,
                              callback=self.parse)
Example #21
 def parse(self, response):
     # SelectorList
     duanzidivs = response.xpath("//div[@id = 'content-left']/div")
     for duanzidiv in duanzidivs:
         # Selector
         author = duanzidiv.xpath(".//h2/text()").get().strip()
         content = duanzidiv.xpath(".//span/text()").getall()
         content = "".join(content).strip()
         item = QsbkItem(author=author, content=content)
         # duanzi = {"author":author,"content":content}
         yield item
     next_url = response.xpath("//ul[@class= 'pagination']/li[last()]/a/@href").get()
     if not next_url:
         return
     else:
         yield scrapy.Request(self.base_domin + next_url, callback=self.parse)
Example #22
 def parse(self, response):
     duanzidivs = response.xpath('//div[@id="content-left"]/div')  # SelectorList类型
     for duanzidiv in duanzidivs:
         # duanzidiv是Selector类型
         author = duanzidiv.xpath(".//h2/text()").get().strip()
         content = duanzidiv.xpath(".//div[@class='content']//text()").getall()
         content = ''.join(content).strip()
         # duanzi = {'author': author, 'content': content}
         item = QsbkItem(author=author, content=content)
         yield item
     next_url = response.xpath("//ul[@class='pagination']/li[last()]/a/@href").get()
     if not next_url:
         return
     else:
         # yield scrapy.Request(self.base_domains+next_url, callback=self.parse)
         yield response.follow(next_url, callback=self.parse)
Example #23
 def parse(self, response):
     duanzidivs = response.xpath(".//div[@id='content-left']/div")
     for duanzidiv in duanzidivs:
         author = duanzidiv.xpath(".//h2//text()").get().strip()
         content = duanzidiv.xpath(
             ".//div[@class='content']//text()").getall()
         content = "".join(content).strip()
         item = QsbkItem(author=author, content=content)
         yield item
     next_url = response.xpath(
         '//*[@id="content-left"]/ul/li[last()]/a/@href').get()
     if not next_url:
         return
     else:
         yield scrapy.Request(self.base_domain + next_url,
                              callback=self.parse)
Example #24
 def parse(self, response):
     # items = []
     outerbox = response.xpath("//div[@id='content-left']/div")
     for box in outerbox:
         author = box.xpath(".//div[contains(@class,'author')]//h2/text()").extract_first().strip()
         content = box.xpath(".//div[@class='content']/span/text()").extract_first().strip()
         item = QsbkItem(author=author, content=content)
         # item["author"] = author
         # item["content"] = content
         yield item
         # items.append(item)
     next_url = response.xpath("//ul[@class='pagination']/li[last()]/a/@href").get()
     if not next_url:
         return
     else:
         yield scrapy.Request('https://www.qiushibaike.com' + next_url, callback=self.parse)
     # return items
Example #25
 def parse(self, response):
     # returns a SelectorList
     article_divs = response.xpath('//div[@id="content-left"]/div')
     for article_div in article_divs:
         # each article_div is a Selector
         author = article_div.xpath(
             './/div[@class="author clearfix"]//h2/text()').get()
         # grab all the text under the specified div
         content = article_div.xpath(
             './/div[@class="content"]//text()').getall()
         # join the text fragments and strip the whitespace
         content = "".join(content).strip()
         # receive the data via the fields defined in items.py
         item = QsbkItem()
         item['author'] = author
         item['content'] = content
         yield item
Example #26
    def parse(self, response):

        descs = response.xpath("//div[@class='j-r-list-c-desc']")
        for desc in descs:
            jokes = desc.xpath(".//text()").getall()
            joke = "\n".join(jokes).strip()
            link = desc.xpath(".//a/@href").get()
            links = str(self.base_domain + link)
            # link = self.base_domain+links
            item = QsbkItem(joke=joke, links=links)
            yield item
        next_url = response.xpath("//a[@class='pagenxt']/@href").get()
        if not next_url:
            return
        else:
            yield scrapy.Request(self.base_domain + "/text/" + next_url,
                                 callback=self.parse)
Example #27
    def parse(self, response):
        divs = response.xpath('//div[@id="content-left"]/div')
        for div in divs:
            author = div.xpath('.//h2/text()').get().strip()
            content = div.xpath(
                './/div[@class="content"]/span/text()').getall()
            content = "".join(content).strip()
            item = QsbkItem(author=author, content=content)
            yield item

        href = response.xpath(
            '//ul[@class="pagination"]/li[last()]/a/@href').get()
        if not href:
            return
        else:
            next_url = self.domain_url + href
            yield scrapy.Request(next_url, callback=self.parse)
Example #28
    def parse(self, response):
        duanziDivs = response.xpath("//div[@id='content-left']/div")
        for duanzidiv in duanziDivs:
            author = duanzidiv.xpath(".//h2/text()").get().strip()
            content = duanzidiv.xpath(".//div[@class='content']//text()").getall()
            content = "".join(content).strip()
            
            # duanzi = {"author":author,"content":content}
            # yield duanzi

            item = QsbkItem(author=author,content=content)
            yield item
        next_url = response.xpath("//ul[@class='pagination']/li[last()]/a/@href").get()
        if not next_url:
            return
        else:
            yield scrapy.Request(self.base_domain + next_url, self.parse)
Example #29
 def parse(self, response):
     duanzidivs = response.xpath(r"//div[@class='col1 old-style-col1']/div")
     for duanzidiv in duanzidivs:
         author = duanzidiv.xpath(r".//a[2]/h2/text()").get()
         author = "".join(author).strip()  #去掉空格
         content = duanzidiv.xpath(
             r".//div[@class='content']/span/text()").getall()
         content = "".join(content).strip()  #去掉空格
         item = QsbkItem(author=author, content=content)
         yield item
     next_url = response.xpath(
         r"//ul[@class='pagination']/li[last()]/a/@href").get()
     if not next_url:
         return
     else:
         yield scrapy.Request(self.base_domain + next_url,
                              callback=self.parse)
Example #30
    def parse(self, response):
        contentDivs = response.xpath("//div[@id='j_p_postlist']/div")
        for contentDiv in contentDivs:
            author = contentDiv.xpath('.//li[@class="d_name"]/a/text()').get()
            content = contentDiv.xpath('.//cc/div[2]/text()').get().strip()
            # print(author, content)
            # tieba = {'author': author, 'content' : content}
            item = QsbkItem(author=author, content=content)
            yield item

        next_url = response.xpath(
            '//ul[@class="l_posts_num"]/li/a[last()-1]/@href').get()

        if not next_url:
            return
        else:
            yield scrapy.Request(self.base_domain + next_url, self.parse)