예제 #1
0
    def parse(self, response):
        """
        Parse one listing page of jokes.

        Extracts each post's author title and content, wraps them in a
        QiubaiproItem and yields it to the item pipeline.

        :param response: scrapy Response for the listing page
        :return: generator of QiubaiproItem
        """
        # Each post lives in its own <div> under the #content-left container.
        div_list = response.xpath('//div[@id="content-left"]/div')
        for div in div_list:
            # xpath() returns Selector objects; extract_first() pulls the
            # stored string and returns None instead of raising IndexError
            # when the node is missing (unlike extract()[0]).
            title = div.xpath('./div/a[2]/h2/text()').extract_first()
            content = div.xpath(
                './/div[@class="content"]/span/text()').extract_first()

            # Store the parsed fields ( author and content ) in an items
            # object and submit it to the pipeline.  (The original also
            # appended a throwaway dict to a never-used list and shadowed
            # the builtin `dict` — both removed.)
            item = QiubaiproItem()
            item['author'] = title
            item['content'] = content
            yield item
예제 #2
0
 def parse(self, response):
     """Parse a listing page: yield one QiubaiproItem per post.

     :param response: scrapy Response for the listing page
     """
     div_list = response.xpath('//div[@class="col1 old-style-col1"]/div')
     for div in div_list:
         # extract_first() returns None for a missing node instead of the
         # IndexError that [0].extract() raises on an empty selector list.
         author = div.xpath('./div[1]/a[2]/h2/text()').extract_first()
         # The post body may span several text nodes; join them into one string.
         content = ''.join(div.xpath('./a[1]/div/span//text()').extract())
         item = QiubaiproItem()
         item['author'] = author
         item['content'] = content
         yield item  # submit the item to the pipeline
예제 #3
0
    def parse(self, response):
        """Extract author/content pairs from every list item; yield one item each."""
        # One <li> per post under the content listing.
        for li in response.xpath('//*[@id="content"]/div/div[2]/div/ul/li'):
            # extract_first() unwraps the first matched Selector's data string.
            item = QiubaiproItem()
            item['author'] = li.xpath('./div/div/a/span/text()').extract_first()
            item['content'] = li.xpath('./div/a/text()').extract_first()
            yield item
예제 #4
0
    def parse(self, response):
        """Parse a listing page and push one QiubaiproItem per post to the pipeline.

        :param response: scrapy Response for the listing page
        """
        div_list = response.xpath('//div[@class="col1 old-style-col1"]/div')
        for div in div_list:
            # The author name is read from the avatar image's alt attribute.
            # extract_first() avoids the IndexError that extract()[0] raises
            # when the node is absent.  (Dead `all_data` list removed.)
            author = div.xpath('./div[1]/a[1]/img/@alt').extract_first()
            # The post body may be split across several text nodes; join them.
            content = ''.join(div.xpath('./a[1]/div/span//text()').extract())

            item = QiubaiproItem()
            item['author'] = author
            item['content'] = content
            # Submit the item to the pipeline.
            yield item
예제 #5
0
    def parse(self, response):
        """Parse a listing page and yield one QiubaiproItem per post.

        :param response: scrapy Response for the listing page
        """
        div_list = response.xpath('//*[@id="content"]/div/div[2]/div')
        for div in div_list:
            # xpath() returns a list of Selector objects; extract_first()
            # pulls the stored data string and, unlike [0].extract(),
            # returns None instead of raising IndexError on no match.
            author = div.xpath('./div[1]/a[2]/h2/text()').extract_first()
            # extract() on the selector list yields one string per matched
            # text node; join them into a single content string.
            content = ''.join(div.xpath('./a/div/span//text()').extract())

            item = QiubaiproItem()
            item['author'] = author
            item['content'] = content
            # Submit the item to the pipeline.
            yield item
예제 #6
0
 def parse(self, response):
     """Parse a listing page and yield one QiubaiproItem per post.

     :param response: scrapy Response for the listing page
     """
     div_list = response.xpath('//*[@id="content-left"]/div')
     for div in div_list:
         # Anonymous posts put the author in a <span> instead of an <a>;
         # the xpath union covers both layouts.  extract_first(default='')
         # prevents the AttributeError that .extract_first().strip() raises
         # when neither node exists.
         author = div.xpath(
             './div[1]/a[2]/h2/text() | ./div[1]/span/h2/text()'
         ).extract_first(default='').strip()
         # Post bodies can span several text nodes; join, then trim.
         content = ''.join(
             div.xpath('./a/div/span[1]//text()').extract()).strip()
         item = QiubaiproItem()
         item['author'] = author
         item['content'] = content
         yield item
예제 #7
0
파일: qiubai.py 프로젝트: slTrust/reptile
    def parse(self, response):
        """Yield a QiubaiproItem for every joke container on the page."""
        # Outer container of each joke (xpath parsing is built into scrapy).
        for div in response.xpath('//div[@id="content-left"]/div'):
            # Step 1: copy the parsed fields into an items object.
            item = QiubaiproItem()
            item['author'] = div.xpath('./div/a[2]/h2/text()').extract_first()
            item['content'] = div.xpath(
                './/div[@class="content"]/span/text()').extract_first()
            # Step 2: submit the item object to the pipeline.
            yield item
예제 #8
0
    def parse(self, response):
        """Parse author name + post content for every post and yield items.

        :param response: scrapy Response for the listing page
        """
        div_list = response.xpath('//div[@class="col1 old-style-col1"]/div')
        for div in div_list:
            # Anonymous posts carry the author in span[2] instead of a[2];
            # the xpath union handles both.  extract_first() returns the
            # single matched string (or None).  (Dead `all_data` removed.)
            author = div.xpath(
                './div[1]/a[2]/h2/text() | ./div[1]/span[2]/h2/text()'
            ).extract_first()
            # extract() returns one string per matched text node; join them.
            content = ''.join(div.xpath('./a[1]/div/span//text()').extract())

            item = QiubaiproItem()
            item['author'] = author
            item['content'] = content

            yield item  # submit the item to the pipeline
예제 #9
0
    def parse(self, response):
        """Parse author name + post content and yield one item per post.

        :param response: scrapy Response for the listing page
        """
        div_list = response.xpath('//div[@id="content-left"]/div')
        for div in div_list:
            # xpath() returns a list of Selector objects; extract_first()
            # unwraps the first match's data string (or returns None).
            # The union covers the named-user and anonymous layouts.
            # (Dead `all_data` list removed.)
            author = div.xpath(
                './div[1]/a[2]/h2/text() | ./div[1]/span/h2/text()'
            ).extract_first()
            # One string per matched text node; join into the full body.
            content = ''.join(div.xpath('./a[1]/div/span//text()').extract())

            item = QiubaiproItem()
            item['author'] = author
            item['content'] = content

            # Submit the item to the pipeline.
            yield item
예제 #10
0
    def parse(self, response):
        """Parse author name + joke content and yield one item per post.

        :param response: scrapy Response for the listing page
        """
        div_list = response.xpath('//*[@id="content"]/div/div[2]/div')

        for div in div_list:
            # extract_first() unwraps the matched Selector's data string,
            # returning None rather than raising when nothing matched.
            # (Dead `all_data` list removed.)
            author = div.xpath('./div[1]/a[2]/h2/text()').extract_first()
            # extract() gives one string per matched text node; join them.
            content = ''.join(div.xpath('./a[1]/div/span//text()').extract())

            item = QiubaiproItem()
            item['author'] = author
            item['content'] = content

            yield item  # submit the item to the pipeline
예제 #11
0
    def parse(self, response):
        """Parse every post's author and full text, yielding one item each."""
        for div in response.xpath('//div[@class="col1 old-style-col1"]/div'):
            # Two layouts exist for the author node; the union covers both.
            # extract_first() unwraps the data string of the first match.
            author = div.xpath(
                './div[1]/a[2]/h2/text() | ./div[1]/span/h2/text()'
            ).extract_first()
            # extract() gives one string per text node; collapse to one string.
            fragments = div.xpath('./a[1]/div/span//text()').extract()
            text = "".join(fragments)

            # Package the fields into an item and submit it to the pipeline.
            item = QiubaiproItem()
            item['author'] = author
            item['text'] = text
            yield item
예제 #12
0
    def parse(self, response):
        """Parse author names and joke contents; yield one item per joke."""
        for div in response.xpath('//*[@id="content"]/div/div[2]/div'):
            # extract_first() unwraps the single matched Selector's data.
            author = div.xpath('./div[1]/a[2]/h2/text()').extract_first()
            # extract() unwraps every matched text node into a list of
            # strings; join the pieces into one content string.
            pieces = div.xpath('./a[1]/div/span/text()').extract()
            joined = ''.join(pieces)

            # Wrap the parsed data in an item-typed object.
            item = QiubaiproItem()
            item['author'] = author
            item['content'] = joined

            # Submit the item object to the pipeline for persistence.
            yield item
예제 #13
0
    def parse(self, response):
        """Parse one listing page, yield its items, then request the next page.

        Reads ``self.url`` (page-url prefix) and ``self.page_num`` (next page
        number) and issues a manual follow-up request until page 11.

        :param response: scrapy Response for the current listing page
        """
        div_list = response.xpath('//div[@class="col1 old-style-col1"]/div')
        for div in div_list:
            # extract_first() returns None when the author node is missing.
            author = div.xpath('./div[1]/a[2]/h2/text()').extract_first()
            if author is None:  # `is None`, not `== None`
                author = "匿名用户"
            content = ''.join(
                div.xpath('./a[1]/div/span[1]//text()').extract())
            item = QiubaiproItem()
            # scrapy.Item fields use dict-style access.
            item['author'] = author
            item['content'] = content
            yield item

        # Pagination must run once per PAGE, not once per post: the original
        # had this block inside the for loop, firing a new request (and
        # bumping page_num) for every single post on the page.
        if self.page_num <= 11:
            new_url = self.url + str(self.page_num)
            self.page_num += 1
            # Manual request; the callback does the parsing for the next page.
            yield scrapy.Request(url=new_url, callback=self.parse)
예제 #14
0
    def parse(self, response):
        """Yield a cleaned QiubaiproItem for every post container (pipeline-based)."""
        post_divs = response.xpath(
            '//div[contains(@class,"col1 old-style-col1")]/div')
        for post in post_divs:
            # Author sits in a[2] for named users or span[2] for anonymous
            # ones; take the first matching Selector.
            author_sel = post.xpath(
                './div[1]/a[2]/h2/text() | ./div[1]/span[2]/h2/text()')[0]
            # //text() also collects text inside nested child tags.
            content_sel = post.xpath('./a[1]/div/span//text()')

            item = QiubaiproItem()
            # extract() unwraps the data held by the Selector(s).
            item['author'] = author_sel.extract().strip()
            cleaned = "".join(content_sel.extract()).replace(" ", "").strip()
            item['content'] = cleaned
            yield item
예제 #15
0
    def parse(self, response):
        """Parse a listing page and yield only posts not seen before.

        A SHA-256 fingerprint of author+content is SADD-ed into the Redis set
        ``hash_keys`` (via ``self.conn``); the item is yielded only when the
        fingerprint is new (sadd returned 1) — incremental crawling.

        :param response: scrapy Response for the listing page
        """
        div_list = response.xpath('//div[@id="content-left"]/div')
        for div in div_list:
            # Anonymous posts keep the author in a <span>; the union xpath
            # covers both layouts.  extract_first() may return None.
            author = div.xpath('./div/a[2]/h2/text() | ./div/span[2]/h2/text()'
                               ).extract_first()
            content = ''.join(div.xpath('./a/div/span//text()').extract())

            item = QiubaiproItem()
            item['author'] = author
            item['content'] = content

            # Data fingerprint: unique id for one scraped record.
            # `author or ''` guards against the TypeError that None + str
            # raised when no author node matched.
            data = (author or '') + content
            hash_key = hashlib.sha256(data.encode()).hexdigest()
            ex = self.conn.sadd('hash_keys', hash_key)
            if ex == 1:
                print('有新数据更新......')
                yield item
            else:
                print('无数据更新!')
예제 #16
0
    def parse(self, response):
        """Parse author name and joke content for each post; yield items."""
        # Walk down to the post container step by step.
        content = response.xpath('//div[@id="content"]')
        content_left = content.xpath('./div[1]/div[2]')

        for div in content_left.xpath('./div'):
            # xpath() returns a list whose elements are Selector objects.
            # Author lives in a[2] for named users, span for anonymous ones;
            # take the first match and unwrap its data string via extract().
            author = div.xpath(
                './div[1]/a[2]/h2/text() | ./div[1]/span/h2/text()'
            )[0].extract()
            # Unwrap every matched text node and merge into one string.
            body = ''.join(div.xpath('./a[1]/div/span//text()').extract())

            item = QiubaiproItem()
            item['author'] = author
            item['content'] = body

            yield item
예제 #17
0
    def parse(self, response):
        """Parse author name and joke content; yield one item per joke."""
        container = response.xpath(
            '//*[@class="content-block clearfix"]/div[2]/div')

        for div in container:
            # extract() returns a list of strings (one per matched text
            # node); joining also tolerates the zero-match case.
            author = ''.join(div.xpath('./div[1]/a[2]/h2/text()').extract())
            # //text() also picks up text nested inside child tags.
            fragments = div.xpath(
                './a[1]/div/span//text()|./div[1]/span/h2/text()').extract()
            body = ''.join(fragments)

            item = QiubaiproItem()
            item['author'] = author
            item['content'] = body

            # Submit the item to the pipeline.
            yield item