Exemplo n.º 1
0
    def parse_detail(self, response):
        """Load one article detail page into a ``ZakerItem``.

        The title, media, comment count and image URL are read from
        ``response.meta``; the article body, the original-source link and
        the tags are selected from the response itself.

        :param response: response of the article detail page.
        :return: generator yielding the single loaded item.
        """
        # Use the placeholder string when the page exposes no tag links.
        tags = response.css('.article_more a::text').extract() or '无'

        # NOTE(review): ``dont_filter`` is usually a ``Request`` argument;
        # here it is merely forwarded to the loader as an extra keyword.
        # Confirm whether anything actually relies on it.
        loader = ItemLoader(
            item=ZakerItem(),
            response=response,
            dont_filter=True,
        )

        loader.add_value('url_id', get_md5(response.url))
        loader.add_value('article_url', response.url)
        # These item fields share their name with the meta key carrying them.
        for field in ('title', 'media', 'comments_num', 'img_url'):
            loader.add_value(field, response.meta.get(field))
        loader.add_css('article_content', '.article_content #content')
        loader.add_css('original_url', '.article_detail a::attr(href)')
        loader.add_value('tags', tags)

        yield loader.load_item()
Exemplo n.º 2
0
    def parse_detail(self, response):
        """Extract the fields of one article page into a ``JobboleItem``.

        Missing nodes no longer abort the callback: counters default to 0,
        the content defaults to an empty string and an absent or unparsable
        date falls back to today's date.

        :param response: response of the article detail page; the cover
            image URL is read from ``response.meta["front_image_url"]``.
        :return: generator yielding the single populated item.
        """
        def first_int(text):
            """Return the first run of digits in *text* as an int, else 0."""
            # ``re.search`` takes the whole first number. The previous
            # greedy ``.*(\d+).*`` pattern only ever captured its last digit.
            found = re.search(r"\d+", text or "")
            return int(found.group()) if found else 0

        article_item = JobboleItem()
        # Cover image URL handed over by the request that scheduled this page.
        front_image_url = response.meta.get("front_image_url", "")
        title = response.xpath(
            '//div[@class="entry-header"]/h1/text()').extract_first()

        # The first whitespace-separated token of the meta line is the date.
        date_text = response.xpath(
            '//p[@class="entry-meta-hide-on-mobile"]/text()'
        ).extract_first(default="")
        try:
            create_date = datetime.datetime.strptime(
                date_text.split()[0], '%Y/%m/%d').date()
        except (IndexError, ValueError):
            # No date text at all, or not in Y/m/d form.
            create_date = datetime.datetime.now().date()

        # Drop the "N 评论" (comments) link that sits among the tag links.
        tag_list = response.xpath(
            '//p[@class="entry-meta-hide-on-mobile"]/a/text()').extract()
        tag = ",".join(
            element for element in tag_list
            if not element.strip().endswith("评论")
        )

        praise_text = response.xpath(
            '//span[contains(@class,"vote-post-up")]/h10/text()'
        ).extract_first()
        praise_nums = int(praise_text) if praise_text else 0

        fav_nums = first_int(response.xpath(
            '//span[contains(@class,"bookmark-btn")]/text()').extract_first())
        comment_nums = first_int(response.xpath(
            "//a[@href='#article-comment']/span/text()").extract_first())

        content = response.xpath(
            '//div[@class="entry"]').extract_first(default="")

        # The URL is stored as its md5 digest as well (fixed-length key).
        article_item["url_object_id"] = get_md5(response.url)
        article_item["title"] = title
        article_item["url"] = response.url
        article_item["create_date"] = create_date
        article_item["front_image_url"] = [front_image_url]
        article_item["praise_nums"] = praise_nums
        article_item["fav_nums"] = fav_nums
        article_item["comment_nums"] = comment_nums
        article_item["tag"] = tag
        article_item['content'] = content

        yield article_item
Exemplo n.º 3
0
    def parse_detail(self, response):
        """Populate a ``JobBoleArticleItem`` through an ``ItemLoader``.

        :param response: response of the article detail page; the cover
            image URL is read from ``response.meta["front_image_url"]``.
        :return: generator yielding the single loaded item.
        """
        loader = ItemLoader(item=JobBoleArticleItem(), response=response)
        cover_url = response.meta.get("front_image_url", "")

        # (loader method, item field, CSS selector or literal value),
        # listed in the order the fields are fed to the loader.
        # The "content" field (selector "div.entry") is intentionally not loaded.
        steps = (
            (loader.add_css, "title", ".entry-header h1::text"),
            (loader.add_value, "url", response.url),
            (loader.add_value, "url_object_id", get_md5(response.url)),
            (loader.add_css, "create_date", "p.entry-meta-hide-on-mobile::text"),
            (loader.add_value, "front_image_url", [cover_url]),
            (loader.add_css, "praise_nums", ".vote-post-up h10::text"),
            (loader.add_css, "comment_nums", "a[href='#article-comment'] span::text"),
            (loader.add_css, "fav_nums", ".bookmark-btn::text"),
            (loader.add_css, "tags", "p.entry-meta-hide-on-mobile a::text"),
        )
        for add, field, source in steps:
            add(field, source)

        yield loader.load_item()
Exemplo n.º 4
0
    def parse_detail(self, response):
        """
        Parse the content of the target article page.

        :param response: response of the article detail page; the cover
            image URL is read from ``response.meta['front_img_url']``.
        :return: generator yielding the loaded ``JobBoleArticleItem``.
        """
        # An earlier approach filled a JobBoleArticleItem by hand, field by
        # field; it was judged hard to extend. The item is now built by an
        # ItemLoader so that per-field clean-up can live in processors.
        loader = ArticleItemLoader(
            item=JobBoleArticleItem(),
            response=response,
        )

        page_url = response.url
        loader.add_value('url', page_url)
        loader.add_value('url_object_id', get_md5(page_url))

        loader.add_css('title', 'div.entry-header > h1::text')
        loader.add_css('create_date', 'p.entry-meta-hide-on-mobile::text')

        cover_url = response.meta.get('front_img_url', '')
        loader.add_value('front_image_url', [cover_url])

        loader.add_css('praise_nums', 'div.post-adds h10::text')
        loader.add_css('fav_nums', '.bookmark-btn::text')
        loader.add_css(
            'comment_nums', "a[href='#jobbole-comment'] > span::text")
        loader.add_css('content', 'div.entry')
        loader.add_css('tags', 'p.entry-meta-hide-on-mobile a::text')

        yield loader.load_item()
Exemplo n.º 5
0
    def parse_detail(self, response):
        """Load one article detail page into a ``JobboleItem``.

        Every field is fed to an ``ArticleItemLoader``; no per-field
        clean-up happens in this method. The hand-written extraction that
        used to sit here as a triple-quoted string was dead code that
        doubled as the docstring, so it has been removed.

        :param response: response of the article detail page; the cover
            image URL is read from ``response.meta["front_image_url"]``.
        :return: generator yielding the single loaded item.
        """
        front_image_url = response.meta.get("front_image_url", "")
        item_loader = ArticleItemLoader(item=JobboleItem(), response=response)

        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_value('url', response.url)
        item_loader.add_value('url_object_id', get_md5(response.url))
        item_loader.add_value('front_image_url', [front_image_url])
        item_loader.add_css('create_time', '.entry-meta-hide-on-mobile::text')
        item_loader.add_css('vote_nums', '.vote-post-up h10::text')
        item_loader.add_css('mark_nums', '.bookmark-btn::text')
        item_loader.add_css(
            'comment_nums', '.btn-bluet-bigger.href-style.hide-on-480::text')
        item_loader.add_css('tags', '.entry-meta-hide-on-mobile a::text')
        # The whole ".entry" element is selected, i.e. the article HTML.
        item_loader.add_css('content', '.entry')

        article_item = item_loader.load_item()
        yield article_item
Exemplo n.º 6
0
    def parse_detail(self, response):
        """Load one article detail page into a ``JobboleItem``.

        The item is built solely by ``ArticleItemLoader``. The item that was
        previously instantiated on the first line was never used (it was
        overwritten by ``load_item()``), so it is no longer created.

        :param response: response of the article detail page; the cover
            image URL is read from ``response.meta["front_image_url"]``.
        :return: generator yielding the single loaded item.
        """
        # Cover image of the article.
        front_image_url = response.meta.get("front_image_url", "")

        # Per the original note, ArticleItemLoader is a project-specific
        # loader that keeps only the first collected value of each field.
        item_loader = ArticleItemLoader(item=JobboleItem(), response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("comment_nums",
                            "a[href='#article-comment'] span::text")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        item_loader.add_css("content", "div.entry")

        article_item = item_loader.load_item()

        yield article_item