Exemplo n.º 1
0
    def parse_detail(self, response):
        """Load a ``JobboleArticleItem`` from an article detail page.

        CSS selectors and literal values are registered on an
        ``ArticleItemLoader``; the item itself is produced by ``load_item()``.

        :param response: detail-page response. ``response.meta`` is read for
            ``front_img_url`` and falls back to ``''`` when the key is absent.
        :return: generator that yields the loaded item once.
        """
        front_img_url = response.meta.get('front_img_url', '')

        # Populate the item through the item loader.
        # NOTE(review): values are registered raw here. Any clean-up (pulling
        # the digits out of the counter texts, parsing the date, filtering the
        # tag list) presumably lives in the loader / item field processors --
        # confirm against the item definition.
        item_loader = ArticleItemLoader(item=JobboleArticleItem(),
                                        response=response)
        item_loader.add_css('title', 'div.entry-header > h1::text')
        item_loader.add_css('create_date', '.entry-meta-hide-on-mobile::text')
        item_loader.add_css('praise_num', '.post-adds .vote-post-up h10::text')
        # Raw button text; the number still has to be extracted with a regex.
        item_loader.add_css('fav_num', '.post-adds .bookmark-btn::text')
        # Raw link text; the number still has to be extracted with a regex.
        item_loader.add_css('comment_num',
                            'a[href="#article-comment"] span::text')
        # Needs a processing function on the field.
        # NOTE(review): the field name is 'tag' (singular) -- verify that it
        # matches the field actually declared on JobboleArticleItem.
        item_loader.add_css(
            'tag', '.entry-meta .entry-meta-hide-on-mobile a::text')
        item_loader.add_css('content', '.entry')
        item_loader.add_value('front_img_url', [front_img_url])
        item_loader.add_value('url', response.url)
        item_loader.add_value('url_object_id', get_md5(response.url))

        article_item = item_loader.load_item()

        yield article_item
Exemplo n.º 2
0
    def parse_content(self, response):
        """Fill a ``JobboleArticleItem`` for one article page and yield it.

        The item is populated through the project's custom
        ``ArticleItemLoader``. According to the original author's note, that
        loader is used so loaded values end up as ``str`` instead of ``list``;
        that conversion is not visible in this method.

        :param response: article page response. ``response.meta`` is read for
            ``front_image_url`` (the article cover image), defaulting to ``""``.
        :return: generator that yields the loaded item once.
        """
        # Article cover image passed along through the request meta.
        cover_url = response.meta.get("front_image_url", "")
        loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)

        # (loader method, field name, selector or literal value).
        # The entries are applied strictly in the order listed.
        steps = (
            (loader.add_css, "title", ".entry-header h1::text"),
            (loader.add_css, "create_date", "p.entry-meta-hide-on-mobile::text"),
            (loader.add_value, "url", response.url),
            (loader.add_value, "url_object_id", get_md5(response.url)),
            (loader.add_value, "front_image_url", [cover_url]),
            (loader.add_css, "praise_nums", ".vote-post-up h10::text"),
            (loader.add_css, "comment_nums", "a[href='#article-comment'] span::text"),
            (loader.add_css, "fav_nums", ".bookmark-btn::text"),
            (loader.add_css, "tags", "p.entry-meta-hide-on-mobile a::text"),
            (loader.add_css, "content", "div.entry"),
        )
        for register, field_name, source in steps:
            register(field_name, source)

        yield loader.load_item()
Exemplo n.º 3
0
    def parse_article(self, response):
        """Extract the fields of one article page and yield them as an item.

        Values are read with CSS selectors and stored on a fresh
        ``JobboleArticleItem``. A selector that matches nothing contributes an
        empty string (or ``0`` for the two parsed counters) instead of raising
        ``IndexError``.

        :param response: article page response. ``response.meta`` is read for
            ``front_img_url`` and falls back to ``None`` when the key is absent.
        :return: generator that yields the populated item once.
        """

        def first_text(selector):
            # extract_first("") rather than extract()[0]: a missing node gives
            # "" instead of an IndexError that would abort the whole callback.
            return response.css(selector).extract_first("")

        def leading_number(text):
            # First run of digits in *text* as an int, 0 when there is none.
            # Raw string: "\d" in a plain literal is an invalid escape sequence.
            match_re = re.match(r".*?(\d+).*", text)
            return int(match_re.group(1)) if match_re else 0

        job_article_instance = JobboleArticleItem()

        front_img_url = response.meta.get("front_img_url", None)
        title = first_text(".entry-header h1::text")
        create_date = first_text("p.entry-meta-hide-on-mobile::text").strip()
        # Stored as the raw text, exactly as extracted (not converted to int).
        like_num = first_text(".vote-post-up h10::text")
        record_num = leading_number(first_text(".bookmark-btn::text"))
        comment_num = leading_number(
            first_text("a[href='#article-comment'] span::text"))
        content = first_text("div.entry")

        # NOTE(review): the tag texts are concatenated with no separator, so
        # individual tags cannot be told apart afterwards -- confirm whether a
        # delimiter such as "," was intended before changing the stored format.
        tags = "".join(
            response.css("p.entry-meta-hide-on-mobile a::text").extract())

        job_article_instance["title"] = title
        job_article_instance["create_date"] = create_date
        job_article_instance["url"] = response.url
        job_article_instance["front_img_url"] = front_img_url
        job_article_instance["like_num"] = like_num
        job_article_instance["record_num"] = record_num
        job_article_instance["comment_num"] = comment_num
        job_article_instance["tags"] = tags
        job_article_instance["content"] = content

        yield job_article_instance
    def parse_detail(self, response):
        """Load a ``JobboleArticleItem`` for an article page and yield it.

        All fields are registered on the custom ``ArticleItemLoader`` (used in
        place of the stock ``ItemLoader``) and the item is built by
        ``load_item()``.

        :param response: article page response. ``response.meta`` is read for
            ``front_image_url``; ``''`` is used when that key is missing.
        :return: generator that yields the loaded item once.
        """
        # Value stored under 'front_image_url' in the request meta, or ''
        # when the key is not present.
        cover_url = response.meta.get('front_image_url', '')
        page_url = response.url

        # Custom loader instead of the default ItemLoader.
        loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)
        loader.add_css("title", ".entry-header h1::text")
        loader.add_value("url", page_url)
        loader.add_value("url_object_id", get_md5(page_url))
        loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        loader.add_value("front_image_url", [cover_url])
        loader.add_css("praise_nums", ".vote-post-up h10::text")
        loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
        loader.add_css("fav_nums", ".bookmark-btn::text")
        loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        loader.add_css("content", "div.entry")

        # Author's note: with the default loader behaviour there are two
        # problems -- (1) every value is a list, and (2) the extracted values
        # still need post-processing (regex extraction and so on). Both are
        # meant to be handled in items.py: TakeFirst for (1) and MapCompose
        # on the Field() definitions for (2).
        # Per the author's note, the yielded item is then passed on to
        # pipelines.py.
        yield loader.load_item()
Exemplo n.º 5
0
    def parse_detail(self, response):
        """
        Parse a downloaded detail page and extract its data into an item.

        The item is filled through ``JobboleArticleItemLoader``: literal values
        are registered with ``add_value`` and selector rules with ``add_css``.

        :param response: detail-page response. ``response.meta`` is read for
            ``front_image_url``, defaulting to ``''``.
        :return: generator that yields the loaded item once.
        """
        # Create the item loader bound to a fresh item and this response.
        loader = JobboleArticleItemLoader(
            item=JobboleArticleItem(), response=response)

        # Values that are added directly (add_value).
        literal_values = (
            ('url', response.url),
            ('url_object_id', get_md5(response.url)),
            ('front_image_url', response.meta.get('front_image_url', '')),
        )
        for field_name, value in literal_values:
            loader.add_value(field_name, value)

        # Selector rules (add_css); add_xpath is the XPath counterpart.
        css_rules = (
            ('title', '.entry-header h1::text'),
            ('create_data', '.entry-meta-hide-on-mobile::text'),
            ('tags', '.entry-meta-hide-on-mobile a::text'),
            ('thumbs_up', '.vote-post-up h10::text'),
            ('collected', '.bookmark-btn::text'),
            ('comments', '.post-adds a span::text'),
            ('content', '.entry'),
        )
        for field_name, rule in css_rules:
            loader.add_css(field_name, rule)

        # Once all rules are registered, load_item() has to be called to
        # actually build the item.
        yield loader.load_item()
Exemplo n.º 6
0
    def parse_detail(self, response):
        """Extract the article fields with CSS selectors and yield the item.

        Values are read from the page, the three counters are converted to
        ``int``, and everything is stored on a fresh ``JobboleArticleItem``.
        A selector that matches nothing contributes ``''`` (or ``0`` for a
        counter) instead of raising ``IndexError``.

        :param response: article page response. ``response.meta`` is read for
            ``front_image_url``; ``''`` is used when that key is missing.
        :return: generator that yields the populated item once.
        """

        def first_text(css):
            # First match of *css* as a string, '' when nothing matches.
            return response.css(css).extract_first('')

        def first_int(text):
            # First run of digits in *text* as an int, 0 when there is none.
            match_re = re.match(r'.*?(\d+).*', text)
            return int(match_re.group(1)) if match_re else 0

        article_item = JobboleArticleItem()

        # Article cover image taken from the request meta ('' if absent).
        front_image_url = response.meta.get('front_image_url', '')

        title = first_text('.entry-header h1::text')

        create_date = first_text(
            'p.entry-meta-hide-on-mobile::text').strip().replace(
                '·', '').strip()

        # The text is a single string here, so convert all of it. Indexing it
        # with [0] would keep only the first character ("12" -> 1).
        praise_text = first_text('.vote-post-up h10::text')
        praise_nums = int(praise_text) if praise_text else 0

        fav_nums = first_int(first_text('.bookmark-btn::text'))
        comment_nums = first_int(
            first_text("a[href='#article-comment'] span::text"))

        content = first_text("div.entry")

        # Drop the entry that ends with '评论' (the comment-count link) so
        # only the remaining link texts are kept as tags.
        tag_list = [
            element
            for element in response.css(
                "p.entry-meta-hide-on-mobile a::text").extract()
            if not element.strip().endswith('评论')
        ]
        tags = ','.join(tag_list)

        # Fill the item.
        article_item['title'] = title
        article_item['url'] = response.url
        article_item['url_object_id'] = get_md5(response.url)  # MD5 of the URL
        article_item['create_date'] = create_date
        # Author's note: the images field needs to receive a list.
        article_item['front_image_url'] = [front_image_url]
        article_item['praise_nums'] = praise_nums
        article_item['fav_nums'] = fav_nums
        article_item['comment_nums'] = comment_nums
        article_item['tags'] = tags
        article_item['content'] = content

        # Per the author's note, the yielded item is passed to pipelines.py.
        yield article_item
Exemplo n.º 7
0
    def parse_detail(self, response):
        """Load a ``JobboleArticleItem`` from an article page using XPath.

        Literal values and XPath rules are registered on an
        ``ArticleItemLoader``; the item is produced by ``load_item()``.

        :param response: article page response. ``response.meta`` is read for
            ``front_img`` and falls back to ``""`` when the key is absent.
        :return: generator that yields the loaded item once.
        """
        # Image value handed over through the request meta.
        image = response.meta.get("front_img", "")

        # Load the item through the item loader.
        # NOTE(review): values are registered raw; number extraction, date
        # parsing and tag filtering presumably happen in the loader / item
        # field processors -- confirm against the item definition.
        item_loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)
        item_loader.add_value('url', response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_value("front_img_url", [image])
        item_loader.add_xpath("title", "//div[@class='entry-header']/h1/text()")
        item_loader.add_xpath("create_time", "//p[@class='entry-meta-hide-on-mobile']/text()")
        item_loader.add_xpath("praise_num", "//span[contains(@class,'vote-post-up')]/h10/text()")
        item_loader.add_xpath("fav_num", "//span[contains(@class,'bookmark-btn')]/text()")
        item_loader.add_xpath("comment_num", "//a[@href='#article-comment']/span/text()")
        # Selects every text node below the entry div, not the element markup.
        item_loader.add_xpath("content", "//div[@class='entry']//text()")
        item_loader.add_xpath("tags", "//p[@class='entry-meta-hide-on-mobile']/a/text()")

        article_item = item_loader.load_item()

        yield article_item