Пример #1
0
    def parse_detail(self, response):
        """Build a ``JobBoleArticleItem`` from an article detail page.

        Each field is registered on an ``ArticleItemLoader``: values read from
        the page through CSS selectors, the remaining ones as plain values.
        The loaded item is then yielded so that it is passed on to the
        pipelines.

        :param response: detail-page response; ``response.meta`` may carry
            ``front_image_url`` (the article cover image), '' when absent.
        """
        cover_url = response.meta.get('front_image_url', '')  # article cover image

        loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
        # ``::text`` is the pseudo-element that selects an element's text node.
        loader.add_css('title', '.entry-header h1::text')
        loader.add_value('url', response.url)
        loader.add_value('url_object_id', get_md5(response.url))
        loader.add_css('create_date', 'p.entry-meta-hide-on-mobile::text')
        # The cover image URL is handed over wrapped in a list.
        loader.add_value('front_image_url', [cover_url])
        loader.add_css('praise_nums', '.vote-post-up h10::text')
        loader.add_css('comment_nums', 'a[href="#article-comment"] span::text')
        loader.add_css('fav_nums', '.bookmark-btn::text')
        loader.add_css('tags', '.entry-meta-hide-on-mobile a::text')
        loader.add_css('content', 'div.entry')

        # Hand the populated item over to the pipelines.
        yield loader.load_item()
Пример #2
0
    def parse_detail(self, response):
        """Load a ``JobBoleArticleItem`` from an article detail page and yield it.

        All fields are populated through an ``ItemLoader``: page values are
        located with CSS selectors, while the URL, its MD5 id and the cover
        image URL are added as plain values.

        :param response: detail-page response; ``response.meta`` may carry
            ``front_image_url`` (the article cover image), "" when absent.
        """
        # Cover image of the article, passed along in the request meta.
        front_image_url = response.meta.get("front_image_url", "")

        item_loader = ItemLoader(item=JobBoleArticleItem(), response=response)
        # ``::text`` is the CSS pseudo-element selecting the text node.
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("comment_nums",
                            "a[href='#article-comment'] span::text")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        item_loader.add_css("content", "div.entry")

        article_item = item_loader.load_item()

        # Yielding the item transfers it to the pipelines.
        yield article_item
Пример #3
0
    def parse_datail(self, response):
        """Load a ``JobBoleArticleItem`` from an article detail page and yield it.

        Fields are registered on an ``ArticleItemLoader``: page values through
        CSS selectors, the URL, its MD5 id and the cover image URL as plain
        values.

        :param response: detail-page response; ``response.meta`` may carry
            ``front_image_url`` (the article cover image), '' when absent.
        """
        # Cover image of the article, passed along in the request meta.
        front_image_url = response.meta.get('front_image_url', '')

        item_loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                        response=response)
        item_loader.add_css('title', '.entry-header h1::text')
        item_loader.add_value('url', response.url)
        item_loader.add_value('url_object_id', get_md5(response.url))
        item_loader.add_css('create_date', 'p.entry-meta-hide-on-mobile::text')
        item_loader.add_value('front_image_url', [front_image_url])
        # NOTE(review): 'front_image_path' is deliberately not loaded here; a
        # pipeline method object is not a valid field value. Presumably
        # ArticleImagePipeline fills the path in after the download - confirm.
        item_loader.add_css('praise_nums', '.vote-post-up h10::text')
        item_loader.add_css('comment_nums',
                            'a[href="#article-comment"] span::text')
        item_loader.add_css('fav_nums', '.bookmark-btn::text')
        item_loader.add_css('tags', '.entry-meta a::text')
        item_loader.add_css('content', 'div.entry')

        article_item = item_loader.load_item()

        yield article_item
Пример #4
0
    def parse_detail(self, response):
        """Load the fields present in the page, then request the AJAX news info.

        The first run of digits in ``response.url`` is taken as the post id.
        The fields found in the page HTML are registered on an
        ``ArticleItemLoader``; a follow-up request for
        ``/NewsAjax/GetAjaxNewsInfo?contentId=<post id>`` is then yielded with
        ``self.parse_nums`` as callback. The loader itself (not a loaded item)
        travels in the request meta under ``"article_item"``, next to the page
        URL under ``"url"``.

        Nothing is yielded when the URL contains no digits.

        :param response: detail-page response; ``response.meta`` may carry
            ``front_image_url`` (the article cover image).
        """
        # Raw string: "\d" in a plain literal is an invalid escape sequence.
        match_re = re.match(r".*?(\d+)", response.url)
        if not match_re:
            return
        post_id = match_re.group(1)

        item_loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                        response=response)
        item_loader.add_css("title", "#news_title a::text")
        item_loader.add_css("create_date", "#news_info .time::text")
        item_loader.add_css("content", "#news_content")
        item_loader.add_css("tags", ".news_tags a::text")
        item_loader.add_value("url", response.url)

        # Only register the cover image when the request meta provides one.
        front_image_url = response.meta.get("front_image_url", [])
        if front_image_url:
            item_loader.add_value("front_image_url", front_image_url)

        # Built once and shared by the debug output and the request.
        ajax_url = parse.urljoin(
            response.url,
            "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id))
        print(ajax_url)
        yield Request(url=ajax_url,
                      meta={
                          "article_item": item_loader,
                          "url": response.url
                      },
                      callback=self.parse_nums)
Пример #5
0
    def parse_detail(self, response):
        """Extract the fields of an article and yield the loaded item.

        The page values are registered on an ``ArticleItemLoader`` through CSS
        selectors; the cover image URL, the page URL and its MD5 id are added
        as plain values afterwards.

        :param response: detail-page response; ``response.meta`` may carry
            ``front_image_url`` (the article cover image), "" when absent.
        """
        cover_url = response.meta.get("front_image_url", "")  # cover image
        loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)

        # (field name, CSS selector) pairs, registered in this order.
        css_fields = (
            ("title", ".entry-header h1::text"),
            ("create_date", "p.entry-meta-hide-on-mobile::text"),
            ("praise_nums", ".vote-post-up h10::text"),
            ("comment_nums", "a[href='#article-comment'] span::text"),
            ("fav_nums", ".bookmark-btn::text"),
            ("tags", "p.entry-meta-hide-on-mobile a::text"),
            ("content", "div.entry"),
        )
        for field_name, selector in css_fields:
            loader.add_css(field_name, selector)

        # Values that do not come from the page markup.
        loader.add_value("front_image_url", [cover_url])
        loader.add_value("url", response.url)
        loader.add_value("url_object_id", get_md5(response.url))

        yield loader.load_item()
Пример #6
0
    def parse_detail(self,response):
        '''
        Process the content of an article page.

        Every field is registered on an ``ArticleItemLoader`` (page values
        through XPath expressions, the rest as plain values) and the loaded
        item is yielded.

        :param response: detail-page response; ``response.meta`` may carry
            ``image_url`` (the article cover image), '' when absent.
        :return: generator yielding the loaded ``JobBoleArticleItem``
        '''
        # Cover image of the article, passed along in the request meta.
        image_url = response.meta.get('image_url','')

        # Load the item through the item loader.
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(),response=response)
        item_loader.add_xpath('title','//div[@class="entry-header"]/h1/text()')
        item_loader.add_value('url',response.url)
        item_loader.add_value('url_object_id',get_md5(response.url))
        item_loader.add_xpath('create_time','//p[@class="entry-meta-hide-on-mobile"]/text()')
        item_loader.add_value('image_url',[image_url])
        # NOTE(review): image_path is seeded with the image URL itself;
        # presumably a later pipeline replaces it with the stored path - confirm.
        item_loader.add_value('image_path',image_url)
        item_loader.add_xpath('praise_nums','//span[contains(@class,"vote-post-up")]/h10/text()')
        item_loader.add_xpath('comment_nums','//span[contains(@class,"hide-on-480")]/text()')
        item_loader.add_xpath('shoucang_nums','//span[contains(@class,"bookmark-btn")]/text()')
        item_loader.add_xpath('tag_lists','//p[contains(@class,"entry-meta-hide-on-mobile")]/a/text()')
        item_loader.add_xpath('content','//div[@class="entry"]')

        article_item = item_loader.load_item()

        yield article_item
Пример #7
0
    def parse_detail(self, response):
        """Extract the article fields from a detail page and yield the item.

        Values are read with CSS selectors. A selector that matches nothing
        yields an empty string instead of raising ``IndexError``; the three
        counters fall back to 0 and the creation date falls back to today.

        :param response: detail-page response; ``response.meta`` may carry
            ``front_image_url`` (the article cover image), "" when absent.
        """

        def first_text(css_query):
            # First match of the selector, "" when nothing matches.
            return response.css(css_query).extract_first("")

        def first_int(text):
            # First run of digits in ``text`` as an int, 0 when there is none.
            found = re.match(r".*?(\d+).*", text)
            return int(found.group(1)) if found else 0

        article_item = JobBoleArticleItem()

        front_image_url = response.meta.get("front_image_url", "")  # cover image
        title = first_text(".entry-header h1::text")

        # Drop the surrounding whitespace and the "·" separator from the date.
        create_date = first_text(
            "p.entry-meta-hide-on-mobile::text").strip().replace(
                "·", "").strip()
        try:
            create_date = datetime.datetime.strptime(create_date,
                                                     "%Y/%m/%d").date()
        except ValueError:
            # Missing or unexpected date format: fall back to today.
            create_date = datetime.datetime.now().date()

        # All three counters are stored as ints.
        praise_nums = first_int(first_text(".vote-post-up h10::text"))
        fav_nums = first_int(first_text(".bookmark-btn::text"))
        comment_nums = first_int(
            first_text("a[href='#article-comment'] span::text"))

        content = first_text("div.entry")

        # The comment-count link sits among the tag links; filter it out.
        tag_list = response.css(
            "p.entry-meta-hide-on-mobile a::text").extract()
        tags = ",".join(
            element for element in tag_list
            if not element.strip().endswith("评论"))

        article_item["url_object_id"] = get_md5(response.url)
        article_item["title"] = title
        article_item["url"] = response.url
        article_item["create_date"] = create_date
        article_item["front_image_url"] = [front_image_url]
        article_item["praise_nums"] = praise_nums
        article_item["comment_nums"] = comment_nums
        article_item["fav_nums"] = fav_nums
        article_item["tags"] = tags
        article_item["content"] = content
        yield article_item
Пример #8
0
    def parse_detail(self, response):
        """Extract the article fields from a detail page and yield the item.

        Values are read with XPath expressions (the body text with a CSS
        selector). A query that matches nothing yields an empty string, so the
        counters fall back to 0 and the creation date falls back to today
        instead of the callback failing on ``None``.

        :param response: detail-page response; ``response.meta`` may carry
            ``front_image_url`` (the article cover image), "" when absent.
        """

        def first_text(xpath_query):
            # ``extract_first`` returns None without a default; pass "" so the
            # string methods below are always safe to call.
            return response.xpath(xpath_query).extract_first("")

        def first_int(text):
            # First run of digits in ``text`` as an int, 0 when there is none.
            found = re.match(r'.*?(\d+).*', text)
            return int(found.group(1)) if found else 0

        article_item = JobBoleArticleItem()

        # CSS equivalent: ".entry-header h1::text"
        title = first_text('//*[@class="entry-header"]/h1/text()').strip()

        # CSS equivalent: "p.entry-meta-hide-on-mobile::text"
        create_date = first_text(
            "//p[@class='entry-meta-hide-on-mobile']/text()").replace(
                "·", "").strip()
        try:
            create_date = datetime.datetime.strptime(create_date,
                                                     '%Y/%m/%d').date()
        except ValueError:
            # Missing or unexpected date format: fall back to today.
            create_date = datetime.datetime.now().date()

        front_image_url = response.meta.get("front_image_url", "")  # cover image

        # Vote count; CSS equivalent: ".vote-post-up h10::text"
        dianzan_nums = first_int(
            first_text(
                "//div[@class='post-adds']/span[1]/h10[1]/text()").strip())

        # Bookmark count; CSS equivalent: ".bookmark-btn::text"
        shoucang_nums = first_int(
            first_text("//div[@class='post-adds']/span[2]/text()").strip())

        # Comment count; CSS equivalent: 'a[href="#article-comment"] span::text'
        pinglun_nums = first_int(
            first_text(
                "//div[@class='post-adds']/a[1]/span[1]/text()").strip())

        # CSS equivalent: "p.entry-meta-hide-on-mobile a::text".
        # The comment-count link sits among the tag links; filter it out.
        tag_list = response.xpath(
            "//p[@class='entry-meta-hide-on-mobile']/a/text()").extract()
        tags = ",".join(
            element for element in tag_list
            if not element.strip().endswith("评论"))

        # Plain text of the article body, all text nodes joined.
        content = "".join(response.css("div.entry ::text").extract()).strip()

        article_item["title"] = title
        article_item["create_date"] = create_date
        article_item["url"] = response.url
        article_item["url_object_id"] = get_md5(response.url)
        # A list, not a set: keeps the value ordered and serialisable.
        article_item["front_image_url"] = [front_image_url]
        article_item["dianzan_nums"] = dianzan_nums
        article_item["shoucang_nums"] = shoucang_nums
        article_item["pinglun_nums"] = pinglun_nums
        article_item["tags"] = tags
        article_item["content"] = content

        yield article_item
Пример #9
0
    def parse_detail(self, response):
        """Load a ``JobBoleArticleItem`` from an article detail page and yield it.

        All fields are populated through an ``ItemLoader``: page values are
        located with CSS selectors, while the URL, its MD5 id and the cover
        image URL are added as plain values.

        :param response: detail-page response; ``response.meta`` may carry
            ``front_image_url`` (the article cover image), "" when absent.
        """
        # Cover image of the article, passed along in the request meta.
        front_image_url = response.meta.get("front_image_url", "")

        # Load the item through the item loader.
        item_loader = ItemLoader(item=JobBoleArticleItem(), response=response)
        item_loader.add_css("title", ".entry-header h1 ::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("comm_nums",
                            "a[href='#article-comment'] span::text")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        item_loader.add_css("content", "div.entry")

        article_item = item_loader.load_item()

        yield article_item
Пример #10
0
    def parse_detail(self, response):
        """Populate a ``JobBoleArticleItem`` from an article detail page.

        The cover image URL is read from ``response.meta`` under the key
        ``front_image_url`` (empty string when absent).  All fields are fed
        into an ``ArticleItemLoader`` either from CSS selectors or from
        directly supplied values, and the loaded item is yielded.

        Args:
            response: The response for the article detail page.

        Yields:
            The item produced by ``ArticleItemLoader.load_item()``.
        """
        cover_url = response.meta.get("front_image_url", "")
        loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                   response=response)

        # (loader method, item field, selector-or-value), applied in order.
        steps = (
            (loader.add_css, "title", ".entry-header h1::text"),
            (loader.add_value, "url", response.url),
            (loader.add_value, "url_object_id", get_md5(response.url)),
            (loader.add_css, "create_date",
             "p.entry-meta-hide-on-mobile::text"),
            # The cover URL is handed over as a one-element list.
            (loader.add_value, "front_image_url", [cover_url]),
            (loader.add_css, "praise_nums", ".vote-post-up h10::text"),
            (loader.add_css, "comment_nums",
             "a[href='#article-comment'] span::text"),
            (loader.add_css, "fav_nums", ".bookmark-btn::text"),
            (loader.add_css, "tags", "p.entry-meta-hide-on-mobile a::text"),
            (loader.add_css, "content", "div.entry"),
        )
        for add, field_name, source in steps:
            add(field_name, source)

        # Yielding the item passes it on to the item pipelines.
        yield loader.load_item()
Пример #11
0
    def parse_detail(self, response):
        """Extract article fields with CSS selectors and yield the item.

        Every field is read with ``extract_first("")`` so that a selector
        matching nothing yields an empty value instead of raising
        ``IndexError``.  The three counters (``praise_nums``, ``fav_nums``,
        ``comment_nums``) are all normalised to ``int``, falling back to 0
        when the text contains no digits.

        Args:
            response: The response for the article detail page; the cover
                image URL is read from ``response.meta["front_image_url"]``
                (empty string when absent).

        Yields:
            A populated ``JobBoleArticleItem``.
        """

        def first_int(text):
            """Return the first run of digits in *text* as an int, else 0."""
            # Raw string: "\d" in a normal literal is an invalid escape.
            found = re.match(r".*?(\d+).*", text)
            return int(found.group(1)) if found else 0

        article_item = JobBoleArticleItem()

        # Cover image URL; .get() avoids a KeyError when it was not passed.
        front_image_url = response.meta.get("front_image_url", "")

        title = response.css(".entry-header h1::text").extract_first("")

        # The date text is surrounded by whitespace and a "·" separator.
        create_date = response.css(
            "p.entry-meta-hide-on-mobile::text").extract_first("")
        create_date = create_date.strip().replace("·", "").strip()

        praise_nums = first_int(
            response.css(".vote-post-up h10::text").extract_first(""))
        fav_nums = first_int(
            response.css(".bookmark-btn::text").extract_first(""))
        comment_nums = first_int(
            response.css(
                "a[href='#article-comment'] span::text").extract_first(""))

        content = response.css("div.entry").extract_first("")

        # The tag links also contain an "N 评论" (comments) link; drop it.
        tag_list = response.css(
            "p.entry-meta-hide-on-mobile a::text").extract()
        tags = ",".join(
            element for element in tag_list
            if not element.strip().endswith("评论")
        )

        article_item["url_object_id"] = get_md5(response.url)
        article_item["title"] = title
        article_item["url"] = response.url
        article_item["create_date"] = create_date
        # The cover URL is stored as a one-element list.
        article_item["front_image_url"] = [front_image_url]
        article_item["praise_nums"] = praise_nums
        article_item["comment_nums"] = comment_nums
        article_item["fav_nums"] = fav_nums
        article_item["tags"] = tags
        article_item["content"] = content

        # Yielding the item passes it on to the item pipelines.
        yield article_item
Пример #12
0
    def parse_detail(self, response):
        """Load a ``JobBoleArticleItem`` from an article page via item loader.

        The cover image URL comes from ``response.meta`` (key
        ``front_image_url``, empty string when absent).  Title, creation
        date and read count are taken from CSS selectors, the content from
        an XPath expression, and the URL-derived fields are added as plain
        values.

        Args:
            response: The response for the article detail page.

        Yields:
            The item produced by ``ArticleItemLoader.load_item()``.
        """
        cover_url = response.meta.get("front_image_url", "")
        loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                   response=response)

        # Selector-based fields use add_css / add_xpath; values that come
        # straight from the response use add_value.
        steps = (
            (loader.add_css, "title", ".article-head h1::text"),
            (loader.add_value, "url", response.url),
            (loader.add_value, "url_object_id", get_md5(response.url)),
            (loader.add_css, "create_date",
             ".about-left span:nth-child(1)::text"),
            # The cover URL is handed over as a one-element list.
            (loader.add_value, "front_image_url", [cover_url]),
            # span:nth-child(2) selects the second span under .about-left.
            (loader.add_css, "read_nums",
             '.about-left span:nth-child(2)::text'),
            (loader.add_xpath, "content", '//div[@class="article-main"]'),
        )
        for add, field_name, source in steps:
            add(field_name, source)

        yield loader.load_item()
Пример #13
0
    def parse_func(self, response):
        """Fill a ``JobBoleArticleItem`` through ``ArticleItemLoader``.

        Fields populated: ``Title``, ``URL``, ``CreateTime``, ``VoteNum``,
        ``ArticleComment``, ``BookMarkNum`` and ``Content``.  This callback
        does not populate ``FrontImageUrl`` or ``FrontImageMD5``.

        Args:
            response: The response for the article detail page.

        Yields:
            The item produced by ``ArticleItemLoader.load_item()``.
        """
        loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                   response=response)

        loader.add_css("Title", ".entry-header h1::text")
        loader.add_value("URL", response.url)
        # Creation time is the only field read through XPath.
        loader.add_xpath(
            "CreateTime", '//p[@class="entry-meta-hide-on-mobile"]/text()')
        loader.add_css("VoteNum", ".vote-post-up h10::text")
        loader.add_css("ArticleComment",
                       "a[href='#article-comment'] span::text")
        loader.add_css("BookMarkNum", ".bookmark-btn::text")
        loader.add_css("Content", "div.entry")

        article_item = loader.load_item()
        yield article_item
Пример #14
0
    def parse_detail(self, response):
        """Load a ``JobBoleArticleItem`` from an article detail page.

        Extraction is expressed declaratively as CSS rules and direct values
        on an ``ArticleItemLoader``.  The cover image URL is read from
        ``response.meta`` (key ``front_image_url``, empty string when
        absent).

        Args:
            response: The response for the article detail page.

        Yields:
            The item produced by ``ArticleItemLoader.load_item()``.
        """
        # Cover image URL.
        front_image_url = response.meta.get('front_image_url', '')

        # No item is instantiated up front: load_item() below returns the
        # only item this callback yields.
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_css("create_date", ".entry-meta-hide-on-mobile::text")
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
        item_loader.add_css("content", "div.entry")
        item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        # The cover URL is passed as a one-element list.
        item_loader.add_value("front_image_url", [front_image_url])

        # Apply the collected rules and build the item.
        article_item = item_loader.load_item()

        # Yielding the item passes it on to the item pipelines.
        yield article_item

        """
Пример #15
0
    def parse_detail(self, response):
        """Load a ``JobBoleArticleItem`` from an article detail page.

        All fields are collected through an ``ArticleItemLoader`` from CSS
        selectors or direct values.  The cover image URL is read from
        ``response.meta`` (key ``front_image_url``, empty string when
        absent).

        Args:
            response: The response for the article detail page.

        Yields:
            The item produced by ``ArticleItemLoader.load_item()``.
        """
        # No item is instantiated up front: load_item() below returns the
        # only item this callback yields.
        front_image_url = response.meta.get("front_image_url", "")  # cover image
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                        response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        # NOTE(review): the original author remarked that the list wrapper
        # here may be unnecessary; kept as-is, confirm before changing.
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_css("comment_nums",
                            "a[href='#article-comment'] span::text")
        item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        item_loader.add_css("content", "div.entry")

        article_item = item_loader.load_item()

        # Yielding the item passes it on to the item pipelines.
        yield article_item
Пример #16
0
    def parse_detail(self, response):
        """Load a ``JobBoleArticleItem`` from an article detail page.

        Uses the custom ``ArticleItemLoader`` to collect every field from
        CSS selectors or direct values.  The cover image URL is read from
        ``response.meta`` (key ``front_image_url``, empty string when
        absent).

        Args:
            response: The response for the article detail page.

        Yields:
            The item produced by ``ArticleItemLoader.load_item()``.
        """
        # Cover image URL.
        front_image_url = response.meta.get("front_image_url", "")

        # No item is instantiated up front: load_item() below returns the
        # only item this callback yields.
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                        response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_value("url", response.url)
        # The object id is derived from the URL via get_md5.
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        # The cover URL is passed as a one-element list.
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("comment_nums",
                            "a[href='#article-comment'] span::text")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        item_loader.add_css("content", "div.entry")

        article_item = item_loader.load_item()

        yield article_item
Пример #17
0
    def parse_detail(self, response):
        """Load a ``JobBoleArticleItem`` from an article detail page.

        All fields are collected through an ``ArticleItemLoader`` from CSS
        selectors or direct values.  The cover image URL is read from
        ``response.meta`` (key ``front_image_url``, empty string when
        absent).

        Args:
            response: The response for the article detail page.

        Yields:
            The item produced by ``ArticleItemLoader.load_item()``.
        """
        # No item is instantiated up front: load_item() below returns the
        # only item this callback yields.
        front_image_url = response.meta.get("front_image_url", "")  # cover image
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        # The cover URL is passed as a one-element list.
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        # The comment count is read from the <span> inside the comment link.
        item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_css("tags", ".entry-meta-hide-on-mobile a::text")
        item_loader.add_css("content", "div.entry")

        article_item = item_loader.load_item()

        yield article_item
Пример #18
0
    def parse_detail(self, response):
        """Load a ``JobBoleArticleItem`` from an article detail page.

        All fields are collected through an ``ArticleItemLoader`` from CSS
        selectors or direct values.  The cover image URL is read from
        ``response.meta`` (key ``front_image_url``, empty string when
        absent) and is written to both ``front_image_url`` and
        ``front_image_path``.

        Args:
            response: The response for the article detail page.

        Yields:
            The item produced by ``ArticleItemLoader.load_item()``.
        """
        # No item is instantiated up front: load_item() below returns the
        # only item this callback yields.
        front_image_url = response.meta.get("front_image_url", "")
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                        response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        # The cover URL is passed as a one-element list.
        item_loader.add_value("front_image_url", [front_image_url])
        # NOTE(review): front_image_path receives the same URL list here;
        # presumably a later stage replaces it with a local path - confirm.
        item_loader.add_value("front_image_path", [front_image_url])
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("comment_nums",
                            "a[href='#article-comment'] span::text")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        item_loader.add_css("content", "div.entry")
        article_item = item_loader.load_item()
        yield article_item
Пример #19
0
    def parse_detail(self, response):
        """Extract the article fields from a detail page with CSS selectors.

        Fills a ``JobBoleArticleItem`` with the title, creation date, praise /
        favourite / comment counters, tags and content of the page, plus the
        page URL, its ``get_md5`` id and the cover image URL.

        Args:
            response: The detail-page response. ``response.meta`` may carry
                ``front_image_url``; an empty string is used when it is
                missing.

        Yields:
            The populated ``JobBoleArticleItem``.

        Raises:
            IndexError: If one of the indexed ``extract()[i]`` lookups finds
                fewer matches than expected (these lookups are unguarded).
        """

        def first_int(text):
            # Return the first run of digits in ``text`` as an int, or 0 when
            # the pattern does not match. Raw string: ``\d`` is not a valid
            # escape in a normal string literal.
            match_re = re.match(r".*?(\d+).*", text)
            return int(match_re.group(1)) if match_re else 0

        article_item = JobBoleArticleItem()

        # Cover image URL read from the response meta.
        front_image_url = response.meta.get("front_image_url", "")
        title = response.css(".entry-header h1::text").extract()[0]

        # The date text is surrounded by whitespace and a "·" separator.
        raw_date = response.css("p.entry-meta-hide-on-mobile::text").extract()[0]
        create_date = raw_date.strip().replace("·", "").strip()

        # NOTE(review): praise_nums is stored as the raw extracted string,
        # unlike the two counters below -- confirm downstream expects that.
        praise_nums = response.css(".vote-post-up h10::text").extract()[0]
        fav_nums = first_int(response.css(".bookmark-btn::text").extract()[0])

        # The comment counter is taken from the third ".hide-on-480" text node.
        comments_text = response.css(".hide-on-480::text").extract()[2].strip()
        comments_nums = first_int(comments_text)

        content = response.css("div.entry").extract()[0]

        # The tag links also contain an "N 评论" (comments) entry; drop it.
        tag_list = response.css(".entry-meta-hide-on-mobile a::text").extract()
        tag_list = [tag for tag in tag_list if not tag.strip().endswith("评论")]
        tags = ",".join(tag_list)

        article_item["url_object_id"] = get_md5(response.url)
        article_item["title"] = title
        article_item["url"] = response.url
        article_item["create_date"] = create_date
        article_item["front_image_url"] = [front_image_url]
        article_item["praise_nums"] = praise_nums
        article_item["fav_nums"] = fav_nums
        article_item["comments_nums"] = comments_nums
        article_item["tags"] = tags
        article_item["content"] = content

        yield article_item
Пример #20
0
    def parse_detail(self, response):
        """Build a ``JobBoleArticleItem`` from an article detail page.

        All fields are collected through ``ArticleItemLoader``: CSS selectors
        are registered for the title, creation date, the three counters and
        the content, while the URL, its MD5-based id and the cover image are
        added as plain values.

        Args:
            response: The detail-page response. ``response.meta`` may carry
                ``front_image_url``; an empty string is used when it is
                missing.

        Yields:
            The item produced by ``item_loader.load_item()``.
        """
        # Cover image URL read from the response meta.
        front_image_url = response.meta.get('front_image_url', '')

        item_loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                        response=response)
        item_loader.add_css('title', '.entry-header h1::text')
        item_loader.add_value('url', response.url)
        item_loader.add_value('url_object_id', get_md5(response.url))
        item_loader.add_css('create_date', 'p.entry-meta-hide-on-mobile::text')
        # front_image_path is not set here; per the original author's note it
        # is filled in pipelines.py.
        item_loader.add_value('front_image_url', [front_image_url])
        item_loader.add_css('prase_nums', '.vote-post-up h10::text')
        item_loader.add_css('fav_nums', '.bookmark-btn::text')
        item_loader.add_css('comments_nums',
                            "a[href='#article-comment'] span::text")
        item_loader.add_css('content', 'div.entry')
        # NOTE(review): no 'tags' field is loaded here -- confirm that is
        # intentional.

        yield item_loader.load_item()
Пример #21
0
    def parse_detail(self, response):
        """Populate a ``JobBoleArticleItem`` through ``ArticleItemLoad``.

        The item is loaded via an item loader rather than assigned field by
        field, which the original author found easier to maintain. CSS
        selectors are registered for the title, creation date, praise /
        comment / favourite counters, tags and content; the URL, its
        ``get_md5`` id and the cover image URL are added as plain values.

        Args:
            response: The detail-page response. ``response.meta`` may carry
                ``front_image_url``; an empty string is used when it is
                missing.

        Yields:
            The item returned by the loader's ``load_item()``.
        """
        # Cover image URL read from the response meta.
        cover_url = response.meta.get('front_image_url', '')

        loader = ArticleItemLoad(item=JobBoleArticleItem(), response=response)

        loader.add_css("title", "div.entry-header h1::text")
        loader.add_value("url", response.url)
        loader.add_value("url_object_id", get_md5(response.url))
        loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        loader.add_value("front_image_url", [cover_url])
        loader.add_css("praise_nums", ".vote-post-up h10::text")
        loader.add_css("comments_nums", 'a[href="#article-comment"] span::text')
        loader.add_css(
            "fav_nums",
            ".btn-bluet-bigger.href-style.bookmark-btn.register-user-only::text",
        )
        loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        loader.add_css("content", ".entry")

        # load_item() has to be called to turn the collected values into the
        # item.
        yield loader.load_item()