Пример #1
0
    def parse_details(self, response):
        """
        提取具体字段
        """
        # # 通过CSS选择器提取文章的具体字段,并添加到item中
        # title = response.css('.entry-header h1::text').extract_first()
        # create_date = response.css('.entry-meta-hide-on-mobile::text').extract_first().replace('·', '').strip()
        # # 数据库里定义的是date对象,所以这里要处理一下
        # try:
        #     create_date = self.pares_ymd(create_date)
        # except Exception as e:
        #     create_date = datetime.now().date()
        # tag = response.css('.entry-meta-hide-on-mobile a::text').extract()[-1]
        # front_image_url = response.meta.get("front_image_url", "")
        # content = response.css("div.entry").extract_first()
        # fav_nums = response.css(".bookmark-btn::text").extract()[0]
        # match_re = re.match(".*?(\d+).*", fav_nums)
        # if match_re:
        #     fav_nums = int(match_re.group(1))
        # else:
        #     fav_nums = 0
        #
        # # item对应字段填充值
        # article_item = JobBoleArticleItem()
        # article_item["title"] = title
        # article_item["url"] = response.url
        # article_item["create_date"] = create_date
        # article_item["url_object_id"] = get_md5(response.url)
        # article_item["tag"] = tag
        # article_item["front_image_url"] = [front_image_url]
        # article_item["content"] = content
        # article_item["fav_nums"] = fav_nums
        # article_item["front_image_path"] = " "

        # 通过item loader加载item
        # 文章封面图
        front_image_url = response.meta.get("front_image_url", "")
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                        response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_css("create_date", ".entry-meta-hide-on-mobile::text")
        item_loader.add_css("tag", ".entry-meta-hide-on-mobile a::text")
        item_loader.add_css("content", "div.entry")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        article_item = item_loader.load_item()
        # 调用后传递到pipelines.py
        yield article_item
Пример #2
0
 def parse_detail(self, response):
     article_item = QiushibaikeItem()
     # 通过item loader加载item
     front_image_url = response.meta.get("front_image_url", "")  # 文章封面图
     flagTrue = response.meta.get("flag", "")  # 标识
     original = "http://www.tuicool.com/" + response.css("span.from a::attr(href)").extract_first("")
     item_loader = ArticleItemLoader(item=QiushibaikeItem(), response=response)
     item_loader.add_css("title", ".article_row_fluid div:nth-child(1) h1::text")
     item_loader.add_value("url", response.url)
     item_loader.add_value("url_object_id", get_md5(response.url))
     item_loader.add_css("create_date", "span.timestamp::text")
     item_loader.add_value("front_image_url", [front_image_url])
     item_loader.add_value("sites", original)
     item_loader.add_value("flag", flagTrue)
     item_loader.add_css("original", "div.source a::text")
     item_loader.add_css("tags", "span.new-label::text")
     item_loader.add_css("content", "div.article_body")
     article_item = item_loader.load_item()
     yield article_item
Пример #3
0
    def parse_detail(self, response):
        article_item = JobBoleArticleItem()

        # xpath选择器,提取文章的具体字段
        # title = response.xpath('//div[@class="entry-header"]/h1/text()').extract_first()
        # creat_data = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/text()').extract_first().strip().replace("·", "")
        # praise_nums = response.xpath('//span[contains(@class,"vote-post-up")]/h10/text()').extract_first()
        # fav_nums = response.xpath('//span[contains(@class,"bookmark-btn ")]/text()').extract_first()
        # match_re = re.match(".*?(\d+).*", "fav_nums")
        # if match_re:
        #     fav_nums = match_re.group(1)
        # comment_nums = response.xpath('//a[@href="#article-comment"]/span/text()').extract_first()
        # match_re = re.match(".*?(\d+).*", "comment_nums")
        # if match_re:
        #     comment_nums = match_re.group(1)
        # content = response.xpath('//div[@class="entry"]').extract_first()
        # tag_list = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/a/text()').extract()
        # # endswith() 方法用于判断字符串是否以指定后缀结尾,如果以指定后缀结尾返回True,否则返回False
        # tag_list = [element for element in tag_list if not element.strip().endswith("评论")]
        # tags = ",".join(tag_list)  # .join() 连接字符串数组,将字符串、元组、列表中的元素以指定的字符(分隔符)连接生成一个新的字符串

        # # css选择器
        # title = response.css('.entry-header h1::text').extract_first("")
        # create_data = response.css('p.entry-meta-hide-on-mobile ::text').extract_first("").strip().replace("·", "")
        # praise_nums = response.css('.vote-post-up h10::text').extract_first("")
        # fav_nums = response.css('.bookmark-btn ::text').extract_first("")
        # match_re = re.match(".*?(\d+).*", "fav_nums")
        # if match_re:
        #     fav_nums = int(match_re.group(1))
        # else:
        #     fav_nums = 0
        # comment_nums = response.css('a[href="#article-comment"] ::text').extract_first("")
        # match_re = re.match(".*?(\d+).*", "comment_nums")
        # if match_re:
        #     comment_nums = int(match_re.group(1))
        # else:
        #     comment_nums = 0
        #
        # content = response.css('.entry').extract_first("")
        # tag_list = response.css('.entry-meta-hide-on-mobile a::text').extract()
        # tag_list = [element for element in tag_list if not element.strip().endswith("评论")]
        # tags = ",".join(tag_list)
        #
        # article_item["title"] = title
        # article_item["url"] = response.url
        # article_item["url_object_id"] = get_md5(response.url)
        # # try:
        # #     create_data = datetime.datetime.strptime(create_data, "%Y/%m/%d").date()
        # # except Exception as e:
        # #     create_data = datetime.datetime.now().date()
        # article_item["create_data"] = create_data
        # article_item["praise_nums"] = praise_nums
        # article_item["fav_nums"] = fav_nums
        # article_item["front_image_url"] = [front_image_url]
        # article_item["comment_nums"] = comment_nums
        # article_item["content"] = content
        # article_item["tags"] = tags

        # 通过item loader加载item
        front_image_url = response.meta.get("front_image_url", "")  # 文章封面图
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("create_data", "p.entry-meta-hide-on-mobile ::text")
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("fav_nums", ".bookmark-btn ::text")
        item_loader.add_css("comment_nums", "a[href='#article-comment'] ::text")
        item_loader.add_css("tags", ".entry-meta-hide-on-mobile a::text")
        item_loader.add_css("content", ".entry")
        article_item = item_loader.load_item()
        yield article_item
Пример #4
0
    def parse_detail(self, response):
        article_item = JobboleArticleItem()

        # title = response.css('div.entry-header > h1::text').extract_first()
        # create_date = response.css('.entry-meta-hide-on-mobile::text').extract_first().replace('·','').strip()
        # praise_num = response.css('.post-adds .vote-post-up h10::text').extract_first(0)
        front_img_url = response.meta.get('front_img_url', '')
        #
        # fav_num_info = response.css('.post-adds .bookmark-btn::text').extract_first()
        # fav_num_re = re.match(".*(\d+).*", fav_num_info)
        # if fav_num_re:
        #     fav_num = fav_num_re.group(1)
        # else:
        #     fav_num = 0
        # comment_num_info = response.css('a[href="#article-comment"] span::text').extract_first()
        # comment_num_re = re.findall("\d+",comment_num_info)
        # if comment_num_re:
        #     comment_num = comment_num_re[0]
        # else:
        #     comment_num = 0
        #
        # tag_list = response.css('.entry-meta .entry-meta-hide-on-mobile a::text').extract()
        # tags = ','.join([tag for tag in set(tag_list) if not tag.strip().endswith('评论')])
        # content = response.css('.entry').extract_first()
        #
        # article_item['url_object_id'] = get_md5(response.url)
        # article_item['url'] = response.url
        # article_item['title'] = title
        # try:
        #     create_date = datetime.strptime(create_date,'%Y/%m/%d').date()
        # except Exception as e:
        #     create_date = datetime.now()
        # article_item['create_date'] = create_date
        # article_item['praise_num'] = praise_num
        # article_item['fav_num'] = fav_num
        # article_item['comment_num'] = comment_num
        # article_item['front_img_url'] = [front_img_url]
        # article_item['tags'] = tags
        # article_item['content'] = content

        #通过item loader价值item
        item_loader = ArticleItemLoader(item=JobboleArticleItem(),
                                        response=response)
        item_loader.add_css('title', 'div.entry-header > h1::text')
        item_loader.add_css('create_date', '.entry-meta-hide-on-mobile::text')
        item_loader.add_css('praise_num', '.post-adds .vote-post-up h10::text')
        item_loader.add_css('fav_num', '.post-adds .bookmark-btn::text')  #re
        item_loader.add_css('comment_num',
                            'a[href="#article-comment"] span::text')  #re
        item_loader.add_css(
            'tag', '.entry-meta .entry-meta-hide-on-mobile a::text')  #处理函数
        item_loader.add_css('content', '.entry')
        item_loader.add_value('front_img_url', [front_img_url])
        item_loader.add_value('url', response.url)
        item_loader.add_value('url_object_id', get_md5(response.url))

        article_item = item_loader.load_item()

        yield article_item
Пример #5
0
    def parse_detail(self, response):
        article_item = JobBoleArticleItem()

        #提取文章的具体字段
        # title = response.xpath('//div[@class="entry-header"]/h1/text()').extract_first("")
        # create_date = response.xpath("//p[@class='entry-meta-hide-on-mobile']/text()").extract()[0].strip().replace("·","").strip()
        # praise_nums = response.xpath("//span[contains(@class, 'vote-post-up')]/h10/text()").extract()[0]
        # fav_nums = response.xpath("//span[contains(@class, 'bookmark-btn')]/text()").extract()[0]
        # match_re = re.match(".*?(\d+).*", fav_nums)
        # if match_re:
        #     fav_nums = match_re.group(1)
        #
        # comment_nums = response.xpath("//a[@href='#article-comment']/span/text()").extract()[0]
        # match_re = re.match(".*?(\d+).*", comment_nums)
        # if match_re:
        #     comment_nums = match_re.group(1)
        #
        # content = response.xpath("//div[@class='entry']").extract()[0]
        #
        # tag_list = response.xpath("//p[@class='entry-meta-hide-on-mobile']/a/text()").extract()
        # tag_list = [element for element in tag_list if not element.strip().endswith("评论")]
        # tags = ",".join(tag_list)

        #通过css选择器提取字段
        # front_image_url = response.meta.get("front_image_url", "")  #文章封面图
        # title = response.css(".entry-header h1::text").extract()[0]
        # create_date = response.css("p.entry-meta-hide-on-mobile::text").extract()[0].strip().replace("·","").strip()
        # praise_nums = response.css(".vote-post-up h10::text").extract()[0]
        # fav_nums = response.css(".bookmark-btn::text").extract()[0]
        # match_re = re.match(".*?(\d+).*", fav_nums)
        # if match_re:
        #     fav_nums = int(match_re.group(1))
        # else:
        #     fav_nums = 0
        #
        # comment_nums = response.css("a[href='#article-comment'] span::text").extract()[0]
        # match_re = re.match(".*?(\d+).*", comment_nums)
        # if match_re:
        #     comment_nums = int(match_re.group(1))
        # else:
        #     comment_nums = 0
        #
        # content = response.css("div.entry").extract()[0]
        #
        # tag_list = response.css("p.entry-meta-hide-on-mobile a::text").extract()
        # tag_list = [element for element in tag_list if not element.strip().endswith("评论")]
        # tags = ",".join(tag_list)
        #
        # article_item["url_object_id"] = get_md5(response.url)
        # article_item["title"] = title
        # article_item["url"] = response.url
        # try:
        #     create_date = datetime.datetime.strptime(create_date, "%Y/%m/%d").date()
        # except Exception as e:
        #     create_date = datetime.datetime.now().date()
        # article_item["create_date"] = create_date
        # article_item["front_image_url"] = [front_image_url]
        # article_item["praise_nums"] = praise_nums
        # article_item["comment_nums"] = comment_nums
        # article_item["fav_nums"] = fav_nums
        # article_item["tags"] = tags
        # article_item["content"] = content

        #通过item loader加载item
        front_image_url = response.meta.get("front_image_url", "")  # 文章封面图
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                        response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("comment_nums",
                            "a[href='#article-comment'] span::text")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        item_loader.add_css("content", "div.entry")

        article_item = item_loader.load_item()

        yield article_item
Пример #6
0
    def parse_detail(self, response):
        '''
        解析文章详情
        :param response:
        :return:
        '''

        # 实例化item
        article_item = JobBoleArticleItem()
        # 使用xpath来获取数据
        # #// *[ @ id = "post-114041"] / div[1] / h1
        # title = response.xpath("//*[@id='post-114041']/div/h1/text()").extract_first("")
        # create_date = response.xpath("//p[@class='entry-meta-hide-on-mobile']/text()").extract_first("").strip().replace("·","").strip()
        # praise_nums = response.xpath("//span[contains(@class,'vote-post-up')]/h10/text()").extract_first("")
        # fav_nums = response.xpath("//span[contains(@class,'bookmark-btn')]/text()").extract_first("")
        # match_re = re.match(".*?(\d+).*", fav_nums)
        # if match_re:
        #     fav_nums = match_re.group(1)
        # comment_nums = response.xpath("//a[href='#article-comment']/span/text()").extract_first("")
        # match_re = re.match(".*?(\d+).*", comment_nums)
        # if match_re:
        #     comment_nums = match_re.group(1)
        # content = response.xpath("//div[class='entry']").extract()extract_first("")
        # tag_list = response.xpath("//p[@class='entry-meta-hide-on-mobile']/a/text()").extract_first("")
        # tag_list = [element for element in tag_list if not element.strip().endswith("评论")]
        # tags = ",".join(tag_list)

        # 使用css选择器来获取数据

        # extract_first函数:extract返回的是一个数组,数组有可能为空,所以取第0个值得时候会报错,调用这个函数可以在取不到值得时候给一个默认值
        # title = response.css(".entry-header h1::text").extract_first("")
        # create_date = response.css(".entry-meta-hide-on-mobile::text").extract_first("").strip().replace("·","").strip()
        # praise_nums = response.css(".vote-post-up h10::text").extract_first("")
        # fav_nums = response.css(".bookmark-btn::text").extract_first("")
        # match_re = re.match(".*?(\d+).*", fav_nums)
        # if match_re:
        #     fav_nums = int(match_re.group(1))
        # else:
        #     fav_nums = 0
        # comment_nums = response.css("a[href='#article-comment'] span::text").extract_first("")
        # match_re = re.match(".*?(\d+).*", comment_nums)
        # if match_re:
        #     comment_nums = int(match_re.group(1))
        # else:
        #     comment_nums = 0
        # content = response.css("div.entry").extract_first("")
        # tag_list = response.css("p.entry-meta-hide-on-mobile a::text").extract()
        # tag_list = [element for element in tag_list if not element.strip().endswith("评论")]
        # tags = ",".join(tag_list)

        front_image_url = response.meta.get("front_image_url", "")

        # article_item["title"] = title
        # article_item["url"] = response.url
        # try:
        #     create_date = datetime.datetime.strptime(create_date, "%Y/%m/%d").date()
        # except Exception as e:
        #     create_date = datetime.datetime.now().date()
        # article_item["create_date"] = create_date
        # article_item["front_image_url"] = [front_image_url]
        # article_item["praise_nums"] = praise_nums
        # article_item["comment_nums"] = comment_nums
        # article_item["fav_nums"] = fav_nums
        # article_item["tags"] = tags
        # article_item["content"] = content
        # article_item["url_object_id"] = get_md5(response.url)
        # article_item["front_image_path"]
        #通过item loader加载item
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                        response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_css("create_date", ".entry-meta-hide-on-mobile::text")
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_css("comment_nums",
                            "a[href='#article-comment'] span::text")
        item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        item_loader.add_css("content", "div.entry")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_value("front_image_url", [front_image_url])

        article_item = item_loader.load_item()
        yield article_item
    def parse_detail(self, response):
        #提取文章具体字段(xpath)
#         title = response.xpath('//*[@id="post-113789"]/div[1]/h1/text()').extract()[0]
#         
#         create_date = response.xpath('//*[@id="post-113789"]/div[2]/p/text()[1]').extract()[0].strip().replace('·', '').strip()
#         
#         praise_nums = response.xpath('//*[@id="113789votetotal"]/text()').extract()
#         if praise_nums:
#             praise_nums = int(praise_nums[0])
#         else:
#             praise_nums = 0
#         
#         fav_nums = response.xpath('//*[@id="post-113789"]/div[3]/div[12]/span[2]/text()').extract()[0]
#         match_re = re.match(r'.*?(\d+).*', fav_nums)
#         if match_re:
#             fav_nums = int(match_re.group(1))
#         else:
#             fav_nums = 0
#         
#         comment_nums = response.xpath('//*[@id="post-113789"]/div[3]/div[12]/a/span/text()').extract()[0]
#         match_re = re.match(r'.*?(\d+).*', comment_nums)
#         if match_re:
#             comment_nums = int(match_re.group(1))
#         else:
#             comment_nums = 0
#         
#         content = response.xpath('//*[@id="post-113789"]/div[3]').extract()[0]
#         
#         tag_list = response.xpath('//*[@id="post-113789"]/div[2]/p/a/text()').extract()
#         tag_list = [element for element in tag_list if not element.strip().endswith('评论')] 
#         tags = ','.join(tag_list)
        
        
        
        #以下通过css选择器提取字段
#         article_item = JobboleArticleItem()  #实例化
#         
#         front_image_url = response.meta.get('front_image_url', '')  #get key=front_image_url 的值,如果没有key=front_image_url,回传''(空)
#         #文章封面图
#         
#         title = response.css('.entry-header h1::text').extract()[0]
#         
#         create_date = response.css('p.entry-meta-hide-on-mobile::text').extract()[0].strip().replace('·', '').strip()
#         
#         praise_nums = response.css('.vote-post-up h10::text').extract_first()
#         if praise_nums:
#             praise_nums = int(praise_nums[0])
#         else:
#             praise_nums = 0
#         
#         fav_nums = response.css('.bookmark-btn::text').extract()[0]
#         match_re = re.match(r'.*?(\d+).*', fav_nums)
#         if match_re:
#             fav_nums = int(match_re.group(1))
#         else:
#             fav_nums = 0
#         
#         comment_nums = response.css("a[href='#article-comment'] span::text").extract()[0]
#         match_re = re.match(r'.*?(\d+).*', comment_nums)
#         if match_re:
#             comment_nums = int(match_re.group(1))
#         else:
#             comment_nums = 0
#         
#         content = response.css("div.entry").extract()[0]
#         
#         tag_list = response.css("p.entry-meta-hide-on-mobile a::text").extract()
#         tag_list = [element for element in tag_list if not element.strip().endswith('评论')] 
#         tags = ','.join(tag_list)
#         
#         #填充值到items
#         article_item['title'] = title
#         article_item['url'] = response.url
#         article_item['url_object_id'] = get_md5(response.url)  #对url做MD5
#         
#         try:  #为了将文章的创建时间写入数据库,要把str类型的create_time转换为date类型
#             create_date = datetime.datetime.strptime(create_date, '%Y/%m/%d').date()  #将格式为%Y/%m/%d 的str类型转换为date类型
#         except Exception as e:
#             create_date = datetime.datetime.now().date()
#         article_item['create_date'] = create_date
#         
#         article_item['front_image_url'] = [front_image_url]  #images需要接受一个数组
#         article_item['praise_nums'] = praise_nums
#         article_item['fav_nums'] = fav_nums
#         article_item['comment_nums'] = comment_nums
#         article_item['tags'] = tags
#         article_item['content'] = content
        
        #通过itemLoader加载item
        front_image_url = response.meta.get('front_image_url', '')  #get key=front_image_url 的值,如果没有key=front_image_url,回传''(空)
        #item_loader = ItemLoader(item=JobboleArticleItem(), response=response)  #定义ItemLoader实例
        item_loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)  #改用自定义的 ItemLoader
#         ItemLoader.add_css(self, field_name, css)
#         ItemLoader.add_xpath(self, field_name, xpath)
#         ItemLoader._add_value(self, field_name, value)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        item_loader.add_css("content", "div.entry")
        
        article_item = item_loader.load_item()
        #调用默认的item方法的话会有两个问题:1.值都是list 2.还需要对取出的值行进处理(做re的提取等)
        #-->去修改items.py  #1.在items.py 的Field()里面用TakeFirst进行处理  2.在items.py 的Field()里面用MapCompose进行处理
        
        yield article_item  #调用yield之后,item会传递到pipelines.py
Пример #8
0
 def parse_detail(self, response):
   item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
   # 通过item loader加载item
   front_image_url = response.meta.get("front_image_url", "")  # 文章封面图
   item_loader.add_css("title", ".entry-header h1::text")
   item_loader.add_value("url", response.url)
   item_loader.add_value("url_object_id", cutils.get_md5(response.url))
   item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
   item_loader.add_value("front_image_url", [front_image_url])
   item_loader.add_css("praise_nums", ".vote-post-up h10::text")
   item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
   item_loader.add_css("fav_nums", ".bookmark-btn::text")
   item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
   item_loader.add_css("content", "div.entry")
   yield item_loader.load_item()
Пример #9
0
    def parse_question(self, response):
        zhihu_id = response.meta.get("question_id", "")
        question_item_loader = ArticleItemLoader(item=ZhihuQuestionItem(),
                                                 response=response)
        question_item_loader.add_css("title", "h1.QuestionHeader-title::text")
        question_item_loader.add_css("content", ".QuestionHeader-detail")
        question_item_loader.add_value("url", response.url)
        question_item_loader.add_value("zhihu_id", zhihu_id)
        question_item_loader.add_css("answer_num",
                                     ".List-headerText span::text")
        question_item_loader.add_css("comments_num",
                                     ".QuestionHeader-Comment button::text")
        question_item_loader.add_css("watch_user_num",
                                     ".NumberBoard-itemValue::text")
        question_item_loader.add_css(
            "topics", ".QuestionHeader-topics .Popover div::text")
        question_item = question_item_loader.load_item()

        yield scrapy.Request(self.start_answer_url.format(zhihu_id, 20, 0),
                             headers=self.headers,
                             callback=self.parse_answer)

        yield question_item
Пример #10
0
    def parse_content(self, response):
        # 通过css选择器提取数据
        # front_image_url = response.meta.get("front_image_url", "") #文章封面图
        # title = response.css('.entry-header h1::text').extract_first()
        # create_date = response.css('p.entry-meta-hide-on-mobile::text').extract_first().replace("·","").strip()
        # praise_num = response.css('.vote-post-up h10::text').extract_first() #点赞数
        # fav_num = response.css('.bookmark-btn::text').extract_first() #收藏数
        # match_re = re.match(".*?(\d+).*", fav_num)
        # if match_re:
        #     fav_num = int(match_re.group(1))
        # else:
        #     fav_num = 0
        # comments_num = response.css('a[href="#article-comment"] span::text').extract_first() # 评论数
        # match_re = re.match(".*?(\d+).*", comments_num)  # 正则获取字符串中的数字
        # if match_re:
        #     comments_num = int(match_re.group(1))
        # else:
        #     comments_num = 0
        # content = response.css('div.entry').extract_first() # 正文
        # tag_selecter = response.css('p.entry-meta-hide-on-mobile a::text').extract()
        # tag_list = [element for element in tag_selecter if not element.strip().endswith('评论')]
        # tags = ",".join(tag_list)  # 标签
        #
        # article_item = JobboleArticleItem()
        # article_item["title"] = title
        # try:
        #     create_date = datetime.strptime(create_date, '%Y/%m/%d').date()
        # except Exception as e:
        #     create_date = datetime.now().date()
        # article_item["create_date"] = create_date
        # article_item["url"] = response.url
        # article_item["url_object_id"] = get_md5(response.url)
        # article_item["front_image_url"] = [front_image_url]
        # article_item["praise_nums"] = praise_num
        # article_item["comment_nums"] = comments_num
        # article_item["fav_nums"] = fav_num
        # article_item["tags"] = tags
        # article_item["content"] = content

        # 通过item loader加载item  使用自定义的loader:ArticleItemLoader 由list变成str
        front_image_url = response.meta.get("front_image_url", "")  # 文章封面图
        item_loader = ArticleItemLoader(item=JobboleArticleItem(), response = response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_css("create_date","p.entry-meta-hide-on-mobile::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        item_loader.add_css("content", "div.entry")

        article_item = item_loader.load_item()

        yield article_item
Пример #11
0
    def parse_detail(self, response):

        # 通过itemloder
        front_image_url = response.meta.get('front_image_url', '')
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                        response=response)
        item_loader.add_css('title', '.entry-header h1::text')
        item_loader.add_value('url_object_id', get_md5(response.url))
        item_loader.add_css('create_date', '.entry-meta-hide-on-mobile::text')
        item_loader.add_value('url', response.url)
        item_loader.add_value('front_image_url', [front_image_url])
        item_loader.add_css('praise_num', '.vote-post-up h10::text')
        item_loader.add_css('fav_num', '.bookmark-btn::text')
        item_loader.add_css('com_num', "a[href='#article-comment'] span::text")
        item_loader.add_css('content', 'div.entry')
        item_loader.add_css('tags', 'p.entry-meta-hide-on-mobile a::text')

        article_item = item_loader.load_item()

        yield article_item
Пример #12
0
    def parse_detail(self, response):
        """
        提取文章的具体字段, 回调函数
        :param response:
        :return:
        """
        article_item = JobBoleArticleItem()
        # 文章封面图
        front_image_url = response.meta.get("front_image_url", "")

        """
        '''
        # 方法一:【通过XPath提取字段】
        # 可以在浏览器inspect html里copy Xpath
        # chrome return 和 firefox return 可能不一样。 有时直接copy的值无法获得数据, 因为获取的是动态html而不是原始html
        
        # 标题
        # extract_first() 就是 extract()[0],还可以传一个default值
        title = response.xpath('//div[@class="entry-header"]/h1/text()').extract_first("")
        # 创建时间
        create_date = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/text()').extract_first("").strip().replace("·", "").strip()
        # 点赞数
        vote_numbers = int(response.xpath('//span[contains(@class, "vote-post-up")]/h10/text()').extract_first(""))
        # 收藏数
        bookmark_numbers = response.xpath('//span[contains(@class, "bookmark-btn")]/text()').extract_first("")
        match_re = re.match(r".*?(\d+).*", bookmark_numbers)
        if match_re:
            bookmark_numbers = match_re.group(1)
        # 评论数
        comment_numbers = response.xpath("//a[@href='#article-comment']/span/text()").extract_first("")
        match_re = re.match(r".*?(\d+).*", comment_numbers)
        if match_re:
            comment_numbers = match_re.group(1)
        # 正文 (不提取text而是提取整个html结构)
        content = response.xpath("//div[@class='entry']").extract_first("")
        # 标签
        tags = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/a/text()').extract()
        tags = ','.join([item for item in tags if not item.strip().endswith('评论')])
        '''
        # 方法二:【通过CSS选择器提取字段】
        # 标题
        # ::text 代表去text
        title = response.css(".entry-header h1::text").extract_first("")
        # 创建时间
        create_date = response.css('p.entry-meta-hide-on-mobile::text').extract_first("").strip().replace("·","").strip()
        # 点赞数
        vote_numbers = int(response.css(".vote-post-up h10::text").extract_first(""))
        # 收藏数
        bookmark_numbers = response.css(".bookmark-btn::text").extract_first("")
        match_re = re.match(r".*?(\d+).*", bookmark_numbers)
        if match_re:
            bookmark_numbers = int(match_re.group(1))
        else:
            bookmark_numbers = 0
        # 评论数
        comment_numbers = response.css("a[href='#article-comment'] span::text").extract_first("")
        match_re = re.match(r".*?(\d+).*", comment_numbers)
        if match_re:
            comment_numbers = int(match_re.group(1))
        else:
            comment_numbers = 0
        # 正文 (不提取text而是提取整个html结构)
        content = response.css("div.entry").extract_first("")
        # 标签
        tags = response.css('p.entry-meta-hide-on-mobile a::text').extract()
        tags = ','.join([item for item in tags if not item.strip().endswith('评论')])

        # 赋值生成item
        article_item['url_object_id'] = get_md5(response.url)
        article_item['title'] = title
        article_item['url'] = response.url
        try:
            create_date = datetime.datetime.strptime(create_date, "%Y/%m/%d").date()
        except Exception as e:
            create_date = datetime.datetime.now().date()
        article_item['create_date'] = create_date
        # scrapy 的image下载接受的是数组
        article_item['front_image_url'] = [front_image_url]
        article_item['vote_numbers'] = vote_numbers
        article_item['bookmark_numbers'] = bookmark_numbers
        article_item['comment_numbers'] = comment_numbers
        article_item['tags'] = tags
        article_item['content'] = content
        """

        # 通过item loader 加载 item
        # 更加简洁可配置
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_css("vote_numbers", ".vote-post-up h10::text")
        item_loader.add_css("bookmark_numbers", ".bookmark-btn::text")
        item_loader.add_css("comment_numbers", "a[href='#article-comment'] span::text")
        item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        item_loader.add_css("content", "div.entry")

        article_item = item_loader.load_item()

        yield article_item
Пример #13
0
    def parse_job(self, response):
        item_loader = ArticleItemLoader(item=LagouJobItem(), response=response)
        item_loader.add_css("title", ".job-name span::text")
        item_loader.add_value("url", response.url)
        item_loader.add_css("salary", ".salary::text")
        item_loader.add_xpath("job_city",
                              "//*[@class='job_request']/p/span[2]/text()")
        item_loader.add_xpath("work_years",
                              "//*[@class='job_request']/p/span[3]/text()")
        item_loader.add_xpath("degree_need",
                              "//*[@class='job_request']/p/span[4]/text()")
        item_loader.add_xpath("job_type",
                              "//*[@class='job_request']/p/span[5]/text()")

        item_loader.add_css("publish_time", ".publish_time::text")
        item_loader.add_css("job_advantage", ".job-advantage p::text")
        item_loader.add_css("job_desc", ".job_bt div")
        item_loader.add_css("job_addr", ".work_addr")

        item_loader.add_css("company_url", "#job_company dt a::attr(href)")
        item_loader.add_css("company_name", "#job_company dt a div h2::text")

        job_item = item_loader.load_item()

        return job_item