Example #1
    def parse_job(self, response):
        item_loader = ArticleItemLoader(item=LagouJobItem(), response=response)
        item_loader.add_css("title", ".job-name span::text")
        item_loader.add_value("url", response.url)
        item_loader.add_css("salary", ".salary::text")
        item_loader.add_xpath("job_city",
                              "//*[@class='job_request']/p/span[2]/text()")
        item_loader.add_xpath("work_years",
                              "//*[@class='job_request']/p/span[3]/text()")
        item_loader.add_xpath("degree_need",
                              "//*[@class='job_request']/p/span[4]/text()")
        item_loader.add_xpath("job_type",
                              "//*[@class='job_request']/p/span[5]/text()")

        item_loader.add_css("publish_time", ".publish_time::text")
        item_loader.add_css("job_advantage", ".job-advantage p::text")
        item_loader.add_css("job_desc", ".job_bt div")
        item_loader.add_css("job_addr", ".work_addr")

        item_loader.add_css("company_url", "#job_company dt a::attr(href)")
        item_loader.add_css("company_name", "#job_company dt a div h2::text")

        job_item = item_loader.load_item()

        return job_item
Example #2
    def get_detail_use_item_loader(self, response):
        '''
        Build the item with an ItemLoader; note that the fields collected this way come back as lists.
        :return:
        '''
        article_item = JobBoleArticleItem()
        item_loader = ArticleItemLoader(item=article_item, response=response)

        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))

        front_image_url = response.meta.get("front_image_url", "")  # article cover image
        item_loader.add_value("front_image_url", [front_image_url])

        item_loader.add_xpath("title",
                              "//div[@class = 'entry-header']/h1/text()")  # 标题
        item_loader.add_xpath("create_date",
                              "//div[@class='entry-meta']/p/text()")
        item_loader.add_xpath("praise_nums",
                              "//div[@class='post-adds']//h10/text()")  # 点赞数
        item_loader.add_xpath(
            "fav_nums", "//div[@class='post-adds']/span[2]/text()")  # 收藏数
        item_loader.add_xpath(
            "comment_nums",
            "//span[@class='btn-bluet-bigger href-style hide-on-480']/text()"
        )  # 评论数
        item_loader.add_xpath("content", "//div[@class='entry']")  # 内容
        item_loader.add_xpath(
            "tags", "//p[@class='entry-meta-hide-on-mobile']/a/text()")  # 内容

        article_item = item_loader.load_item()

        yield article_item
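
A note on the loaders used in these examples: the docstring in Example #2 points out that a plain ItemLoader returns every field as a list. The ArticleItemLoader referenced throughout is a custom subclass defined in each project's own code; a minimal sketch of what such a loader typically looks like (the class body here is an assumption; ItemLoader and TakeFirst are the actual Scrapy APIs) is:

    # Hypothetical sketch of a custom loader like the ArticleItemLoader above.
    from scrapy.loader import ItemLoader
    from scrapy.loader.processors import TakeFirst  # itemloaders.processors on newer Scrapy

    class ArticleItemLoader(ItemLoader):
        # TakeFirst() unwraps the one-element lists that ItemLoader collects,
        # so load_item() returns plain values for every field.
        default_output_processor = TakeFirst()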
Example #3
 def parse(self, response):
     self.brower.get(response.url)
     self.brower.execute_script("window.scrollTo(0,250)")
     for i in range(1, 16):
         selector = self.brower.find_element_by_xpath("html/body/div[5]/div[1]/div[1]/div/ul/li[{0}]".format(i))
         ActionChains(self.brower).move_to_element(selector).perform()
     """
     1、获取当前页面
     2、获取到下一页的Url 并交给scrapy下载 下载完成后交给Parse
     """
     selector_response = Selector(text=self.brower.page_source)
     #self.brower.quit()
     items_pop = selector_response.css(".cate_detail_item")
     for items in items_pop:
         # main category
         pro = items.css(".cate_detail_tit_lk")
         pro_name = pro.css("::text").extract_first("")
         pro_url = parse.urljoin(response.url, pro.css("::attr(href)").extract_first(""))
         links = items.css(".cate_detail_con_lk")
         for link in links:
             # sub-category
             name = link.css("::text").extract_first("")
             url = parse.urljoin(response.url,link.css("::attr(href)").extract_first(""))
             # populate the Item
             item_loader = ArticleItemLoader(item=JDIndexItem(), response=response)
             item_loader.add_value("index_name", str(name))
             item_loader.add_value("url", str(url))
             item_loader.add_value("pro_name", str(pro_name))
             item_loader.add_value("pro_url", str(pro_url))
             article_item = item_loader.load_item()
             yield article_item
             yield Request(url= url, meta={"name": name}, callback=self.parse_detail, dont_filter=True)
Example #4
 def parse_detail(self, response):
     self.brower.get(response.url)
     self.brower.execute_script("window.scrollTo(0,document.body.scrollHeight-1000)")
     import time
     time.sleep(2)
     selecter=Selector(text=self.brower.page_source)
      # parse the detail page
      sort_name = response.meta.get("name", "")
      # load the item via an ItemLoader
     detail_items = selecter.css(".gl-i-wrap.j-sku-item")
     for detail_item in detail_items:
        uid = uuid.uuid4()
        item_loader = ArticleItemLoader(item=JDDetailItem(), response=response)
        url = parse.urljoin(response.url, detail_item.css("div.gl-i-wrap.j-sku-item div.p-img a::attr(href)").extract_first())
        price = detail_item.css("div.p-price strong i::text").extract_first("")
        name = detail_item.css("div.p-name em::text").extract_first("")
        commit = detail_item.css("div.p-commit span.buy-score em::text").extract_first("暂无推荐指数")
        jd_type = detail_item.css("div.p-icons.J-pro-icons i::text").extract_first("")
        shopname = detail_item.css("div.p-shop span a::text").extract_first("")
        item_loader.add_value("uid", uid)
        item_loader.add_value("url", url)
        item_loader.add_value("price", price)
        item_loader.add_value("jdname", name)
        item_loader.add_value("jdcommit", commit)
        item_loader.add_value("jdtype", jd_type)
        item_loader.add_value("shopname", shopname)
        item_loader.add_value("sort_name", sort_name)
        yield item_loader.load_item()
        yield Request(url=url, meta={"detail_url": url, "uid": uid}, callback=self.parse_comment, dont_filter=True)
     next_url= parse.urljoin(response.url, response.css(".pn-next::attr(href)").extract_first(""))
     if next_url != "":
         yield Request(url=next_url, callback=self.parse_detail, meta={"name": sort_name}, dont_filter=True)
Example #5
    def parse_detail(self, response):

        #article_item=JobBoleArticleItem()

        # article_item['title']=response.xpath('//div[@class="entry-header"]/h1/text()').extract()[0]
        # #
        # # # Extract the value from meta; using .get() avoids an error when the key is missing, with '' as the default
        # # # The image url must be a list, otherwise the automatic image downloader (IMAGES_URLS_FIELD in settings) raises an error
        # article_item['front_image_url']=[response.meta.get('front_image_url','')]
        # date_time=re.match('.*?(\d{4}/\d+/\d+).*',response.xpath('//div/p[@class="entry-meta-hide-on-mobile"]/text()[1]').extract()[0])
        # try:
        #     article_item['create_date']=datetime.datetime.strptime(date_time,'%Y/%m/%d').date()
        # except Exception as e:
        #     article_item['create_date']=datetime.datetime.now().date()
        # article_item['tag']=','.join(response.xpath('//div/p[@class="entry-meta-hide-on-mobile"]/a/text()').extract())
        # article_item['content']=''.join(response.xpath('//div[@class="entry"]/p/text()').extract())
        # article_item['praise_nums']=response.xpath('//div[@class="post-adds"]/span/h10/text()').extract()[0]
        # fav_num=response.xpath('//div[@class="post-adds"]/span[2]/text()').extract()[0]
        # match_re=re.match(".*?(\d+).*",fav_num)
        # if match_re:
        #     article_item['fav_nums']=match_re.group(1)
        # else:
        #     article_item['fav_nums']=0
        # comment_num=response.xpath('//div[@class="post-adds"]/a/span/text()').extract()[0]
        # match_re = re.match(".*?(\d+).*", comment_num)
        # if match_re:
        #     article_item['comment_nums'] = match_re.group(1)
        # else:
        #     article_item['comment_nums']=0
        # article_item['url_object_id'] =common.get_md5(response.url)

        # Load the item with an ItemLoader
        #item_loader=ItemLoader(item=JobBoleArticleItem(),response=response)
        # Use the custom ArticleItemLoader instead
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                        response=response)
        item_loader.add_xpath('title',
                              '//div[@class="entry-header"]/h1/text()')
        item_loader.add_xpath(
            'tag', '//div/p[@class="entry-meta-hide-on-mobile"]/a/text()')
        item_loader.add_value('front_image_url',
                              [response.meta.get('front_image_url', '')])
        item_loader.add_value('url', response.url)
        item_loader.add_xpath('content', '//div[@class="entry"]/p/text()')
        item_loader.add_xpath('praise_nums',
                              '//div[@class="post-adds"]/span/h10/text()')
        item_loader.add_xpath('comment_nums',
                              '//div[@class="post-adds"]/a/span/text()')
        item_loader.add_xpath('fav_nums',
                              '//div[@class="post-adds"]/span[2]/text()')
        item_loader.add_xpath(
            'create_date',
            '//div/p[@class="entry-meta-hide-on-mobile"]/text()[1]')
        item_loader.add_value('url_object_id', common.get_md5(response.url))

        article_item = item_loader.load_item()

        yield article_item
        pass
Example #6
    def parse_detail(self, response):
        article_item = JobboleArticleItem()

        # title = response.css('div.entry-header > h1::text').extract_first()
        # create_date = response.css('.entry-meta-hide-on-mobile::text').extract_first().replace('·','').strip()
        # praise_num = response.css('.post-adds .vote-post-up h10::text').extract_first(0)
        front_img_url = response.meta.get('front_img_url', '')
        #
        # fav_num_info = response.css('.post-adds .bookmark-btn::text').extract_first()
        # fav_num_re = re.match(".*(\d+).*", fav_num_info)
        # if fav_num_re:
        #     fav_num = fav_num_re.group(1)
        # else:
        #     fav_num = 0
        # comment_num_info = response.css('a[href="#article-comment"] span::text').extract_first()
        # comment_num_re = re.findall("\d+",comment_num_info)
        # if comment_num_re:
        #     comment_num = comment_num_re[0]
        # else:
        #     comment_num = 0
        #
        # tag_list = response.css('.entry-meta .entry-meta-hide-on-mobile a::text').extract()
        # tags = ','.join([tag for tag in set(tag_list) if not tag.strip().endswith('评论')])
        # content = response.css('.entry').extract_first()
        #
        # article_item['url_object_id'] = get_md5(response.url)
        # article_item['url'] = response.url
        # article_item['title'] = title
        # try:
        #     create_date = datetime.strptime(create_date,'%Y/%m/%d').date()
        # except Exception as e:
        #     create_date = datetime.now()
        # article_item['create_date'] = create_date
        # article_item['praise_num'] = praise_num
        # article_item['fav_num'] = fav_num
        # article_item['comment_num'] = comment_num
        # article_item['front_img_url'] = [front_img_url]
        # article_item['tags'] = tags
        # article_item['content'] = content

        # Load the item with an ItemLoader
        item_loader = ArticleItemLoader(item=JobboleArticleItem(),
                                        response=response)
        item_loader.add_css('title', 'div.entry-header > h1::text')
        item_loader.add_css('create_date', '.entry-meta-hide-on-mobile::text')
        item_loader.add_css('praise_num', '.post-adds .vote-post-up h10::text')
        item_loader.add_css('fav_num', '.post-adds .bookmark-btn::text')  #re
        item_loader.add_css('comment_num',
                            'a[href="#article-comment"] span::text')  #re
        item_loader.add_css(
            'tag', '.entry-meta .entry-meta-hide-on-mobile a::text')  # handled by a processor function
        item_loader.add_css('content', '.entry')
        item_loader.add_value('front_img_url', [front_img_url])
        item_loader.add_value('url', response.url)
        item_loader.add_value('url_object_id', get_md5(response.url))

        article_item = item_loader.load_item()

        yield article_item
Example #7
    def parse_content(self, response):
        # Extract the data with CSS selectors
        # front_image_url = response.meta.get("front_image_url", "") # article cover image
        # title = response.css('.entry-header h1::text').extract_first()
        # create_date = response.css('p.entry-meta-hide-on-mobile::text').extract_first().replace("·","").strip()
        # praise_num = response.css('.vote-post-up h10::text').extract_first() # number of likes
        # fav_num = response.css('.bookmark-btn::text').extract_first() # number of favorites
        # match_re = re.match(".*?(\d+).*", fav_num)
        # if match_re:
        #     fav_num = int(match_re.group(1))
        # else:
        #     fav_num = 0
        # comments_num = response.css('a[href="#article-comment"] span::text').extract_first() # number of comments
        # match_re = re.match(".*?(\d+).*", comments_num)  # pull the digits out of the string with a regex
        # if match_re:
        #     comments_num = int(match_re.group(1))
        # else:
        #     comments_num = 0
        # content = response.css('div.entry').extract_first() # article body
        # tag_selecter = response.css('p.entry-meta-hide-on-mobile a::text').extract()
        # tag_list = [element for element in tag_selecter if not element.strip().endswith('评论')]
        # tags = ",".join(tag_list)  # 标签
        #
        # article_item = JobboleArticleItem()
        # article_item["title"] = title
        # try:
        #     create_date = datetime.strptime(create_date, '%Y/%m/%d').date()
        # except Exception as e:
        #     create_date = datetime.now().date()
        # article_item["create_date"] = create_date
        # article_item["url"] = response.url
        # article_item["url_object_id"] = get_md5(response.url)
        # article_item["front_image_url"] = [front_image_url]
        # article_item["praise_nums"] = praise_num
        # article_item["comment_nums"] = comments_num
        # article_item["fav_nums"] = fav_num
        # article_item["tags"] = tags
        # article_item["content"] = content

        # Load the item with the custom loader ArticleItemLoader, which turns the lists into strings
        front_image_url = response.meta.get("front_image_url", "")  # article cover image
        item_loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_css("create_date","p.entry-meta-hide-on-mobile::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        item_loader.add_css("content", "div.entry")

        article_item = item_loader.load_item()

        yield article_item
Example #8
    def parse_details(self, response):
        """
        提取具体字段
        """
        # # 通过CSS选择器提取文章的具体字段,并添加到item中
        # title = response.css('.entry-header h1::text').extract_first()
        # create_date = response.css('.entry-meta-hide-on-mobile::text').extract_first().replace('·', '').strip()
        # # The database column is defined as a date object, so convert the string here
        # try:
        #     create_date = self.pares_ymd(create_date)
        # except Exception as e:
        #     create_date = datetime.now().date()
        # tag = response.css('.entry-meta-hide-on-mobile a::text').extract()[-1]
        # front_image_url = response.meta.get("front_image_url", "")
        # content = response.css("div.entry").extract_first()
        # fav_nums = response.css(".bookmark-btn::text").extract()[0]
        # match_re = re.match(".*?(\d+).*", fav_nums)
        # if match_re:
        #     fav_nums = int(match_re.group(1))
        # else:
        #     fav_nums = 0
        #
        # # Fill the corresponding item fields
        # article_item = JobBoleArticleItem()
        # article_item["title"] = title
        # article_item["url"] = response.url
        # article_item["create_date"] = create_date
        # article_item["url_object_id"] = get_md5(response.url)
        # article_item["tag"] = tag
        # article_item["front_image_url"] = [front_image_url]
        # article_item["content"] = content
        # article_item["fav_nums"] = fav_nums
        # article_item["front_image_path"] = " "

        # Load the item with an ItemLoader
        # article cover image
        front_image_url = response.meta.get("front_image_url", "")
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                        response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_css("create_date", ".entry-meta-hide-on-mobile::text")
        item_loader.add_css("tag", ".entry-meta-hide-on-mobile a::text")
        item_loader.add_css("content", "div.entry")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        article_item = item_loader.load_item()
        # After this call the item is passed on to pipelines.py
        yield article_item
Example #9
 def parse_detail(self, response):
   item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
   # Load the item with an ItemLoader
   front_image_url = response.meta.get("front_image_url", "")  # article cover image
   item_loader.add_css("title", ".entry-header h1::text")
   item_loader.add_value("url", response.url)
   item_loader.add_value("url_object_id", cutils.get_md5(response.url))
   item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
   item_loader.add_value("front_image_url", [front_image_url])
   item_loader.add_css("praise_nums", ".vote-post-up h10::text")
   item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
   item_loader.add_css("fav_nums", ".bookmark-btn::text")
   item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
   item_loader.add_css("content", "div.entry")
   yield item_loader.load_item()
Example #10
 def parse_comment(self, response):
      # url of the product from the previous step
      detail_url = response.meta.get("detail_url", "")
      # foreign key of the product from the previous step
      uid = response.meta.get("uid", "")
     self.brower.get(detail_url)
     self.brower.execute_script("window.scrollTo(0,1000)")
     self.brower.find_element_by_css_selector("li[data-anchor=\"#comment\"]").click()
     import time
     time.sleep(2)
     select=Selector(text=self.brower.page_source)
      # product summary
      shopItems = select.css("ul.parameter2.p-parameter-list li")
      shopMes = ""
      for shopItem in shopItems:
          # default to "" so a missing text node does not concatenate None
          text = shopItem.css("::text").extract_first("")
          shopMes += text + ";"
     buySourse=select.xpath(".//*[@id='buy-rate']/a/text()").extract_first("无购买指数")
      # cumulative proportion of each rating
      items = ""
     buyItems=select.css("ul.filter-list li a")
     for index in range(len(buyItems)):
         items+=buyItems[index].css("::text").extract_first()
         items+=buyItems[index].css("em::text").extract_first()+";"
      # product score
      goodtext = select.css("div.comment-percent strong.percent-tit::text").extract_first("未获取到!")
      source = select.xpath(".//*[@id='comment']/div[2]/div[1]/div[1]/div/text()").extract_first("100") + "%"
      # product rating score
      shopSourse = goodtext + ":" + source
      shopMessage = select.css("div.tag-list span")
      # detailed parameters of the product reviews
      ShopParameter = ""
     if shopMessage:
         for shopmess in shopMessage:
             ShopParameter+=shopmess.css("::text").extract_first()+";"
      else:
          ShopParameter = "暂无评论记录"
      # add the data
      items_loader = ArticleItemLoader(item=JDCommentItem(), response=response)
      items_loader.add_value("uid", uid)
      items_loader.add_value("shopParams", shopMes)
      items_loader.add_value("buy_sourse", buySourse)
      items_loader.add_value("user_comment", items)
      items_loader.add_value("good_sourse", shopSourse)
      items_loader.add_value("user_comment_Detail", ShopParameter)
      yield items_loader.load_item()
Example #11
 def parse_detail(self, response):
     article_item = QiushibaikeItem()
      # Load the item with an ItemLoader
      front_image_url = response.meta.get("front_image_url", "")  # article cover image
      flagTrue = response.meta.get("flag", "")  # flag
      original = "http://www.tuicool.com/" + response.css("span.from a::attr(href)").extract_first("")
     item_loader = ArticleItemLoader(item=QiushibaikeItem(), response=response)
     item_loader.add_css("title", ".article_row_fluid div:nth-child(1) h1::text")
     item_loader.add_value("url", response.url)
     item_loader.add_value("url_object_id", get_md5(response.url))
     item_loader.add_css("create_date", "span.timestamp::text")
     item_loader.add_value("front_image_url", [front_image_url])
     item_loader.add_value("sites", original)
     item_loader.add_value("flag", flagTrue)
     item_loader.add_css("original", "div.source a::text")
     item_loader.add_css("tags", "span.new-label::text")
     item_loader.add_css("content", "div.article_body")
     article_item = item_loader.load_item()
     yield article_item
Example #12
    def parse_detail(self, response):

        # Load the item via an ItemLoader
        front_image_url = response.meta.get('front_image_url', '')
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                        response=response)
        item_loader.add_css('title', '.entry-header h1::text')
        item_loader.add_value('url_object_id', get_md5(response.url))
        item_loader.add_css('create_date', '.entry-meta-hide-on-mobile::text')
        item_loader.add_value('url', response.url)
        item_loader.add_value('front_image_url', [front_image_url])
        item_loader.add_css('praise_num', '.vote-post-up h10::text')
        item_loader.add_css('fav_num', '.bookmark-btn::text')
        item_loader.add_css('com_num', "a[href='#article-comment'] span::text")
        item_loader.add_css('content', 'div.entry')
        item_loader.add_css('tags', 'p.entry-meta-hide-on-mobile a::text')

        article_item = item_loader.load_item()

        yield article_item
Example #13
    def parse_question(self, response):
        zhihu_id = response.meta.get("question_id", "")
        question_item_loader = ArticleItemLoader(item=ZhihuQuestionItem(),
                                                 response=response)
        question_item_loader.add_css("title", "h1.QuestionHeader-title::text")
        question_item_loader.add_css("content", ".QuestionHeader-detail")
        question_item_loader.add_value("url", response.url)
        question_item_loader.add_value("zhihu_id", zhihu_id)
        question_item_loader.add_css("answer_num",
                                     ".List-headerText span::text")
        question_item_loader.add_css("comments_num",
                                     ".QuestionHeader-Comment button::text")
        question_item_loader.add_css("watch_user_num",
                                     ".NumberBoard-itemValue::text")
        question_item_loader.add_css(
            "topics", ".QuestionHeader-topics .Popover div::text")
        question_item = question_item_loader.load_item()

        yield scrapy.Request(self.start_answer_url.format(zhihu_id, 20, 0),
                             headers=self.headers,
                             callback=self.parse_answer)

        yield question_item
Example #14
    def parse(self, response):
        if response.status != 200 or len(response.text) == 0:
            logger.error('Current response is invalid')
            return
        # Parse the article entries out of the current response
        ArticlePage = response.xpath(
            '//div[@class="grid-8"]/div[@class="post floated-thumb"]')

        if ArticlePage is not None:
            logger.info(
                'Current article list len is {0} and type is {1}'.format(
                    len(ArticlePage), type(ArticlePage)))

        # Parse each entry
        for Article in ArticlePage:
            itemloader = ArticleItemLoader(response=response,
                                           item=LpythonspiderItem())

            articlethumb = Article.xpath(
                './div[@class="post-thumb"]/a/img/@src').extract()
            if not articlethumb:
                logger.info('current article has no cover')
                itemloader.add_value('thumb', 'No cover')
            else:
                itemloader.add_value('thumb', articlethumb)
            articletitle = Article.xpath(
                './div[@class="post-meta"]/p/a[@class="archive-title"]/text()'
            ).extract()
            itemloader.add_value('title', articletitle)

            articledate = Article.xpath(
                './div[@class="post-meta"]/p/text()').extract()
            itemloader.add_value('date', articledate)

            articletype = Article.xpath(
                './div[@class="post-meta"]/p/a[@rel="category tag"]/text()'
            ).extract()
            itemloader.add_value('type', articletype)

            articlesummary = Article.xpath(
                './div[@class="post-meta"]/span[@class="excerpt"]/p/text()'
            ).extract()
            itemloader.add_value('summary', articlesummary)

            articlelink = Article.xpath(
                './div[@class="post-meta"]/p/a[@class="archive-title"]/@href'
            ).extract()
            itemloader.add_value('link', articlelink)

            # articlelink is a list returned by extract(); hash the first url if present
            articleobjectid = common.get_md5(articlelink[0]) if articlelink else ''
            itemloader.add_value('object_id', articleobjectid)
            yield itemloader.load_item()

        if self.already_push_all_request is not True:
            page_list_html_a = response.xpath(
                '//div[@class="grid-8"]/div[@class="navigation margin-20"]/a[@class="page-numbers"]'
            )
            last_page_list_html_a = page_list_html_a[-1]
            last_page_index = last_page_list_html_a.xpath(
                'text()').extract_first()
            print(type(last_page_index))
            last_index_number = int(last_page_index)
            print(last_index_number)
            format_url = 'http://python.jobbole.com/all-posts/page/{0}/'
            next_page_index = 2
            while next_page_index <= last_index_number:
                next_page_request_url = format_url.format(next_page_index)
                print(' will lpush to redis and url is %s' %
                      next_page_request_url)
                yield Request(url=next_page_request_url)
                next_page_index += 1
            self.already_push_all_request = True
Example #15
    def parse_detail(self, response):
        article_item = JobBoleArticleItem()

        # Extract the article's fields
        # title = response.xpath('//div[@class="entry-header"]/h1/text()').extract_first("")
        # create_date = response.xpath("//p[@class='entry-meta-hide-on-mobile']/text()").extract()[0].strip().replace("·","").strip()
        # praise_nums = response.xpath("//span[contains(@class, 'vote-post-up')]/h10/text()").extract()[0]
        # fav_nums = response.xpath("//span[contains(@class, 'bookmark-btn')]/text()").extract()[0]
        # match_re = re.match(".*?(\d+).*", fav_nums)
        # if match_re:
        #     fav_nums = match_re.group(1)
        #
        # comment_nums = response.xpath("//a[@href='#article-comment']/span/text()").extract()[0]
        # match_re = re.match(".*?(\d+).*", comment_nums)
        # if match_re:
        #     comment_nums = match_re.group(1)
        #
        # content = response.xpath("//div[@class='entry']").extract()[0]
        #
        # tag_list = response.xpath("//p[@class='entry-meta-hide-on-mobile']/a/text()").extract()
        # tag_list = [element for element in tag_list if not element.strip().endswith("评论")]
        # tags = ",".join(tag_list)

        # Extract the fields with CSS selectors
        # front_image_url = response.meta.get("front_image_url", "")  # article cover image
        # title = response.css(".entry-header h1::text").extract()[0]
        # create_date = response.css("p.entry-meta-hide-on-mobile::text").extract()[0].strip().replace("·","").strip()
        # praise_nums = response.css(".vote-post-up h10::text").extract()[0]
        # fav_nums = response.css(".bookmark-btn::text").extract()[0]
        # match_re = re.match(".*?(\d+).*", fav_nums)
        # if match_re:
        #     fav_nums = int(match_re.group(1))
        # else:
        #     fav_nums = 0
        #
        # comment_nums = response.css("a[href='#article-comment'] span::text").extract()[0]
        # match_re = re.match(".*?(\d+).*", comment_nums)
        # if match_re:
        #     comment_nums = int(match_re.group(1))
        # else:
        #     comment_nums = 0
        #
        # content = response.css("div.entry").extract()[0]
        #
        # tag_list = response.css("p.entry-meta-hide-on-mobile a::text").extract()
        # tag_list = [element for element in tag_list if not element.strip().endswith("评论")]
        # tags = ",".join(tag_list)
        #
        # article_item["url_object_id"] = get_md5(response.url)
        # article_item["title"] = title
        # article_item["url"] = response.url
        # try:
        #     create_date = datetime.datetime.strptime(create_date, "%Y/%m/%d").date()
        # except Exception as e:
        #     create_date = datetime.datetime.now().date()
        # article_item["create_date"] = create_date
        # article_item["front_image_url"] = [front_image_url]
        # article_item["praise_nums"] = praise_nums
        # article_item["comment_nums"] = comment_nums
        # article_item["fav_nums"] = fav_nums
        # article_item["tags"] = tags
        # article_item["content"] = content

        # Load the item with an ItemLoader
        front_image_url = response.meta.get("front_image_url", "")  # article cover image
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                        response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("comment_nums",
                            "a[href='#article-comment'] span::text")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        item_loader.add_css("content", "div.entry")

        article_item = item_loader.load_item()

        yield article_item
Example #16
    def parse_detail(self, response):
        '''
        Parse the article detail page
        :param response:
        :return:
        '''

        # Instantiate the item
        article_item = JobBoleArticleItem()
        # Fetch the data with XPath
        # #// *[ @ id = "post-114041"] / div[1] / h1
        # title = response.xpath("//*[@id='post-114041']/div/h1/text()").extract_first("")
        # create_date = response.xpath("//p[@class='entry-meta-hide-on-mobile']/text()").extract_first("").strip().replace("·","").strip()
        # praise_nums = response.xpath("//span[contains(@class,'vote-post-up')]/h10/text()").extract_first("")
        # fav_nums = response.xpath("//span[contains(@class,'bookmark-btn')]/text()").extract_first("")
        # match_re = re.match(".*?(\d+).*", fav_nums)
        # if match_re:
        #     fav_nums = match_re.group(1)
        # comment_nums = response.xpath("//a[href='#article-comment']/span/text()").extract_first("")
        # match_re = re.match(".*?(\d+).*", comment_nums)
        # if match_re:
        #     comment_nums = match_re.group(1)
        # content = response.xpath("//div[class='entry']").extract()extract_first("")
        # tag_list = response.xpath("//p[@class='entry-meta-hide-on-mobile']/a/text()").extract_first("")
        # tag_list = [element for element in tag_list if not element.strip().endswith("评论")]
        # tags = ",".join(tag_list)

        # Fetch the data with CSS selectors

        # extract_first(): extract() returns a list which may be empty, so indexing [0] can raise; extract_first() lets you supply a default when nothing is found
        # title = response.css(".entry-header h1::text").extract_first("")
        # create_date = response.css(".entry-meta-hide-on-mobile::text").extract_first("").strip().replace("·","").strip()
        # praise_nums = response.css(".vote-post-up h10::text").extract_first("")
        # fav_nums = response.css(".bookmark-btn::text").extract_first("")
        # match_re = re.match(".*?(\d+).*", fav_nums)
        # if match_re:
        #     fav_nums = int(match_re.group(1))
        # else:
        #     fav_nums = 0
        # comment_nums = response.css("a[href='#article-comment'] span::text").extract_first("")
        # match_re = re.match(".*?(\d+).*", comment_nums)
        # if match_re:
        #     comment_nums = int(match_re.group(1))
        # else:
        #     comment_nums = 0
        # content = response.css("div.entry").extract_first("")
        # tag_list = response.css("p.entry-meta-hide-on-mobile a::text").extract()
        # tag_list = [element for element in tag_list if not element.strip().endswith("评论")]
        # tags = ",".join(tag_list)

        front_image_url = response.meta.get("front_image_url", "")

        # article_item["title"] = title
        # article_item["url"] = response.url
        # try:
        #     create_date = datetime.datetime.strptime(create_date, "%Y/%m/%d").date()
        # except Exception as e:
        #     create_date = datetime.datetime.now().date()
        # article_item["create_date"] = create_date
        # article_item["front_image_url"] = [front_image_url]
        # article_item["praise_nums"] = praise_nums
        # article_item["comment_nums"] = comment_nums
        # article_item["fav_nums"] = fav_nums
        # article_item["tags"] = tags
        # article_item["content"] = content
        # article_item["url_object_id"] = get_md5(response.url)
        # article_item["front_image_path"]
        # Load the item with an ItemLoader
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                        response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_css("create_date", ".entry-meta-hide-on-mobile::text")
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_css("comment_nums",
                            "a[href='#article-comment'] span::text")
        item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        item_loader.add_css("content", "div.entry")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_value("front_image_url", [front_image_url])

        article_item = item_loader.load_item()
        yield article_item
Example #17
    def parse_detail(self, response):
        # Extract the article's fields (XPath)
#         title = response.xpath('//*[@id="post-113789"]/div[1]/h1/text()').extract()[0]
#         
#         create_date = response.xpath('//*[@id="post-113789"]/div[2]/p/text()[1]').extract()[0].strip().replace('·', '').strip()
#         
#         praise_nums = response.xpath('//*[@id="113789votetotal"]/text()').extract()
#         if praise_nums:
#             praise_nums = int(praise_nums[0])
#         else:
#             praise_nums = 0
#         
#         fav_nums = response.xpath('//*[@id="post-113789"]/div[3]/div[12]/span[2]/text()').extract()[0]
#         match_re = re.match(r'.*?(\d+).*', fav_nums)
#         if match_re:
#             fav_nums = int(match_re.group(1))
#         else:
#             fav_nums = 0
#         
#         comment_nums = response.xpath('//*[@id="post-113789"]/div[3]/div[12]/a/span/text()').extract()[0]
#         match_re = re.match(r'.*?(\d+).*', comment_nums)
#         if match_re:
#             comment_nums = int(match_re.group(1))
#         else:
#             comment_nums = 0
#         
#         content = response.xpath('//*[@id="post-113789"]/div[3]').extract()[0]
#         
#         tag_list = response.xpath('//*[@id="post-113789"]/div[2]/p/a/text()').extract()
#         tag_list = [element for element in tag_list if not element.strip().endswith('评论')] 
#         tags = ','.join(tag_list)
        
        
        
        # The fields below are extracted with CSS selectors
#         article_item = JobboleArticleItem()  # instantiate
#         
#         front_image_url = response.meta.get('front_image_url', '')  # get the value for key 'front_image_url'; returns '' (empty) if the key is missing
#         # article cover image
#         
#         title = response.css('.entry-header h1::text').extract()[0]
#         
#         create_date = response.css('p.entry-meta-hide-on-mobile::text').extract()[0].strip().replace('·', '').strip()
#         
#         praise_nums = response.css('.vote-post-up h10::text').extract_first()
#         if praise_nums:
#             praise_nums = int(praise_nums[0])
#         else:
#             praise_nums = 0
#         
#         fav_nums = response.css('.bookmark-btn::text').extract()[0]
#         match_re = re.match(r'.*?(\d+).*', fav_nums)
#         if match_re:
#             fav_nums = int(match_re.group(1))
#         else:
#             fav_nums = 0
#         
#         comment_nums = response.css("a[href='#article-comment'] span::text").extract()[0]
#         match_re = re.match(r'.*?(\d+).*', comment_nums)
#         if match_re:
#             comment_nums = int(match_re.group(1))
#         else:
#             comment_nums = 0
#         
#         content = response.css("div.entry").extract()[0]
#         
#         tag_list = response.css("p.entry-meta-hide-on-mobile a::text").extract()
#         tag_list = [element for element in tag_list if not element.strip().endswith('评论')] 
#         tags = ','.join(tag_list)
#         
#         # Fill the values into the item
#         article_item['title'] = title
#         article_item['url'] = response.url
#         article_item['url_object_id'] = get_md5(response.url)  # MD5 of the url
#         
#         try:  # to write the article's creation time to the database, convert create_date from str to a date
#             create_date = datetime.datetime.strptime(create_date, '%Y/%m/%d').date()  # convert a '%Y/%m/%d' string to a date
#         except Exception as e:
#             create_date = datetime.datetime.now().date()
#         article_item['create_date'] = create_date
#         
#         article_item['front_image_url'] = [front_image_url]  # the images pipeline expects a list
#         article_item['praise_nums'] = praise_nums
#         article_item['fav_nums'] = fav_nums
#         article_item['comment_nums'] = comment_nums
#         article_item['tags'] = tags
#         article_item['content'] = content
        
        # Load the item with an ItemLoader
        front_image_url = response.meta.get('front_image_url', '')  # get the value for key 'front_image_url'; returns '' (empty) if the key is missing
        #item_loader = ItemLoader(item=JobboleArticleItem(), response=response)  # plain ItemLoader instance
        item_loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)  # switched to the custom ItemLoader
#         ItemLoader.add_css(self, field_name, css)
#         ItemLoader.add_xpath(self, field_name, xpath)
#         ItemLoader._add_value(self, field_name, value)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        item_loader.add_css("content", "div.entry")
        
        article_item = item_loader.load_item()
        # Using the default ItemLoader behaviour has two problems: 1. every value is a list  2. the extracted values still need processing (regex extraction, etc.)
        # --> fix this in items.py: 1. apply TakeFirst in the Field() definitions  2. apply MapCompose in the Field() definitions (see the sketch after this example)
        
        yield article_item  # after yield, the item is passed on to pipelines.py
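
The comment at the end of Example #17 describes moving the per-field cleanup into items.py with MapCompose and TakeFirst. A hedged sketch of such an items.py excerpt (the field names follow the examples above, but only a subset is shown and the date-parsing helper is an assumption):

    # Hypothetical items.py excerpt: processors attached to Field() definitions
    # are picked up automatically by ItemLoader.
    import datetime

    import scrapy
    from scrapy.loader.processors import MapCompose, TakeFirst  # itemloaders.processors on newer Scrapy

    def parse_create_date(value):
        # Fall back to today's date when the scraped string is not a clean '%Y/%m/%d' date.
        try:
            return datetime.datetime.strptime(
                value.strip().replace('·', '').strip(), '%Y/%m/%d').date()
        except Exception:
            return datetime.datetime.now().date()

    class JobboleArticleItem(scrapy.Item):
        title = scrapy.Field(output_processor=TakeFirst())
        create_date = scrapy.Field(
            input_processor=MapCompose(parse_create_date),
            output_processor=TakeFirst(),
        )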
Example #18
    def parse_detail(self, response):
        """
        提取文章的具体字段, 回调函数
        :param response:
        :return:
        """
        article_item = JobBoleArticleItem()
        # article cover image
        front_image_url = response.meta.get("front_image_url", "")

        """
        '''
        # Approach 1: [extract the fields with XPath]
        # You can copy the XPath from the browser's "inspect" panel
        # Chrome and Firefox may return different paths. Sometimes the copied path fetches nothing, because the browser shows the rendered (dynamic) html rather than the original html
        
        # title
        # extract_first() is just extract()[0], and it also accepts a default value
        title = response.xpath('//div[@class="entry-header"]/h1/text()').extract_first("")
        # creation time
        create_date = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/text()').extract_first("").strip().replace("·", "").strip()
        # number of likes
        vote_numbers = int(response.xpath('//span[contains(@class, "vote-post-up")]/h10/text()').extract_first(""))
        # number of favorites
        bookmark_numbers = response.xpath('//span[contains(@class, "bookmark-btn")]/text()').extract_first("")
        match_re = re.match(r".*?(\d+).*", bookmark_numbers)
        if match_re:
            bookmark_numbers = match_re.group(1)
        # number of comments
        comment_numbers = response.xpath("//a[@href='#article-comment']/span/text()").extract_first("")
        match_re = re.match(r".*?(\d+).*", comment_numbers)
        if match_re:
            comment_numbers = match_re.group(1)
        # article body (keep the whole html block instead of just the text)
        content = response.xpath("//div[@class='entry']").extract_first("")
        # tags
        tags = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/a/text()').extract()
        tags = ','.join([item for item in tags if not item.strip().endswith('评论')])
        '''
        # Approach 2: [extract the fields with CSS selectors]
        # title
        # ::text selects the text node
        title = response.css(".entry-header h1::text").extract_first("")
        # creation time
        create_date = response.css('p.entry-meta-hide-on-mobile::text').extract_first("").strip().replace("·","").strip()
        # number of likes
        vote_numbers = int(response.css(".vote-post-up h10::text").extract_first(""))
        # number of favorites
        bookmark_numbers = response.css(".bookmark-btn::text").extract_first("")
        match_re = re.match(r".*?(\d+).*", bookmark_numbers)
        if match_re:
            bookmark_numbers = int(match_re.group(1))
        else:
            bookmark_numbers = 0
        # number of comments
        comment_numbers = response.css("a[href='#article-comment'] span::text").extract_first("")
        match_re = re.match(r".*?(\d+).*", comment_numbers)
        if match_re:
            comment_numbers = int(match_re.group(1))
        else:
            comment_numbers = 0
        # article body (keep the whole html block instead of just the text)
        content = response.css("div.entry").extract_first("")
        # tags
        tags = response.css('p.entry-meta-hide-on-mobile a::text').extract()
        tags = ','.join([item for item in tags if not item.strip().endswith('评论')])

        # Assign the values to build the item
        article_item['url_object_id'] = get_md5(response.url)
        article_item['title'] = title
        article_item['url'] = response.url
        try:
            create_date = datetime.datetime.strptime(create_date, "%Y/%m/%d").date()
        except Exception as e:
            create_date = datetime.datetime.now().date()
        article_item['create_date'] = create_date
        # scrapy's image downloader expects a list
        article_item['front_image_url'] = [front_image_url]
        article_item['vote_numbers'] = vote_numbers
        article_item['bookmark_numbers'] = bookmark_numbers
        article_item['comment_numbers'] = comment_numbers
        article_item['tags'] = tags
        article_item['content'] = content
        """

        # Load the item with an ItemLoader
        # cleaner and more configurable
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_css("vote_numbers", ".vote-post-up h10::text")
        item_loader.add_css("bookmark_numbers", ".bookmark-btn::text")
        item_loader.add_css("comment_numbers", "a[href='#article-comment'] span::text")
        item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        item_loader.add_css("content", "div.entry")

        article_item = item_loader.load_item()

        yield article_item
Example #19
    def parse_detail(self, response):
        article_item = JobBoleArticleItem()

        # XPath selectors: extract the article's fields
        # title = response.xpath('//div[@class="entry-header"]/h1/text()').extract_first()
        # creat_data = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/text()').extract_first().strip().replace("·", "")
        # praise_nums = response.xpath('//span[contains(@class,"vote-post-up")]/h10/text()').extract_first()
        # fav_nums = response.xpath('//span[contains(@class,"bookmark-btn ")]/text()').extract_first()
        # match_re = re.match(".*?(\d+).*", "fav_nums")
        # if match_re:
        #     fav_nums = match_re.group(1)
        # comment_nums = response.xpath('//a[@href="#article-comment"]/span/text()').extract_first()
        # match_re = re.match(".*?(\d+).*", "comment_nums")
        # if match_re:
        #     comment_nums = match_re.group(1)
        # content = response.xpath('//div[@class="entry"]').extract_first()
        # tag_list = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/a/text()').extract()
        # # endswith() checks whether a string ends with the given suffix, returning True if it does and False otherwise
        # tag_list = [element for element in tag_list if not element.strip().endswith("评论")]
        # tags = ",".join(tag_list)  # .join() 连接字符串数组,将字符串、元组、列表中的元素以指定的字符(分隔符)连接生成一个新的字符串

        # # CSS selectors
        # title = response.css('.entry-header h1::text').extract_first("")
        # create_data = response.css('p.entry-meta-hide-on-mobile ::text').extract_first("").strip().replace("·", "")
        # praise_nums = response.css('.vote-post-up h10::text').extract_first("")
        # fav_nums = response.css('.bookmark-btn ::text').extract_first("")
        # match_re = re.match(".*?(\d+).*", "fav_nums")
        # if match_re:
        #     fav_nums = int(match_re.group(1))
        # else:
        #     fav_nums = 0
        # comment_nums = response.css('a[href="#article-comment"] ::text').extract_first("")
        # match_re = re.match(".*?(\d+).*", "comment_nums")
        # if match_re:
        #     comment_nums = int(match_re.group(1))
        # else:
        #     comment_nums = 0
        #
        # content = response.css('.entry').extract_first("")
        # tag_list = response.css('.entry-meta-hide-on-mobile a::text').extract()
        # tag_list = [element for element in tag_list if not element.strip().endswith("评论")]
        # tags = ",".join(tag_list)
        #
        # article_item["title"] = title
        # article_item["url"] = response.url
        # article_item["url_object_id"] = get_md5(response.url)
        # # try:
        # #     create_data = datetime.datetime.strptime(create_data, "%Y/%m/%d").date()
        # # except Exception as e:
        # #     create_data = datetime.datetime.now().date()
        # article_item["create_data"] = create_data
        # article_item["praise_nums"] = praise_nums
        # article_item["fav_nums"] = fav_nums
        # article_item["front_image_url"] = [front_image_url]
        # article_item["comment_nums"] = comment_nums
        # article_item["content"] = content
        # article_item["tags"] = tags

        # Load the item with an ItemLoader
        front_image_url = response.meta.get("front_image_url", "")  # article cover image
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("create_data", "p.entry-meta-hide-on-mobile ::text")
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("fav_nums", ".bookmark-btn ::text")
        item_loader.add_css("comment_nums", "a[href='#article-comment'] ::text")
        item_loader.add_css("tags", ".entry-meta-hide-on-mobile a::text")
        item_loader.add_css("content", ".entry")
        article_item = item_loader.load_item()
        yield article_item
Example #20
    def parse_detail(self, response):
        # Extract the article's fields
        article_item = JobboleArticleItem()

        # cover image
        image = response.meta.get("front_img", "")
        #
        # # title
        # title = response.xpath("//div[@class='entry-header']/h1/text()")
        # title_result = title.extract_first("")
        # # creation time
        # create_date = response.xpath("//p[@class='entry-meta-hide-on-mobile']/text()")
        # create_date_result = create_date.extract()[0].replace("·", "").strip()
        # # number of likes
        # praise_num = response.xpath("//span[contains(@class,'vote-post-up')]/h10/text()")
        # praise_num_result = int(praise_num.extract()[0])
        # # number of favorites
        # fav_num = response.xpath("//span[contains(@class,'bookmark-btn')]/text()")
        # match_re = re.match(r".*?(\d+).*", fav_num.extract()[0])
        # if match_re:
        #     fav_num = int(match_re.group(1))
        # else:
        #     fav_num = 0
        #
        # # number of comments
        # comment_num = response.xpath("//a[@href='#article-comment']/span/text()")
        # match_re = re.match(".*?(\d+).*", comment_num.extract()[0])
        # if match_re:
        #     comment_num = int(match_re.group(1))
        # else:
        #     comment_num = 0
        # # article body
        # # article_content=response.xpath("//")
        # content = response.xpath("//div[@class='entry']//text()").extract()
        # content_data = response.xpath("//p[@class='entry-meta-hide-on-mobile']/a/text()").extract()
        # content_key = [content_key for content_key in content_data if not content_key.strip().endswith("评论")]
        # content_keys = ','.join(content_key)
        #
        # article_item['url'] = response.url
        # article_item['url_object_id'] = get_md5(response.url)
        # article_item['front_img_url'] = [image]
        # article_item['title'] = title_result
        # try:
        #     create_date_result = datetime.datetime.strptime(create_date_result, '%Y/%m/%d').date()
        # except Exception as e:
        #     create_date_result = datetime.datetime.now()
        # article_item['create_time'] = create_date_result
        # article_item['praise_num'] = praise_num_result
        # article_item['fav_num'] = fav_num
        # article_item['comment_num'] = comment_num
        # article_item['content'] = content
        # article_item['tags'] = content_keys

        # Load the item with an ItemLoader
        item_loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)
        item_loader.add_value('url', response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_value("front_img_url", [image])
        item_loader.add_xpath("title", "//div[@class='entry-header']/h1/text()")
        item_loader.add_xpath("create_time", "//p[@class='entry-meta-hide-on-mobile']/text()")
        item_loader.add_xpath("praise_num", "//span[contains(@class,'vote-post-up')]/h10/text()")
        item_loader.add_xpath("fav_num", "//span[contains(@class,'bookmark-btn')]/text()")
        item_loader.add_xpath("comment_num", "//a[@href='#article-comment']/span/text()")
        item_loader.add_xpath("content", "//div[@class='entry']//text()")
        item_loader.add_xpath("tags", "//p[@class='entry-meta-hide-on-mobile']/a/text()")

        article_item= item_loader.load_item()

        yield article_item