Example #1
    def parse_item(self, response):
        title = response.xpath('//div[@class="article"]/h1/text()').get()
        avatar = response.xpath('//div[@class="author"]/a/img/@src').get()
        author = response.xpath('//div[@class="info"]/span/a/text()').get()
        pub_time = response.xpath('//div[@class="meta"]/span[@class="publish-time"]/text()').get()
        # Drop the query string, then take the last path segment as the article id
        url = response.url
        url1 = url.split('?')[0]
        article_id = url1.split('/')[-1]
        content = response.xpath('//div[@class="show-content-free"]').get()
        word_count = response.xpath('//span[@class="wordage"]/text()').get()
        read_count = response.xpath('//span[@class="views-count"]/text()').get()
        like_count = response.xpath('//span[@class="likes-count"]/text()').get()
        subjects = ''.join(response.xpath('//div[@class="include-collection"]/a/div[@class="name"]/text()').getall())

        item = ArticleItem(
            title=title,
            avatar=avatar,
            author=author,
            pub_time=pub_time,
            article_id=article_id,
            content=content,
            origin_url=response.url,
            subjects=subjects,
            word_count=word_count,
            read_count=read_count,
            like_count=like_count
        )
        yield item
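Most of these examples construct the same ArticleItem. As a reference point, here is a minimal sketch of the item definition the constructor calls imply (an assumption; individual projects add or drop fields such as img, subject, or commit_count):

    import scrapy

    class ArticleItem(scrapy.Item):  # assumed definition; see each project's items.py
        title = scrapy.Field()
        avatar = scrapy.Field()
        author = scrapy.Field()
        pub_time = scrapy.Field()
        article_id = scrapy.Field()
        content = scrapy.Field()
        origin_url = scrapy.Field()
        subjects = scrapy.Field()
        word_count = scrapy.Field()
        read_count = scrapy.Field()
        like_count = scrapy.Field()
        comment_count = scrapy.Field()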
Example #2
File: js.py Project: jokerix/demo
    def parse_detail(self, response):
        title = response.xpath("//h1[@class='title']/text()").get()
        avatar = response.xpath("//a[@class='avatar']/img/@src").get()
        author = response.xpath("//span[@class='name']/a/text()").get()
        # .get() may return None if the node is missing, so guard before .replace()
        pub_time = (response.xpath("//span[@class='publish-time']/text()").get() or "").replace("*", "")
        url = response.url
        url1 = url.split("?")[0]
        article_id = url1.split('/')[-1]
        content = response.xpath("//div[@class='show-content']").get()

        word_count = response.xpath("//span[@class='wordage']/text()").get()
        comment_count = response.xpath("//span[@class='comments-count']/text()").get()
        read_count = response.xpath("//span[@class='views-count']/text()").get()
        like_count = response.xpath("//span[@class='likes-count']/text()").get()
        subjects = '.'.join(response.xpath("//div[@class='include-collection']/a/div/text()").getall())

        item = ArticleItem(
            title=title,
            avatar=avatar,
            author=author,
            pub_time=pub_time,
            origin_url=response.url,
            article_id=article_id,
            content=content,
            subjects=subjects,
            word_count=word_count,
            comment_count=comment_count,
            read_count=read_count,
            like_count=like_count
        )
        yield item
Example #3
    def parse_detail(self, response):
        item = ArticleItem()
        # item['domain_id'] = response.xpath('//input[@id="sid"]/@value').get()
        # item['name'] = response.xpath('//div[@id="name"]').get()
        # item['description'] = response.xpath('//div[@id="description"]').get()
        item['title'] = response.xpath(
            "//div[@class='article']/h1/text()").get()
        item['author'] = response.xpath(
            "//div[@class='info']/span[@class='name']/a/text()").get()
        item['avatar'] = response.xpath(
            "//div[@class='author']/a[@class='avatar']/img/@src").get()
        item['content'] = response.xpath(
            "//div[@class='show-content-free']").get()
        # .get() may return None if the node is missing, so guard before .replace()
        item['pub_time'] = (response.xpath(
            "//span[@class='publish-time']/text()").get() or '').replace('*', '')

        # https://www.jianshu.com/p/f0c5934b5d3f?utm_campaign=maleskine&utm_content=note&utm_medium=seo_notes&utm_source=recommendation
        # https://www.jianshu.com/p/f03e0080e3e5
        item['article_id'] = response.url.split('?')[0].split('/')[-1]
        item['origin_url'] = response.url
        item['word_count'] = response.xpath(
            "//span[@class='wordage']/text()").get()
        item['read_count'] = response.xpath(
            "//span[@class='views-count']/text()").get()
        item['like_count'] = response.xpath(
            "//span[@class='likes-count']/text()").get()
        item['comment_count'] = response.xpath(
            "//span[@class='comments-count']/text()").get()
        subject = response.xpath(
            "//div[@class='include-collection']/a//div[contains(@class,'name')]/text()"
        ).getall()
        item['subject'] = ','.join(subject)

        print(item)
        yield item
Example #4
    def parse_detail(self, response):
        title = response.xpath("//section[@class='ouvJEz']/h1[@class='_1RuRku']/text()").get()
        # Indexing into all <span> text nodes is brittle; it breaks if the page layout changes
        author = response.xpath("//span/text()").getall()[3]
        content = response.xpath("//article[@class='_2rhmJa']").getall()
        item = ArticleItem(
            title=title,
            author=author,
            content=content
        )
        yield item
Example #5
    def parse_detail(self, response):
        # Extract the article fields
        title = response.xpath("//h1[@class='_1RuRku']/text()").get()
        author = response.xpath("//span[@class='FxYr8x']/a/text()").get()
        img = response.xpath("//a[@class='_1OhGeD']/img/@src").get()
        pub_time = response.xpath("//div[@class='s-dsoj']/time/text()").get()
        origin_url = response.url
        article_id = response.url.split("/")[-1]
        content = response.xpath("//article[@class='_2rhmJa']").get()

        item = ArticleItem(
            title=title,
            author=author,
            img=img,
            pub_time=pub_time,
            article_id=article_id,
            content=content,
            origin_url=origin_url
        )

        yield item
Example #6
    def parse_detail(self, response):
        title = response.xpath("//h1[@class='title']/text()").get()  # title
        avatar = response.xpath("//a[@class='avatar']/img/@src").get()  # avatar URL
        author = response.xpath("//span[@class='name']/a/text()").get()  # author
        pub_time = response.xpath(
            "//span[@class='publish-time']/text()").get()  # publish time
        origin_url = response.url  # the article id is parsed out of the article URL
        url1 = origin_url.split("?")[0]  # split off the query string
        article_id = url1.split("/")[-1]  # the last path segment is the article id
        content = response.xpath(
            "//div[@class='show-content']").get()  # article body (HTML tags preserved)

        # The following counts are rendered from AJAX data
        word_count = response.xpath(
            "//span[@class='wordage']/text()").get()  # word count
        comment_count = response.xpath(
            "//span[@class='comments-count']/text()").get()  # comment count
        like_count = response.xpath(
            "//span[@class='likes-count']/text()").get()  # like count
        read_count = response.xpath(
            "//span[@class='views-count']/text()").get()  # read count
        subjects = response.xpath(
            "//div[@class='include-collection']/a/div/text()").getall(
            )  # collection (subject) info, returned as a list
        subjects = ",".join(subjects)

        item = ArticleItem(title=title,
                           avatar=avatar,
                           author=author,
                           pub_time=pub_time,
                           origin_url=origin_url,
                           article_id=article_id,
                           content=content,
                           subjects=subjects,
                           word_count=word_count,
                           comment_count=comment_count,
                           read_count=read_count,
                           like_count=like_count)
        yield item
Example #7
File: js.py Project: Samantha09/Scrapy
    def parse_item(self, response):
        # Note: this snippet assumes `import re` at module level
        title = response.xpath("//h1[@class='title']/text()").get()
        avatar = response.xpath("//a[@class='avatar']/img/@src").get()
        author = response.xpath("//span[@class='name']/a/text()").get()
        pub_time = response.xpath("//span[@class='publish-time']/text()").get()
        url = response.url
        # The article id is the 12-character slug after /p/, with or without a query string
        article_id = re.match(r'.*?/p/([0-9a-z]{12})[\?]*.*?',
                              response.url).group(1)
        # Strip HTML tags to keep plain text; guard against a missing node
        content = response.xpath("//div[@class='show-content']").get() or ""
        content = re.sub(r"<.*?>", "", content).strip()

        word_count = response.xpath("//span[@class='wordage']/text()").get()
        comment_count = response.xpath(
            "//span[@class='comments-count']/text()").get()
        like_count = response.xpath(
            "//span[@class='likes-count']/text()").get()
        read_count = response.xpath(
            "//span[@class='views-count']/text()").get()

        subjects = ",".join(
            response.xpath(
                "//div[@class='include-collection']/a/div/text()").getall())

        item = ArticleItem(title=title,
                           avatar=avatar,
                           author=author,
                           origin_url=url,
                           pub_time=pub_time,
                           article_id=article_id,
                           content=content,
                           word_count=word_count,
                           comment_count=comment_count,
                           like_count=like_count,
                           read_count=read_count,
                           subjects=subjects)
        yield item
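For reference, the regex above accepts both URL shapes seen in the comments of Example #3, with and without a query string (a quick check, assuming `import re`):

    pattern = r'.*?/p/([0-9a-z]{12})[\?]*.*?'
    re.match(pattern, 'https://www.jianshu.com/p/f0c5934b5d3f?utm_source=recommendation').group(1)  # 'f0c5934b5d3f'
    re.match(pattern, 'https://www.jianshu.com/p/f03e0080e3e5').group(1)  # 'f03e0080e3e5'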
Example #8
    def parse_item(self, response):
        '''
            scrapy shell url
            title = scrapy.Field()
            avatar = scrapy.Field()
            author = scrapy.Field()
            pub_time = scrapy.Field()
            article_id = scrapy.Field()
        '''
        title = response.xpath('//h1[@class="_1RuRku"]/text()').get()
        avatar = response.xpath('//meta[@property="og:image"]/@content').get()
        author = response.xpath('//span[@class="_22gUMi"]/text()').get()

        # TODO: extracting pub_time with a regex is unreliable, so it is skipped for now
        # pub_time = response.body.re(r'last_updated_at":(.*?),')

        url = response.url
        article_id = url.split('?')[0].split('/')[-1]
        content = ''.join(
            response.xpath('//article[@class="_2rhmJa"]//text()').getall())

        # Use the crawl time as a stand-in pub_time (needs `import time` at module level)
        now = int(round(time.time() * 1000))
        pub_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                 time.localtime(now / 1000))

        try:
            like_count = int(
                response.xpath('//span[@class="_1GPnWJ"]/text()').get())
            commit_count = int(
                response.xpath(
                    '//div[@class="_3nj4GN"]//span[1]/text()[2]').get())
            word_count = int(
                response.xpath(
                    '//div[@class="s-dsoj"]//span[2]/text()').get().replace(
                        ' ', '').replace(',', '').replace('字数', ''))  # '字数' is the "word count" label
            subjects = ','.join(
                response.xpath('//span[@class="_2-Djqu"]//text()').getall())
            read_count = int(
                response.xpath(
                    '//div[@class="s-dsoj"]//span[3]/text()').get().replace(
                        ' ', '').replace(',', '').replace('阅读', ''))  # '阅读' is the "reads" label

            item = ArticleItem(title=title,
                               avatar=avatar,
                               author=author,
                               pub_time=pub_time,
                               article_id=article_id,
                               origin_url=url,
                               content=content,
                               like_count=like_count,
                               commit_count=commit_count,
                               word_count=word_count,
                               subjects=subjects,
                               read_count=read_count)
            yield item
        except Exception:
            # Fall back to the core fields if any of the counts fails to parse
            print('Exception caught ' + '*' * 30)

            item = ArticleItem(title=title,
                               avatar=avatar,
                               author=author,
                               pub_time=pub_time,
                               article_id=article_id,
                               origin_url=url,
                               content=content)
            yield item
Example #9
    def parse(self, response):
        # Total number of articles
        article_count = response.css(
            'body > div.container.person > div > div.col-xs-16.main > div.main-top > div.info > ul > li:nth-child(3) > div > a > p::text').extract_first()
        # Number of pages at 9 articles per page (ceiling division)
        countPageNumber = (int(article_count) + 8) // 9
        # The article list
        articles = response.css('ul.note-list > li')
        for article in articles:
            articleItem = ArticleItem()
            # Author name
            articleItem['author_name'] = article.css('a.blue-link::text').extract_first()
            # Author avatar URL
            articleItem['author_image'] = 'http:' + article.css('div.author > a > img::attr(src)').extract_first()
            # Publish time
            articleItem['article_release_time'] = article.css('div.name > span.time::attr(data-shared-at)').extract_first()
            # Title
            articleItem['article_title'] = article.css('a.title::text').extract_first()
            # Abstract
            articleItem['article_desc'] = article.css('p.abstract::text').extract_first().strip()
            # Article link
            articleItem['article_link'] = JsmemberspiderSpider.jianshu + article.css('div.content > a::attr(href)').extract_first()
            # Read count, reply count, like count and reward count
            articleItem['read_count'] = article.css('div.meta > a')[0].css('::text').extract()[-1].strip()
            articleItem['reply_count'] = article.css('div.meta > a')[1].css('::text').extract()[-1].strip()
            articleItem['likeit_count'] = article.css('div.meta > span')[0].css('::text').extract_first().strip()
            articleItem['payit_count'] = article.css('div.meta > span')[1].css('::text').extract_first().strip() if len(article.css('div.meta > span')) >= 2 else 0
            JsmemberspiderSpider.article_all.append(articleItem)
            yield articleItem

        # Follow the next page; the crawl ends when the current page is the last one
        current_page = int(response.url.split('page=')[1])
        next_page = JsmemberspiderSpider.start_url.format(current_page + 1)
        if current_page == countPageNumber:
            next_page = None
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)
Example #10
    def parse_html(self, response):
        # ArticleItem is used through the ItemLoader API here (add_xpath/add_value/load_item),
        # so in this project it is presumably an ItemLoader subclass wrapping JianshuItem
        loader = ArticleItem(item=JianshuItem(), response=response)
        loader.add_xpath('title', "//div[@class='_gp-ck']//h1/text()")
        loader.add_xpath('num', "//div[@class='s-dsoj']/span[2]/text()")
        loader.add_xpath('look', "//div[@class='s-dsoj']/span[3]/text()")
        loader.add_xpath('author', "//span[@class='_22gUMi']/text()")
        loader.add_xpath('favor', "//span[@class='_1LOh_5']/text()")
        loader.add_xpath('time', "//time/text()")
        loader.add_xpath('content', "//article[@class='_2rhmJa']//text()")
        loader.add_value('url', response.url)

        article_item = loader.load_item()

        yield article_item
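A minimal sketch of what that loader class could look like (an assumption; the project's actual definition may differ). `TakeFirst` collapses each extracted list to its first value:

    from scrapy.loader import ItemLoader
    from itemloaders.processors import Join, TakeFirst

    class ArticleItem(ItemLoader):  # hypothetical definition matching the call above
        default_output_processor = TakeFirst()  # keep only the first match per field
        content_out = Join('')  # concatenate all text nodes of the article body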