Python ArticleItem示例

编程语言: Python

命名空间/包名称: mi.items

类/类型: ArticleItem

hotexamples.com的示例: 4

Python ArticleItem - 共找到 4 个示例。以下是从开源项目中提取的、评分最高的 mi.items.ArticleItem 真实 Python 代码示例。您可以为这些示例评分，以帮助我们提高示例质量。

常用方法

显示隐藏

ArticleItem(4)

常用方法

ArticleItem (4)

示例#1

显示文件

文件： spider_mengzichengcn.py 项目： zl1234/mi

    def processArticle(self, response):
        """Build an ArticleItem from the page at ``response.url``.

        The URL is first scored with ``calc_score``; URLs scoring below 3 are
        reported on stdout and produce no item.  Otherwise the article is
        extracted with ``Goose`` and, if the cleaned text comes back empty,
        extracted a second time using ``StopWordsChinese``.

        Args:
            response: object exposing a ``url`` attribute
                (presumably a Scrapy response -- confirm against the caller).

        Yields:
            A single ``ArticleItem`` with ``articleTitle``, ``articleUrl`` and
            ``articleContent`` set, or nothing when the score is too low or
            extraction fails.
        """
        url = response.url
        score = calc_score(url)

        # Guard clause: low-scoring URLs are only reported, never extracted.
        if score < 3:
            # Single-argument print(...) emits the same text on Python 2 and 3.
            print('this url maybe not a news_url, ' + ' score only ' + str(
                score))
            print('you can check this url: ' + url)
            return

        try:
            print('this url maybe a news_url')
            extractor = Goose()
            article = extractor.extract(url=url)
            title = article.title
            content = article.cleaned_text
            if not content:
                # Empty text from the default configuration: retry with the
                # Chinese stop-word class.  The title from the first pass is
                # kept, as in the original flow.
                print('news in chinese')
                extractor = Goose({'stopwords_class': StopWordsChinese})
                article = extractor.extract(url=url)
                content = article.cleaned_text
            item = ArticleItem()
            item['articleTitle'] = title
            item['articleUrl'] = url
            item['articleContent'] = content
        except Exception:
            # Deliberate best-effort: one bad page must not stop the crawl.
            # `Exception` (not a bare except) lets KeyboardInterrupt,
            # SystemExit and GeneratorExit propagate.
            self.logger.info('item in article failed')
            return

        # Yield outside the try so closing the generator, or an error thrown
        # in by the consumer, is not misreported as an extraction failure.
        yield item

示例#2

显示文件

 def processArticle(self, response):
     """Build an ArticleItem from an article page.

     The title is the first ``//h1/text()`` match and the content is the
     concatenation of every ``<p>`` text node under the
     ``article-content-wrap`` div.

     Args:
         response: object exposing ``xpath()`` and ``url``
             (presumably a Scrapy response -- confirm against the caller).

     Yields:
         A single ``ArticleItem`` with ``articleTitle``, ``articleUrl`` and
         ``articleContent`` set, or nothing when extraction fails (for
         example when the title query has no match and ``[0]`` raises).
     """
     try:
         title = response.xpath('''//h1/text()''').extract()[0]
         paragraphs = response.xpath(
             '''//div[@class='article-content-wrap']//p/text()''').extract()
         item = ArticleItem()
         item['articleTitle'] = title
         # The original joined the URL's characters with '', which is a
         # no-op for a string; assign it directly.
         item['articleUrl'] = response.url
         item['articleContent'] = ''.join(paragraphs)
     except Exception:
         # Deliberate best-effort: skip pages that do not match the layout.
         # `Exception` (not a bare except) lets KeyboardInterrupt,
         # SystemExit and GeneratorExit propagate.
         self.logger.info('item in article failed')
         return

     # Yield outside the try so closing the generator is not logged as a
     # failed extraction.
     yield item

示例#3

显示文件

 def processArticle(self, response):
     """Build an ArticleItem from an article page.

     The title is the first text node of the ``<h3>`` inside
     ``div#jh604 > div.guojiA10232_ind03``; the content is the concatenation
     of the ``<p>`` text nodes directly under ``div.text``.

     Args:
         response: object exposing ``xpath()`` and ``url``
             (presumably a Scrapy response -- confirm against the caller).

     Yields:
         A single ``ArticleItem`` with ``articleTitle``, ``articleUrl`` and
         ``articleContent`` set, or nothing when extraction fails (for
         example when the title query has no match and ``[0]`` raises).
     """
     try:
         title = response.xpath(
             '''//div[@id="jh604"]/div[@class="guojiA10232_ind03"]/h3/text()'''
         ).extract()[0]
         paragraphs = response.xpath(
             '''//div[@class='text']/p/text()''').extract()
         item = ArticleItem()
         item['articleTitle'] = title
         # The original joined the URL's characters with '', which is a
         # no-op for a string; assign it directly.
         item['articleUrl'] = response.url
         item['articleContent'] = ''.join(paragraphs)
     except Exception:
         # Deliberate best-effort: skip pages that do not match the layout.
         # `Exception` (not a bare except) lets KeyboardInterrupt,
         # SystemExit and GeneratorExit propagate.
         self.logger.info('item in article failed')
         return

     # Yield outside the try so closing the generator is not logged as a
     # failed extraction.
     yield item

示例#4

显示文件

 def processArticle(self, response):
     """Build an ArticleItem from an article page.

     The title is the first text node of the ``<h1>`` inside
     ``div.column // div.bg-content``; the content is the concatenation of
     the ``<p>`` text nodes under the article-content div inside
     ``div.inner``.

     Args:
         response: object exposing ``xpath()`` and ``url``
             (presumably a Scrapy response -- confirm against the caller).

     Yields:
         A single ``ArticleItem`` with ``articleTitle``, ``articleUrl`` and
         ``articleContent`` set, or nothing when extraction fails (for
         example when the title query has no match and ``[0]`` raises).
     """
     try:
         title = response.xpath(
             '''//div[@class="column"]//div[@class="bg-content"]/h1/text()'''
         ).extract()[0]
         paragraphs = response.xpath(
             '''//div[@class='inner']/div[@class="fs-small cont-detail det article-content ov"]/p/text()'''
         ).extract()
         item = ArticleItem()
         item['articleTitle'] = title
         # The original joined the URL's characters with '', which is a
         # no-op for a string; assign it directly.
         item['articleUrl'] = response.url
         item['articleContent'] = ''.join(paragraphs)
     except Exception:
         # Deliberate best-effort: skip pages that do not match the layout.
         # `Exception` (not a bare except) lets KeyboardInterrupt,
         # SystemExit and GeneratorExit propagate.
         self.logger.info('item in article failed')
         return

     # Yield outside the try so closing the generator is not logged as a
     # failed extraction.
     yield item