示例#1
0
    def processArticle(self, response):
        """Yield an ``ArticleItem`` for a page whose URL looks like a news article.

        The URL is scored with ``calc_score``; a score of 3 or more is treated
        as a probable news URL.  For such URLs the title and cleaned body text
        are extracted with Goose.  When the default extraction yields no body
        text, extraction is retried with the Chinese stop-word configuration
        and only the content is taken from that second pass (the title from
        the first pass is kept).

        Extraction is best-effort: any ordinary exception is logged (with
        traceback) and no item is produced.  URLs scoring below the threshold
        are reported on stdout and skipped.

        :param response: page response; only ``response.url`` is read here.
        :returns: generator yielding at most one populated ``ArticleItem``.
        """
        url = response.url
        score = calc_score(url)
        if score >= 3:
            # Single-argument print(...) behaves the same as the print
            # statement on Python 2 and is also valid Python 3.
            print('this url maybe a news_url')
            try:
                item = ArticleItem()
                article = Goose().extract(url=url)
                title = article.title
                content = article.cleaned_text
                if not content:
                    # Empty result from the default extractor: retry with the
                    # Chinese stop-word list.
                    print('news in chinese')
                    chinese_goose = Goose({'stopwords_class': StopWordsChinese})
                    content = chinese_goose.extract(url=url).cleaned_text
                item['articleTitle'] = title
                item['articleUrl'] = url
                item['articleContent'] = content
            except Exception:
                # ``except Exception`` rather than a bare ``except`` so that
                # GeneratorExit, KeyboardInterrupt and SystemExit propagate.
                self.logger.info('item in article failed', exc_info=True)
            else:
                # Yield outside the ``try`` so that closing the generator at
                # this point is not misreported as an extraction failure.
                yield item
        else:
            print('this url maybe not a news_url, ' + ' score only ' + str(score))
            print('you can check this url: ' + url)
示例#2
0
 def processArticle(self, response):
     """Build an ``ArticleItem`` from an article page and yield it.

     The title is the first text node matched by ``//h1/text()`` and the
     content is the concatenation of every ``<p>`` text node found under a
     ``div`` whose class attribute is ``article-content-wrap``.

     Extraction is best-effort: if anything goes wrong (for example the
     page has no ``<h1>`` text, so indexing the empty result fails) the
     failure is logged with its traceback and no item is produced.

     :param response: page response providing ``xpath()`` and ``url``
         (presumably a Scrapy response -- confirm against the spider).
     :returns: generator yielding at most one populated ``ArticleItem``.
     """
     try:
         item = ArticleItem()
         # [0] raises IndexError when no <h1> text exists; that is handled
         # below as an extraction failure.
         title = response.xpath('''//h1/text()''').extract()[0]
         paragraphs = response.xpath(
             '''//div[@class='article-content-wrap']//p/text()''').extract()
         item['articleTitle'] = title
         # ''.join() over a URL string just rebuilds the same string, so
         # the URL is stored directly.
         item['articleUrl'] = response.url
         item['articleContent'] = ''.join(paragraphs)
     except Exception:
         # ``except Exception`` rather than a bare ``except`` so that
         # GeneratorExit, KeyboardInterrupt and SystemExit propagate.
         self.logger.info('item in article failed', exc_info=True)
     else:
         # Yield outside the ``try`` so that closing the generator here is
         # not misreported as an extraction failure.
         yield item
示例#3
0
 def processArticle(self, response):
     """Build an ``ArticleItem`` from an article page and yield it.

     The title is the first ``<h3>`` text node inside
     ``div#jh604 > div.guojiA10232_ind03`` (matched by exact attribute
     values) and the content is the concatenation of the text of every
     ``<p>`` that is a direct child of a ``div`` whose class attribute is
     ``text``.

     Extraction is best-effort: if anything goes wrong (for example the
     title node is missing, so indexing the empty result fails) the failure
     is logged with its traceback and no item is produced.

     :param response: page response providing ``xpath()`` and ``url``
         (presumably a Scrapy response -- confirm against the spider).
     :returns: generator yielding at most one populated ``ArticleItem``.
     """
     try:
         item = ArticleItem()
         # [0] raises IndexError when the title node is absent; that is
         # handled below as an extraction failure.
         title = response.xpath(
             '''//div[@id="jh604"]/div[@class="guojiA10232_ind03"]/h3/text()'''
         ).extract()[0]
         paragraphs = response.xpath(
             '''//div[@class='text']/p/text()''').extract()
         item['articleTitle'] = title
         # ''.join() over a URL string just rebuilds the same string, so
         # the URL is stored directly.
         item['articleUrl'] = response.url
         item['articleContent'] = ''.join(paragraphs)
     except Exception:
         # ``except Exception`` rather than a bare ``except`` so that
         # GeneratorExit, KeyboardInterrupt and SystemExit propagate.
         self.logger.info('item in article failed', exc_info=True)
     else:
         # Yield outside the ``try`` so that closing the generator here is
         # not misreported as an extraction failure.
         yield item
示例#4
0
 def processArticle(self, response):
     """Build an ``ArticleItem`` from an article page and yield it.

     The title is the first ``<h1>`` text node found under
     ``div.column // div.bg-content`` (matched by exact class attribute
     values).  The content is the concatenation of the text of every
     ``<p>`` directly inside the article-body ``div`` (the one whose class
     attribute is exactly
     ``fs-small cont-detail det article-content ov``) under ``div.inner``.

     Extraction is best-effort: if anything goes wrong (for example the
     title node is missing, so indexing the empty result fails) the failure
     is logged with its traceback and no item is produced.

     :param response: page response providing ``xpath()`` and ``url``
         (presumably a Scrapy response -- confirm against the spider).
     :returns: generator yielding at most one populated ``ArticleItem``.
     """
     try:
         item = ArticleItem()
         # [0] raises IndexError when the title node is absent; that is
         # handled below as an extraction failure.
         title = response.xpath(
             '''//div[@class="column"]//div[@class="bg-content"]/h1/text()'''
         ).extract()[0]
         paragraphs = response.xpath(
             '''//div[@class='inner']/div[@class="fs-small cont-detail det article-content ov"]/p/text()'''
         ).extract()
         item['articleTitle'] = title
         # ''.join() over a URL string just rebuilds the same string, so
         # the URL is stored directly.
         item['articleUrl'] = response.url
         item['articleContent'] = ''.join(paragraphs)
     except Exception:
         # ``except Exception`` rather than a bare ``except`` so that
         # GeneratorExit, KeyboardInterrupt and SystemExit propagate.
         self.logger.info('item in article failed', exc_info=True)
     else:
         # Yield outside the ``try`` so that closing the generator here is
         # not misreported as an extraction failure.
         yield item