def processArticle(self, response):
    """Yield an ArticleItem extracted with Goose when the URL scores as news.

    The response URL is scored with ``calc_score``. A score of 3 or more is
    treated as a probable news page: Goose extracts the title and cleaned
    text, and if the cleaned text is empty the extraction is retried with
    the Chinese stop-word class. A lower score only prints a hint to stdout
    and yields nothing.

    Args:
        response: object exposing the page address as ``response.url``.

    Yields:
        ArticleItem with ``articleTitle``, ``articleUrl`` and
        ``articleContent`` set. Nothing is yielded when the score is too
        low or when extraction fails (the failure is logged at INFO).
    """
    url = response.url
    score = calc_score(url)
    if score >= 3:
        try:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print('this url maybe a news_url')
            article = Goose().extract(url=url)
            title = article.title
            content = article.cleaned_text
            if not content:
                # Empty text from the default extractor: retry with the
                # Chinese stop-word class. The title from the first pass
                # is kept.
                print('news in chinese')
                chinese_goose = Goose({'stopwords_class': StopWordsChinese})
                content = chinese_goose.extract(url=url).cleaned_text
            item = ArticleItem()
            item['articleTitle'] = title
            item['articleUrl'] = url
            item['articleContent'] = content
        except Exception:
            # Best-effort: a page that cannot be extracted is skipped.
            # ``Exception`` (not a bare except) lets GeneratorExit,
            # KeyboardInterrupt and SystemExit propagate untouched.
            self.logger.info('item in article failed')
        else:
            # Yield outside the try so closing the generator is never
            # misreported as an extraction failure.
            yield item
    else:
        print('this url maybe not a news_url, ' + ' score only ' + str(score))
        print('you can check this url: ' + url)
def processArticle(self, response):
    """Yield an ArticleItem built from the page's ``<h1>`` and article body.

    The title is the first text node matched by ``//h1/text()``; the content
    is every ``<p>`` text node under ``div.article-content-wrap`` joined
    without a separator.

    Args:
        response: object exposing ``xpath(...)`` selectors and ``url``.

    Yields:
        ArticleItem with ``articleTitle``, ``articleUrl`` and
        ``articleContent`` set. Nothing is yielded when building the item
        fails, e.g. when no title node matches (logged at INFO).
    """
    try:
        # [0] raises IndexError when the page has no matching title.
        title = response.xpath('//h1/text()').extract()[0]
        paragraphs = response.xpath(
            "//div[@class='article-content-wrap']//p/text()").extract()
        item = ArticleItem()
        item['articleTitle'] = title
        item['articleUrl'] = response.url
        item['articleContent'] = ''.join(paragraphs)
    except Exception:
        # Best-effort: skip pages that do not match the expected layout.
        # ``Exception`` (not a bare except) lets GeneratorExit,
        # KeyboardInterrupt and SystemExit propagate untouched.
        self.logger.info('item in article failed')
    else:
        # Yield outside the try so closing the generator is never
        # misreported as an extraction failure.
        yield item
def processArticle(self, response):
    """Yield an ArticleItem built from the ``jh604`` headline and text body.

    The title is the first ``<h3>`` text node under
    ``div#jh604 / div.guojiA10232_ind03``; the content is every ``<p>``
    text node directly under ``div.text`` joined without a separator.

    Args:
        response: object exposing ``xpath(...)`` selectors and ``url``.

    Yields:
        ArticleItem with ``articleTitle``, ``articleUrl`` and
        ``articleContent`` set. Nothing is yielded when building the item
        fails, e.g. when no title node matches (logged at INFO).
    """
    try:
        # [0] raises IndexError when the page has no matching title.
        title = response.xpath(
            '//div[@id="jh604"]/div[@class="guojiA10232_ind03"]/h3/text()'
        ).extract()[0]
        paragraphs = response.xpath("//div[@class='text']/p/text()").extract()
        item = ArticleItem()
        item['articleTitle'] = title
        item['articleUrl'] = response.url
        item['articleContent'] = ''.join(paragraphs)
    except Exception:
        # Best-effort: skip pages that do not match the expected layout.
        # ``Exception`` (not a bare except) lets GeneratorExit,
        # KeyboardInterrupt and SystemExit propagate untouched.
        self.logger.info('item in article failed')
    else:
        # Yield outside the try so closing the generator is never
        # misreported as an extraction failure.
        yield item
def processArticle(self, response):
    """Yield an ArticleItem built from the column headline and detail body.

    The title is the first ``<h1>`` text node under
    ``div.column // div.bg-content``; the content is every ``<p>`` text node
    directly under the article-detail ``div`` inside ``div.inner``, joined
    without a separator.

    Args:
        response: object exposing ``xpath(...)`` selectors and ``url``.

    Yields:
        ArticleItem with ``articleTitle``, ``articleUrl`` and
        ``articleContent`` set. Nothing is yielded when building the item
        fails, e.g. when no title node matches (logged at INFO).
    """
    try:
        # [0] raises IndexError when the page has no matching title.
        title = response.xpath(
            '//div[@class="column"]//div[@class="bg-content"]/h1/text()'
        ).extract()[0]
        # Adjacent literals concatenate into the single XPath expression;
        # the class attribute must match this exact string.
        paragraphs = response.xpath(
            "//div[@class='inner']"
            '/div[@class="fs-small cont-detail det article-content ov"]'
            '/p/text()'
        ).extract()
        item = ArticleItem()
        item['articleTitle'] = title
        item['articleUrl'] = response.url
        item['articleContent'] = ''.join(paragraphs)
    except Exception:
        # Best-effort: skip pages that do not match the expected layout.
        # ``Exception`` (not a bare except) lets GeneratorExit,
        # KeyboardInterrupt and SystemExit propagate untouched.
        self.logger.info('item in article failed')
    else:
        # Yield outside the try so closing the generator is never
        # misreported as an extraction failure.
        yield item