def crawl_topnews(self, lang="en"):
    """
    Fetch the top-news feed, save every entry as a News object and
    return the saved objects.

    For each feed entry the link is replaced by the value of the ``url``
    query parameter found in the entry's original link, and the requested
    language is stored under ``lang`` before the entry is wrapped in News.

    @param lang='en' the language of the news (optional)
    @returns list of News objects with the top news (empty if the feed
             has no entries)
    """
    params = {"cf": "all", "ned": "us", "hl": lang}
    feed = feedparser.parse(self.build_url(params))

    top_news = []
    for entry in feed["entries"]:
        query = urlparse(entry["link"]).query

        # Split each "key=value" pair on the FIRST "=" only, so values that
        # themselves contain "=" (or an empty query string) do not blow up.
        query_args = {}
        for part in query.split("&"):
            key, _, value = part.partition("=")
            query_args[key] = value

        entry["lang"] = lang
        # Keep the original link when no "url" parameter is present rather
        # than aborting the whole crawl on one unexpected entry.
        entry["link"] = query_args.get("url") or entry["link"]

        news = News(entry)
        news.save()
        top_news.append(news)

    return top_news
def __fetch_news(self, news_data):
    """
    Download a single news item, prepare its fields and save it.

    Steps, in order: fetch the item from ``newsUrl`` + id, run its body
    through ``parse_news_body``, pass the image and thumbnail through
    ``upload_to_qiniu``, parse the date and save the result via ``News``.

    @param news_data mapping that provides 'news_id', 'thumbnail' and
                     'date' (date formatted as YYYYMMDD)
    @returns the fetched item's 'id', or None when nothing was fetched or
             the fetched id does not match the requested one
    """
    requested_id = news_data['news_id']

    fetch_log('fetch news id %d' % (requested_id))
    payload = fetch_data(newsUrl + str(requested_id))
    fetch_log('fetched news id %d' % (requested_id))

    # Guard clauses: bail out on a failed fetch or an id mismatch.
    if payload is None:
        return None
    if requested_id != payload['id']:
        return None

    payload['body'] = parse_news_body(payload['body'])
    fetch_log('parsed news body')

    try:
        payload['image'] = upload_to_qiniu(payload['image'])
    except KeyError:
        # No image available: use the default placeholder instead.
        payload['image'] = 'default-lg.jpg'
    fetch_log('image uploaded')

    payload['thumbnail'] = upload_to_qiniu(news_data['thumbnail'])
    fetch_log('thumbnail uploaded')

    payload['date'] = datetime.strptime(news_data['date'], '%Y%m%d')

    record = News(news_id=int(payload['id']))
    record.save(payload)
    fetch_log('news %s saved' % int(payload['id']))

    return payload['id']