def sync(self):
    """Fetch the feed and store every article that is not saved yet.

    The feed is requested with the previously stored ETag / Last-Modified
    values. When the extractor reports a modification, each article is
    converted to an ``Article`` row, committed on its own and, if
    ``self.is_add_to_search_index`` is set, pushed to the search index.
    A failure on one article is logged and does not stop the others.

    Returns:
        list: the ``Article`` models committed during this run.
    """
    import hashlib

    # Kept function-local like the original, but imported once per call
    # instead of once per article.
    from rdr.components.timezone import convert_to_local

    feed = self.feed
    extractor = FeedExtractor(feed.channel_url)
    extractor.parse(etag=feed.last_etag_header,
                    modified=feed.last_modified_header)

    article_models = []
    if extractor.is_modified:
        for article in extractor.articles:
            try:
                if not article.published_date:
                    raise Exception('Empty published date')
                published_date = convert_to_local(article.published_date)

                # Skip anything published before the previous sync.
                if feed.last_update and published_date < feed.last_update:
                    continue

                text = article.safe_text
                title = article.title

                # Title + publication time identifies an article in a feed.
                check_string = (
                    title + ' | '
                    + published_date.strftime('%Y-%m-%d %H:%M:%S')
                ).encode('utf-8')
                hash_ = hashlib.sha1(check_string).hexdigest()

                saved_article = Article.query.filter(
                    (Article.feed_id == feed.id) & (Article.hash == hash_)
                ).first()
                if saved_article is not None:
                    continue

                article_model = Article(title=article.title,
                                        article_url=article.url,
                                        feed_id=feed.id,
                                        preview_text=html.nl2br(text),
                                        active=True,
                                        published=published_date,
                                        fetched=DateTime.now(),
                                        hash=hash_)

                image_url = article.primary_image_url
                if (image_url
                        and http.check_is_not_local_url(image_url)
                        and http.check_is_absolute_url(image_url)):
                    article_model.preview_image_src = image_url

                db.session.add(article_model)
                db.session.commit()
                # Only report the article once it is actually persisted.
                article_models.append(article_model)

                if self.is_add_to_search_index:
                    from rdr.modules.feeds.search import ArticleSearchIndex
                    ArticleSearchIndex(article_model).create_index()
            except Exception as e:
                app.logger.exception(e)
                # Without a rollback a failed flush/commit leaves the
                # session unusable for the remaining articles and for the
                # feed update below.
                db.session.rollback()

    # NOTE(review): this bookkeeping runs even when the feed reported no
    # modification - confirm that matches the intended behaviour.
    if not feed.active:
        feed.active = True
    feed.last_etag_header = extractor.etag_header
    feed.last_modified_header = extractor.modified_header
    feed.last_update = DateTime.now()
    db.session.commit()
    return article_models
def fetch_images(self, obj):
    """Collect usable image descriptors from ``obj['images']``.

    Args:
        obj: container that may hold an ``'images'`` key whose value is an
            iterable of dict-like entries with a ``'url'`` and an optional
            ``'primary'`` flag.

    Returns:
        list[dict]: one ``{'primary': ..., 'src': ...}`` dict per entry
        whose URL passes both ``http.check_is_absolute_url`` and
        ``http.check_is_not_local_url``. Empty when the key is missing or
        its value is empty / ``None``.
    """
    # A present-but-null "images" value used to raise TypeError on iteration.
    if 'images' not in obj or not obj['images']:
        return []

    result = []
    for img in obj['images']:
        src = img.get('url')
        if not src:
            continue
        if http.check_is_absolute_url(src) and http.check_is_not_local_url(src):
            result.append({
                'primary': img.get('primary', False),
                'src': src,
            })
    return result
def fetch_article_full_text(self, article):
    """Download an article page and extract its full text with newspaper.

    The parsing language is taken from the article's feed; when the feed
    language is missing or unsupported, ``langdetect`` (if installed) is
    tried on the preview text, and ``'en'`` is the last resort.

    Args:
        article: object exposing ``feed``, ``preview_text`` and
            ``article_url``.

    Returns:
        ArticleFullTextResult: title, plain text and a list holding the
        top image (flagged primary) when its URL is absolute and not local.

    Raises:
        Exception: if the article has no feed.
    """
    feed = article.feed
    if not feed:
        # Checked before the heavy third-party imports so it fails fast.
        raise Exception('Can\'t fetch full text for article without feed')

    from newspaper import Article as NewspaperArticle
    from newspaper.utils import get_available_languages

    supported_langs = get_available_languages()

    def supported(code):
        """Return *code* in a form newspaper accepts, or None."""
        if not code:
            return None
        if code in supported_langs:
            return code
        # Region-qualified tags such as 'en-us' or 'zh-cn' are common in
        # feeds and langdetect output; fall back to the primary subtag.
        primary = code.replace('_', '-').split('-', 1)[0].lower()
        return primary if primary in supported_langs else None

    lang = supported(feed.language)
    if not lang:
        app.logger.warning('%s not in newspaper languages list', feed.language)
        if article.preview_text:
            try:
                from langdetect import detect
                lang = supported(detect(article.preview_text))
            except ImportError:
                # langdetect is optional.
                pass
            except Exception as e:
                app.logger.exception(e)
    if not lang:
        lang = 'en'

    np_article = NewspaperArticle(
        article.article_url,
        language=lang,
        browser_user_agent=app.config.get('DEFAULT_USER_AGENT'),
        keep_article_html=True,
    )
    np_article.download()
    np_article.parse()

    images = []
    top_image_url = np_article.top_image
    if (top_image_url
            and http.check_is_absolute_url(top_image_url)
            and http.check_is_not_local_url(top_image_url)):
        images.append({
            'src': top_image_url,
            'primary': True,
        })
    return ArticleFullTextResult(title=np_article.title,
                                 text=np_article.text,
                                 images=images)