def sync(self):
    feed = self.feed
    extractor = FeedExtractor(feed.channel_url)
    extractor.parse(etag=feed.last_etag_header, modified=feed.last_modified_header)
    article_models = []
    if extractor.is_modified:
        for article in extractor.articles:
            try:
                if article.published_date:
                    from rdr.components.timezone import convert_to_local
                    published_date = convert_to_local(article.published_date)
                else:
                    # No publish date: jump to the except handler below and skip this article.
                    raise Exception('Empty published date')
                # Skip articles older than the last successful sync.
                if feed.last_update and published_date < feed.last_update:
                    continue
                text = article.safe_text
                title = article.title
                # Deduplicate by a SHA-1 of "<title> | <published timestamp>".
                import hashlib
                s1 = hashlib.sha1()
                check_string = (title + ' | ' + published_date.strftime('%Y-%m-%d %H:%M:%S')).encode('utf-8')
                s1.update(check_string)
                hash_ = s1.hexdigest()
                saved_article = Article.query.filter(
                    (Article.feed_id == feed.id) & (Article.hash == hash_)).first()
                if saved_article is not None:
                    continue
                fetched_date = DateTime.now()
                article_model = Article(title=article.title,
                                        article_url=article.url,
                                        feed_id=feed.id,
                                        preview_text=html.nl2br(text),
                                        active=True,
                                        published=published_date,
                                        fetched=fetched_date,
                                        hash=hash_)
                # Only keep preview images with absolute, non-local URLs.
                image_url = article.primary_image_url
                if image_url:
                    if http.check_is_not_local_url(image_url) and http.check_is_absolute_url(image_url):
                        article_model.preview_image_src = image_url
                article_models.append(article_model)
                db.session.add(article_model)
                db.session.commit()
                if self.is_add_to_search_index:
                    from rdr.modules.feeds.search import ArticleSearchIndex
                    search_provider = ArticleSearchIndex(article_model)
                    search_provider.create_index()
            except Exception as e:
                app.logger.exception(e)
        if not feed.active:
            feed.active = True
    # Remember the conditional-GET headers and the sync time for the next run.
    feed.last_etag_header = extractor.etag_header
    feed.last_modified_header = extractor.modified_header
    feed.last_update = DateTime.now()
    db.session.commit()
    return article_models
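# Illustrative, standalone sketch (not part of the module) of the dedup key that
# sync() computes above: the article title and its formatted publish timestamp are
# joined and SHA-1 hashed, so re-fetching the same entry yields the same hash and
# the article is skipped. The sample values below are made up.
import hashlib
from datetime import datetime

_title = 'Example article'
_published = datetime(2024, 1, 15, 9, 30, 0)
_check_string = (_title + ' | ' + _published.strftime('%Y-%m-%d %H:%M:%S')).encode('utf-8')
print(hashlib.sha1(_check_string).hexdigest())  # stable key for a (title, published) pair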
def _resolve(self, query):
    query = http.encode_url(query)
    extractors = []
    ex = FeedExtractor(query)
    source = SourceExtractor(query)
    feeds = []
    try:
        source.parse()
    except Exception as e:
        app.logger.exception(e)
        source = None
    try:
        ex.parse()
        extractors.append(ex)
        if not source:
            app.logger.info('Try to extract source data via rss site url')
            source = SourceExtractor(ex.url)
            try:
                source.parse()
            except Exception as e:
                app.logger.exception(e)
                source = None
    except InvalidFeedException as e:
        # The query is not a feed itself: fall back to the alternate feed URLs
        # advertised by the site, if any were discovered.
        app.logger.exception(e)
        if not source or not source.feed_urls:
            return []
        app.logger.info('Try to get alternate RSS feeds from site url')
        for url in source.feed_urls:
            match_feed = Feed.query.filter((Feed.url == url) | (Feed.channel_url == url)).first()
            if match_feed:
                feeds.append(match_feed)
            else:
                # todo: multi threading parsing
                app.logger.info('Try to extract feeds from alternate source: "%s"' % url)
                ex = FeedExtractor(url)
                ex.parse()
                extractors.append(ex)
    if extractors:
        for extractor in extractors:
            # Resolve the feed language: extractor first, then the source page,
            # then a guess from the query domain; keep only the two-letter code.
            lang = extractor.language
            if not lang and source:
                lang = source.language
            if not lang:
                lang = try_to_resolve_lang_by_domain(query)
            if lang:
                lang = lang[:2]
            feed = Feed(url=extractor.url,
                        channel_url=extractor.channel_url,
                        title=extractor.title or 'Unnamed feed',
                        language=lang,
                        active=False,
                        created=datetime.now())
            db.session.add(feed)
            db.session.commit()
            db.session.add(FeedAliasKeyword(keyword=feed.title, feed_id=feed.id))
            if query != feed.url and query != feed.channel_url:
                db.session.add(FeedAliasUrl(url=query, feed_id=feed.id))
            db.session.commit()
            feeds.append(feed)
            try:
                # Prefer the source (site) icon, fall back to the feed's own image.
                image = None
                if source:
                    image = source.retrieve_image()
                if not image:
                    image = extractor.retrieve_image()
                if image:
                    image.owner_id = None
                    db.session.add(image)
                    db.session.commit()
                    feed.icon_image_id = image.id
                    db.session.commit()
            except Exception as e:
                app.logger.exception(e)
    if self.load_articles:
        app.logger.info('Load articles')
        # todo: multi threading loading
        for feed in feeds:
            try:
                from rdr.modules.feeds.articles.sync import ArticlesSynchronizer
                synchronizer = ArticlesSynchronizer(feed)
                synchronizer.sync()
            except Exception as e:
                app.logger.exception(e)
    return feeds
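# Usage sketch (comment-form, illustrative only): how feed discovery and the article
# sync above fit together. The resolver class name and its constructor are assumptions;
# only _resolve()/load_articles and ArticlesSynchronizer.sync() appear in the snippets above.
#
#     resolver = FeedResolver(load_articles=False)       # hypothetical owning class of _resolve()
#     feeds = resolver._resolve('https://example.com/')  # matched or newly created Feed rows
#     for feed in feeds:
#         new_articles = ArticlesSynchronizer(feed).sync()   # fetch, dedup and store fresh articles
#         app.logger.info('Synced %d articles for "%s"' % (len(new_articles), feed.title))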