Example no. 1
0
    def sync(self):
        """Synchronize ``self.feed`` with its remote channel.

        Fetches the channel (passing the stored ETag / Last-Modified headers
        so the server can answer "not modified"), persists every article that
        is new since the feed's last update, and optionally pushes each new
        article into the search index.

        :return: list of newly created Article models (empty when the channel
                 reported no modification).
        """
        # Hoisted out of the per-article loop: both imports are loop-invariant.
        import hashlib
        from rdr.components.timezone import convert_to_local

        feed = self.feed
        extractor = FeedExtractor(feed.channel_url)
        extractor.parse(etag=feed.last_etag_header, modified=feed.last_modified_header)
        article_models = []
        if not extractor.is_modified:
            # Nothing changed upstream: keep headers/last_update untouched.
            return article_models

        for article in extractor.articles:
            try:
                if not article.published_date:
                    # Without a date we can neither order nor dedupe the article.
                    app.logger.warning(
                        'Skipping article with empty published date: %s', article.url)
                    continue
                published_date = convert_to_local(article.published_date)
                if feed.last_update and published_date < feed.last_update:
                    continue  # already seen in a previous sync
                title = article.title
                # Dedup key: title + published date, SHA-1 hashed, scoped per feed.
                check_string = (title + ' | ' +
                                published_date.strftime('%Y-%m-%d %H:%M:%S')).encode('utf-8')
                hash_ = hashlib.sha1(check_string).hexdigest()
                saved_article = Article.query.filter(
                    (Article.feed_id == feed.id) & (Article.hash == hash_)).first()
                if saved_article is not None:
                    continue  # duplicate of an article stored earlier
                article_model = Article(title=title,
                                        article_url=article.url,
                                        feed_id=feed.id,
                                        preview_text=html.nl2br(article.safe_text),
                                        active=True,
                                        published=published_date,
                                        fetched=DateTime.now(),
                                        hash=hash_)
                image_url = article.primary_image_url
                # Only accept absolute, non-local image URLs as preview images.
                if image_url and http.check_is_not_local_url(image_url) \
                        and http.check_is_absolute_url(image_url):
                    article_model.preview_image_src = image_url
                article_models.append(article_model)
                db.session.add(article_model)
                # Commit per article so the row (and its id) exists for indexing.
                db.session.commit()
                if self.is_add_to_search_index:
                    from rdr.modules.feeds.search import ArticleSearchIndex
                    ArticleSearchIndex(article_model).create_index()
            except Exception as e:
                # Best-effort: one broken article must not abort the whole sync.
                app.logger.exception(e)

        if not feed.active:
            feed.active = True
        # Remember caching headers so the next poll can be conditional.
        feed.last_etag_header = extractor.etag_header
        feed.last_modified_header = extractor.modified_header
        feed.last_update = DateTime.now()
        db.session.commit()

        return article_models
Example no. 2
0
    def _resolve(self, query):
        """Resolve a user-supplied *query* URL into persisted Feed models.

        Tries to parse the query both as a site page (SourceExtractor) and as
        an RSS/Atom feed (FeedExtractor).  When the query itself is not a
        valid feed, falls back to the alternate feed links advertised by the
        site.  New feeds are committed together with an alias keyword, an
        optional alias URL, and an icon image when one can be retrieved.

        :param query: URL (or URL-like string) supplied by the user.
        :return: list of Feed models — existing matches plus newly created.
        """
        query = http.encode_url(query)
        extractors = []
        feeds = []

        # Attempt 1: read the query as a plain web page for site metadata.
        source = SourceExtractor(query)
        try:
            source.parse()
        except Exception as e:
            app.logger.exception(e)
            source = None

        # Attempt 2: read the query as a feed itself.
        feed_extractor = FeedExtractor(query)
        try:
            feed_extractor.parse()
            extractors.append(feed_extractor)
            if not source:
                app.logger.info('Try to extract source data via rss site url')
                source = SourceExtractor(feed_extractor.url)
                try:
                    source.parse()
                except Exception as e:
                    app.logger.exception(e)
                    source = None
        except InvalidFeedException as e:
            app.logger.exception(e)
            if not source or not source.feed_urls:
                return []  # neither a feed nor a page with feed links
            app.logger.info('Try to get alternate RSS feeds from site url')
            for url in source.feed_urls:
                match_feed = Feed.query.filter(
                    (Feed.url == url) | (Feed.channel_url == url)).first()
                if match_feed:
                    feeds.append(match_feed)
                    continue
                # todo: multi threading parsing
                app.logger.info('Try to extract feeds from alternate source: "%s"' % url)
                alt_extractor = FeedExtractor(url)
                try:
                    alt_extractor.parse()
                except Exception as e:
                    # One broken alternate URL must not abort the whole resolution.
                    app.logger.exception(e)
                    continue
                extractors.append(alt_extractor)

        for extractor in extractors:
            # Language: feed metadata, then site metadata, then domain heuristic.
            lang = extractor.language
            if not lang and source:
                lang = source.language
            if not lang:
                lang = try_to_resolve_lang_by_domain(query)
            if lang:
                lang = lang[:2]  # keep only the primary language subtag
            feed = Feed(url=extractor.url,
                        channel_url=extractor.channel_url,
                        title=extractor.title or 'Unnamed feed',
                        language=lang,
                        active=False,
                        created=datetime.now())
            db.session.add(feed)
            db.session.commit()  # commit first so feed.id is assigned
            db.session.add(FeedAliasKeyword(keyword=feed.title, feed_id=feed.id))
            if query != feed.url and query != feed.channel_url:
                # Remember the original query as an alias for later lookups.
                db.session.add(FeedAliasUrl(url=query, feed_id=feed.id))
            db.session.commit()
            feeds.append(feed)
            try:
                # Prefer the site's own icon image; fall back to the feed's.
                image = source.retrieve_image() if source else None
                if not image:
                    image = extractor.retrieve_image()
                if image:
                    image.owner_id = None

                    db.session.add(image)
                    db.session.commit()

                    feed.icon_image_id = image.id
                    db.session.commit()
            except Exception as e:
                # Icon retrieval is best-effort; the feed is already saved.
                app.logger.exception(e)

        if self.load_articles:
            app.logger.info('Load articles')
            # todo: multi threading loading
            from rdr.modules.feeds.articles.sync import ArticlesSynchronizer
            for feed in feeds:
                try:
                    ArticlesSynchronizer(feed).sync()
                except Exception as e:
                    app.logger.exception(e)
        return feeds