示例#1
0
 def setUp(self):
     """Prepare one article that is detached from its cluster.

     Picks the first article returned by the article controller, deletes
     the cluster it belongs to while keeping the articles
     (``delete_articles=False``), reloads that article and clears the
     cache of ``content_generator.get_content_generator``.
     """
     super().setUp()
     self.actrl = ArticleController()
     first_article = self.actrl.read().first()
     # drop the cluster only; the article itself must survive
     ClusterController().delete(first_article.cluster_id,
                                delete_articles=False)
     # reload the article now that its cluster is gone
     self.article = self.actrl.get(id=first_article.id)
     content_generator.get_content_generator.cache_clear()
示例#2
0
文件: cluster.py 项目: jaesivsm/JARR
    def _preprocess_per_article_filters(self, filters):
        """Turn article-level filters into a cluster ``id__in`` filter.

        The keys ``__or__``, ``title__ilike`` and ``content__ilike`` are
        popped from ``filters`` (which is modified in place). When at least
        one of them was present, the articles matching them are read and
        ``filters['id__in']`` is set to the set of their ``cluster_id``.
        """
        article_level_keys = {'__or__', 'title__ilike', 'content__ilike'}
        # intersection() builds a new set, so popping while looping is safe
        art_filters = {key: filters.pop(key)
                       for key in article_level_keys.intersection(filters)}

        if art_filters:
            matching = ArticleController(self.user_id).read(**art_filters)
            filters['id__in'] = {
                row[0] for row in matching.with_entities(Article.cluster_id)
            }
示例#3
0
 def clusterize_pending_articles(self):
     """Clusterize every article of the user that has no cluster yet.

     The pending articles (``cluster_id=None``) are fetched once; their
     number is logged and observed on the ``WORKER_BATCH`` metric, then
     each article is passed to ``self.clusterize`` together with the result
     of its feed filters.

     Returns:
         list: the ``id`` of the object returned by ``self.clusterize``
         for each pending article, in processing order.
     """
     actrl = ArticleController(self.user_id)
     # Materialise the pending articles a single time and iterate that same
     # list below: this avoids running the query twice and guarantees the
     # logged / observed count matches what is actually processed.
     articles = list(actrl.read(cluster_id=None))
     logger.info('got %d articles to clusterize', len(articles))
     WORKER_BATCH.labels(worker_type='clusterizer').observe(len(articles))
     results = []
     for article in articles:
         filter_result = process_filters(article.feed.filters, {
             'tags': article.tags,
             'title': article.title,
             'link': article.link
         })
         results.append(self.clusterize(article, filter_result).id)
     return results
示例#4
0
文件: cluster.py 项目: jaesivsm/JARR
 def clusterize_pending_articles(self):
     """Run the clusterizer on every article of the user lacking a cluster.

     The number of pending articles (``cluster_id=None``) is counted,
     logged and observed on the ``WORKER_BATCH`` metric; each pending
     article is then handed to ``Clusterizer.main`` along with the result
     of its feed filters.

     Returns:
         list: the ``id`` of the object returned by ``Clusterizer.main``
         for each pending article, in processing order.
     """
     article_ctrl = ArticleController(self.user_id)
     pending_count = article_ctrl.read(cluster_id=None).count()
     logger.info('User(%s) got %d articles to clusterize', self.user_id,
                 pending_count)
     WORKER_BATCH.labels(worker_type='clusterizer').observe(pending_count)
     clusterizer = Clusterizer(self.user_id)

     cluster_ids = []
     for pending in article_ctrl.read(cluster_id=None):
         feed_filters = pending.feed.filters
         article_info = {
             'tags': pending.tags,
             'title': pending.title,
             'link': pending.link
         }
         filter_result = process_filters(feed_filters, article_info)
         cluster = clusterizer.main(pending, filter_result)
         cluster_ids.append(cluster.id)
     return cluster_ids
示例#5
0
    def _get_query_for_clustering(self, article, filters, filter_tfidf=False):
        """Yield the articles ``article`` may be clustered with.

        ``filters`` is completed in place so that it only matches articles
        which: already have a cluster, belong to the same user, are not
        ``article`` itself, and whose ``date`` or ``retrieved_date`` lies
        within ``time_delta`` days (a cluster preference of the feed) of
        the corresponding date of ``article``. Articles of the same
        category / feed are excluded unless ``is_same_ok`` allows them.

        The query is joined on feeds whose ``cluster_enabled`` equals True
        or None (and likewise ``cluster_tfidf_enabled`` when
        ``filter_tfidf`` is set). Each row is then re-checked with
        ``get_config``; rejected rows increment the ``CLUSTERING`` metric
        and are skipped.
        """
        window = timedelta(
            days=get_cluster_pref(article.feed, 'time_delta'))
        published_window = {
            'date__lt': article.date + window,
            'date__gt': article.date - window
        }
        retrieved_window = {
            'retrieved_date__lt': article.retrieved_date + window,
            'retrieved_date__gt': article.retrieved_date - window
        }
        filters['cluster_id__ne'] = None
        filters['user_id'] = article.user_id
        filters['id__ne'] = article.id
        filters['__or__'] = [published_window, retrieved_window]
        if article.category_id and not is_same_ok(article, 'category'):
            filters['category_id__ne'] = article.category_id
        if not is_same_ok(article, 'feed'):
            filters['feed_id__ne'] = article.feed_id

        join_conditions = [
            Feed.id == Article.feed_id,
            or_(Feed.cluster_enabled.__eq__(True),
                Feed.cluster_enabled.__eq__(None))
        ]
        if filter_tfidf:
            tfidf_allowed = or_(Feed.cluster_tfidf_enabled.__eq__(True),
                                Feed.cluster_tfidf_enabled.__eq__(None))
            join_conditions.append(tfidf_allowed)

        candidates = ArticleController(self.user_id).read(**filters)
        candidates = candidates.join(Feed, and_(*join_conditions))

        # category-related settings are awkward to express in the query,
        # so every row is checked again here in Python
        for candidate in candidates:
            if not get_config(candidate, "cluster_enabled"):
                rejected_on = "none"
            elif filter_tfidf \
                    and not get_config(candidate, "cluster_tfidf_enabled"):
                rejected_on = "tfidf"
            else:
                yield candidate
                continue
            CLUSTERING.labels(filters="allow",
                              config="target-forbid",
                              result="miss",
                              match=rejected_on).inc()
示例#6
0
文件: cluster.py 项目: jaesivsm/JARR
 def delete(self, obj_id, delete_articles=True):
     """Delete the cluster ``obj_id``, handling its articles first.

     The cluster's ``main_article_id`` is set to None (uncommitted). Then
     either every article of the cluster is removed through
     ``delete_only_article`` (``delete_articles=True``, the default), or
     the articles are kept and their cluster-related columns are reset to
     None. The parent class' ``delete`` is finally called and its result
     returned.
     """
     self.update({'id': obj_id}, {'main_article_id': None}, commit=False)
     article_ctrl = ArticleController(self.user_id)
     if not delete_articles:
         # keep the articles but wipe every trace of the cluster on them
         detached_values = dict.fromkeys((
             'cluster_id',
             'cluster_reason',
             'cluster_score',
             'cluster_tfidf_with',
             'cluster_tfidf_neighbor_size',
         ))
         article_ctrl.update({'cluster_id': obj_id}, detached_values)
     else:
         for member in article_ctrl.read(cluster_id=obj_id):
             article_ctrl.delete_only_article(member, commit=False)
     return super().delete(obj_id)
示例#7
0
class ContentGeneratorTest(JarrFlaskCommon):
    """Check which content generator an article gets and what clustering
    that article with ``Clusterizer().main`` produces."""

    def setUp(self):
        """Prepare one article that is detached from its cluster.

        The first article returned by the controller is used: its cluster
        is deleted while the articles are kept, the article is reloaded and
        the cache of ``get_content_generator`` is cleared.
        """
        super().setUp()
        self.actrl = ArticleController()
        article = self.actrl.read().first()
        ClusterController().delete(article.cluster_id, delete_articles=False)
        self.article = self.actrl.get(id=article.id)
        content_generator.get_content_generator.cache_clear()

    def set_truncated_content(self, **kwargs):
        """Update the feed of ``self.article`` with ``kwargs`` plus
        ``truncated_content=True``."""
        kwargs.update({'truncated_content': True})
        FeedController().update({'id': self.article.feed.id}, kwargs)

    @patch('jarr.controllers.article.ArticleController.enhance')
    def test_article_image_enhancement(self, enhance=None):
        """An 'image' article uses ImageContentGenerator and yields a
        cluster content of type 'image' whose 'src' is the article link.

        ``enhance`` is the mock injected by the ``@patch`` decorator; it is
        expected to be called exactly once.
        """
        self.actrl.update({'id': self.article.id}, {
            'article_type': 'image',
            'vector': None
        })
        self.assertEqual(content_generator.ImageContentGenerator,
                         self.article.content_generator.__class__)
        Clusterizer().main(self.article)
        self.assertEqual('image', self.article.cluster.content.get('type'))
        self.assertEqual(self.article.link,
                         self.article.cluster.content.get('src'))
        self.assertEqual(1, enhance.call_count)
        self.assertIsNone(self.article.vector)

    @patch('jarr.controllers.article.ArticleController.enhance')
    def test_article_embedded_enhancement(self, enhance=None):
        """An 'embedded' article with a youtube link uses
        EmbeddedContentGenerator and yields a cluster content describing
        the youtube player and the video id.

        ``enhance`` is the mock injected by the ``@patch`` decorator; it is
        expected to be called exactly once.
        """
        self.actrl.update({'id': self.article.id}, {
            'article_type': 'embedded',
            'link': "https://www.youtube.com/"
            "watch?v=scbrjaqM3Oc",
            "vector": None
        })
        self.assertEqual(content_generator.EmbeddedContentGenerator,
                         self.article.content_generator.__class__)
        Clusterizer().main(self.article)
        self.assertEqual("embedded", self.article.cluster.content.get('type'))
        self.assertEqual("youtube", self.article.cluster.content.get('player'))
        self.assertEqual("scbrjaqM3Oc",
                         self.article.cluster.content.get('videoId'))
        self.assertEqual(1, enhance.call_count)
        self.assertIsNone(self.article.vector)

    def test_article_image_enhancement_on_truncated(self):
        """Same expectations as the image test once the feed is flagged
        ``truncated_content``."""
        self.set_truncated_content()
        self.test_article_image_enhancement()

    def test_article_embedded_enhancement_on_truncated(self):
        """Same expectations as the embedded test once the feed is flagged
        ``truncated_content``."""
        self.set_truncated_content()
        self.test_article_embedded_enhancement()

    @patch('jarr.lib.content_generator.Goose')
    @patch('jarr.lib.content_generator.ContentGenerator._from_goose_to_html')
    def test_article_truncated_enhancement(
            self,
            from_goose=None,
            goose=None,
            cg=content_generator.TruncatedContentGenerator):
        """An article of a ``truncated_content`` feed takes its lang, title,
        tags and vector from the (mocked) Goose extraction.

        Stacked patches are injected bottom-up: ``from_goose`` mocks
        ``_from_goose_to_html`` and ``goose`` mocks ``Goose``. ``cg`` is the
        content generator class the article is expected to get, which lets
        other tests reuse these assertions with another class.
        """
        from_goose.return_value = 'my collated content'
        # stand-in for what Goose().extract() returns
        patched_goose = Mock(opengraph={'locale': 'en'},
                             meta_lang='fr',
                             final_url='my final url',
                             meta_keywords='Monthy Python, Brian',
                             tags=['The Holy Graal', 'Monthy Python'],
                             title='Flying Circus',
                             cleaned_text="Bring out your dead !")
        goose.return_value.extract.return_value = patched_goose
        self.set_truncated_content()
        self.assertEqual(cg, self.article.content_generator.__class__)
        self.article = self.actrl.get(id=self.article.id)
        Clusterizer().main(self.article)
        self.assertEqual(3, len(self.article.tags))
        self.assertEqual('en', self.article.lang)
        self.assertEqual('Flying Circus', self.article.title)
        self.assertEqual('Flying Circus', self.article.cluster.main_title)
        # NOTE(review): keys look like stemmed tokens of the mocked title,
        # tags, keywords and text -- confirm against the vector builder
        self.assertEqual(
            {
                'brian': 1,
                'bring': 1,
                'circus': 1,
                'dead': 1,
                'fli': 1,
                'graal': 1,
                'holi': 1,
                'monthi': 1,
                'python': 1
            }, self.article.simple_vector)

    @patch('jarr.lib.content_generator.Goose')
    @patch('jarr.lib.content_generator.ContentGenerator._from_goose_to_html')
    def test_reddit_original_enhancement(self, from_goose, goose):
        """On a truncated 'reddit' feed, an article whose ``comments`` equals
        its ``link`` uses RedditContentGenerator, never calls Goose and
        yields an empty cluster content."""
        self.set_truncated_content(feed_type='reddit')
        self.actrl.update({'id': self.article.id},
                          {'comments': self.article.link})
        self.assertEqual(content_generator.RedditContentGenerator,
                         self.article.content_generator.__class__)
        Clusterizer().main(self.article)
        self.assertEqual(0, from_goose.call_count)
        self.assertEqual(0, goose.call_count)
        self.assertEqual({}, self.article.cluster.content)

    @patch('jarr.lib.content_generator.Goose')
    @patch('jarr.lib.content_generator.ContentGenerator._from_goose_to_html')
    def test_reddit_image_link_enhancement(self, from_goose, goose):
        """On a truncated 'reddit' feed, an 'image' article still behaves as
        in the image test and never calls Goose."""
        self.set_truncated_content(feed_type='reddit')
        self.actrl.update({'id': self.article.id}, {'article_type': 'image'})
        self.test_article_image_enhancement()
        self.assertEqual(content_generator.ImageContentGenerator,
                         self.article.content_generator.__class__)
        self.assertEqual(0, from_goose.call_count)
        self.assertEqual(0, goose.call_count)

    def test_reddit_truncated_enhancement(self):
        """Re-run the truncated-content test on a 'reddit' feed, expecting
        RedditContentGenerator as the content generator class."""
        self.set_truncated_content(feed_type='reddit')
        self.test_article_truncated_enhancement(
            cg=content_generator.RedditContentGenerator)