def setUp(self):
    """Prepare one cluster-less article for each test.

    Picks the first available article, dissolves the cluster it belongs
    to (keeping the articles themselves), and clears the cached
    content-generator lookup so every test starts from a clean state.
    """
    super().setUp()
    self.actrl = ArticleController()
    article = self.actrl.read().first()
    # Remove the cluster but keep its articles so the test can
    # re-clusterize this article itself.
    ClusterController().delete(article.cluster_id, delete_articles=False)
    # Re-fetch a fresh copy of the article after the cluster removal.
    self.article = self.actrl.get(id=article.id)
    # get_content_generator is lru-cached; clear it between tests.
    content_generator.get_content_generator.cache_clear()
def _preprocess_per_article_filters(self, filters):
    """Translate article-level filters into a cluster-level constraint.

    Filters aimed at article columns (``title__ilike``,
    ``content__ilike`` and the generic ``__or__``) are popped out of
    ``filters`` and replaced by an ``id__in`` filter on the cluster ids
    of the articles they match. ``filters`` is mutated in place.
    """
    per_article_keys = {'__or__', 'title__ilike', 'content__ilike'}
    art_filters = {key: filters.pop(key)
                   for key in per_article_keys & set(filters)}
    if not art_filters:
        return
    art_contr = ArticleController(self.user_id)
    matching = art_contr.read(**art_filters)\
                        .with_entities(Article.cluster_id)
    filters['id__in'] = {row[0] for row in matching}
def clusterize_pending_articles(self):
    """Attach every not-yet-clustered article to a cluster.

    Returns:
        list: the id of the cluster each pending article ended up in.
    """
    results = []
    actrl = ArticleController(self.user_id)
    # Materialize the pending set once and reuse it: the original code
    # re-ran the same query for the loop, paying a second round-trip and
    # risking iterating a different row set than the one counted and
    # reported below.
    articles = list(actrl.read(cluster_id=None))
    logger.info('got %d articles to clusterize', len(articles))
    WORKER_BATCH.labels(worker_type='clusterizer').observe(len(articles))
    for article in articles:
        # Feed-level filters may force or forbid clustering per article.
        filter_result = process_filters(article.feed.filters,
                                        {'tags': article.tags,
                                         'title': article.title,
                                         'link': article.link})
        results.append(self.clusterize(article, filter_result).id)
    return results
def clusterize_pending_articles(self):
    """Attach every not-yet-clustered article of this user to a cluster.

    Returns:
        list: the id of the cluster each pending article ended up in.
    """
    actrl = ArticleController(self.user_id)
    # Count first (without materializing rows) for logging and metrics.
    art_count = actrl.read(cluster_id=None).count()
    logger.info('User(%s) got %d articles to clusterize',
                self.user_id, art_count)
    WORKER_BATCH.labels(worker_type='clusterizer').observe(art_count)
    clusterizer = Clusterizer(self.user_id)
    # Feed-level filters may force or forbid clustering per article;
    # Clusterizer.main takes that processed result into account.
    return [clusterizer.main(
                article,
                process_filters(article.feed.filters,
                                {'tags': article.tags,
                                 'title': article.title,
                                 'link': article.link})).id
            for article in actrl.read(cluster_id=None)]
def _get_query_for_clustering(self, article, filters, filter_tfidf=False):
    """Yield candidate articles *article* could be clustered with.

    Builds a query for other articles of the same user that already
    belong to a cluster and whose publication or retrieval date falls
    within the feed's configured time window, then filters out
    candidates whose own configuration forbids clustering.

    Args:
        article: the article looking for a cluster.
        filters (dict): extra query filters; mutated in place.
        filter_tfidf (bool): when True, also require candidates to
            allow tfidf-based clustering.

    Yields:
        candidate articles eligible for clustering with *article*.
    """
    time_delta = timedelta(
        days=get_cluster_pref(article.feed, 'time_delta'))
    # A candidate matches if EITHER its publication date OR its
    # retrieval date is within +/- time_delta of the article's.
    date_cond = {'date__lt': article.date + time_delta,
                 'date__gt': article.date - time_delta}
    retr_cond = {'retrieved_date__lt': article.retrieved_date + time_delta,
                 'retrieved_date__gt': article.retrieved_date - time_delta}
    filters.update({'cluster_id__ne': None,
                    'user_id': article.user_id,
                    'id__ne': article.id,
                    '__or__': [date_cond, retr_cond]})
    # Honor the "same category / same feed" clustering preferences by
    # excluding candidates from the same category or feed when needed.
    if article.category_id and not is_same_ok(article, 'category'):
        filters['category_id__ne'] = article.category_id
    if not is_same_ok(article, 'feed'):
        filters['feed_id__ne'] = article.feed_id
    # cluster_enabled / cluster_tfidf_enabled being None is treated as
    # "enabled" (the explicit __eq__(None) keeps NULL rows in the join).
    feed_join = [Feed.id == Article.feed_id,
                 or_(Feed.cluster_enabled.__eq__(True),
                     Feed.cluster_enabled.__eq__(None))]
    if filter_tfidf:
        feed_join.append(or_(Feed.cluster_tfidf_enabled.__eq__(True),
                             Feed.cluster_tfidf_enabled.__eq__(None)))
    query = ArticleController(self.user_id).read(**filters)\
        .join(Feed, and_(*feed_join))

    # operations involving categories are complicated, handling in software:
    # the SQL join above only checks feed-level flags, so re-check the
    # effective (category-aware) configuration per candidate here.
    for candidate in query:
        if not get_config(candidate, "cluster_enabled"):
            CLUSTERING.labels(filters="allow", config="target-forbid",
                              result="miss", match="none").inc()
            continue
        if filter_tfidf \
                and not get_config(candidate, "cluster_tfidf_enabled"):
            CLUSTERING.labels(filters="allow", config="target-forbid",
                              result="miss", match="tfidf").inc()
            continue
        yield candidate
def delete(self, obj_id, delete_articles=True):
    """Delete a cluster, deleting or detaching its articles.

    Args:
        obj_id: id of the cluster to remove.
        delete_articles (bool): when True the cluster's articles are
            deleted as well; otherwise they are detached and their
            clustering metadata is reset.
    """
    # Drop the main-article reference first so the cluster row can go.
    self.update({'id': obj_id}, {'main_article_id': None}, commit=False)
    actrl = ArticleController(self.user_id)
    if delete_articles:
        for art in actrl.read(cluster_id=obj_id):
            actrl.delete_only_article(art, commit=False)
    else:
        # Detach articles: wipe every clustering-related column.
        reset_payload = dict.fromkeys(('cluster_id',
                                       'cluster_reason',
                                       'cluster_score',
                                       'cluster_tfidf_with',
                                       'cluster_tfidf_neighbor_size'))
        actrl.update({'cluster_id': obj_id}, reset_payload)
    return super().delete(obj_id)
class ContentGeneratorTest(JarrFlaskCommon):
    """Integration tests for the per-article content generators."""

    def setUp(self):
        """Prepare one cluster-less article for each test."""
        super().setUp()
        self.actrl = ArticleController()
        article = self.actrl.read().first()
        # Remove the cluster but keep its articles so the test can
        # re-clusterize this article itself.
        ClusterController().delete(article.cluster_id,
                                   delete_articles=False)
        # Re-fetch a fresh copy of the article after the cluster removal.
        self.article = self.actrl.get(id=article.id)
        # get_content_generator is lru-cached; clear it between tests.
        content_generator.get_content_generator.cache_clear()

    def set_truncated_content(self, **kwargs):
        """Mark the article's feed as truncated (plus extra feed attrs)."""
        kwargs.update({'truncated_content': True})
        FeedController().update({'id': self.article.feed.id}, kwargs)

    @patch('jarr.controllers.article.ArticleController.enhance')
    def test_article_image_enhancement(self, enhance=None):
        """Image articles get an image-typed cluster content."""
        self.actrl.update({'id': self.article.id},
                          {'article_type': 'image', 'vector': None})
        self.assertEqual(content_generator.ImageContentGenerator,
                         self.article.content_generator.__class__)
        Clusterizer().main(self.article)
        self.assertEqual('image',
                         self.article.cluster.content.get('type'))
        self.assertEqual(self.article.link,
                         self.article.cluster.content.get('src'))
        # Enhancement is triggered exactly once, and images get no vector.
        self.assertEqual(1, enhance.call_count)
        self.assertIsNone(self.article.vector)

    @patch('jarr.controllers.article.ArticleController.enhance')
    def test_article_embedded_enhancement(self, enhance=None):
        """Embedded (youtube) articles get player/videoId content."""
        self.actrl.update({'id': self.article.id},
                          {'article_type': 'embedded',
                           'link': "https://www.youtube.com/"
                                   "watch?v=scbrjaqM3Oc",
                           "vector": None})
        self.assertEqual(content_generator.EmbeddedContentGenerator,
                         self.article.content_generator.__class__)
        Clusterizer().main(self.article)
        self.assertEqual("embedded",
                         self.article.cluster.content.get('type'))
        self.assertEqual("youtube",
                         self.article.cluster.content.get('player'))
        self.assertEqual("scbrjaqM3Oc",
                         self.article.cluster.content.get('videoId'))
        self.assertEqual(1, enhance.call_count)
        self.assertIsNone(self.article.vector)

    def test_article_image_enhancement_on_truncated(self):
        """Image handling takes precedence over truncated feeds."""
        self.set_truncated_content()
        self.test_article_image_enhancement()

    def test_article_embedded_enhancement_on_truncated(self):
        """Embedded handling takes precedence over truncated feeds."""
        self.set_truncated_content()
        self.test_article_embedded_enhancement()

    @patch('jarr.lib.content_generator.Goose')
    @patch('jarr.lib.content_generator.ContentGenerator._from_goose_to_html')
    def test_article_truncated_enhancement(
            self, from_goose=None, goose=None,
            cg=content_generator.TruncatedContentGenerator):
        """Truncated feeds are fetched through Goose and re-vectorized.

        ``cg`` lets reddit tests reuse this scenario with their own
        generator class.
        """
        from_goose.return_value = 'my collated content'
        patched_goose = Mock(opengraph={'locale': 'en'},
                             meta_lang='fr',
                             final_url='my final url',
                             meta_keywords='Monthy Python, Brian',
                             tags=['The Holy Graal', 'Monthy Python'],
                             title='Flying Circus',
                             cleaned_text="Bring out your dead !")
        goose.return_value.extract.return_value = patched_goose
        self.set_truncated_content()
        self.assertEqual(cg, self.article.content_generator.__class__)
        self.article = self.actrl.get(id=self.article.id)
        Clusterizer().main(self.article)
        # Tags, language and title come from the goosed page; the
        # simple vector is rebuilt from stemmed title/keywords/content.
        self.assertEqual(3, len(self.article.tags))
        self.assertEqual('en', self.article.lang)
        self.assertEqual('Flying Circus', self.article.title)
        self.assertEqual('Flying Circus', self.article.cluster.main_title)
        self.assertEqual({'brian': 1, 'bring': 1, 'circus': 1,
                          'dead': 1, 'fli': 1, 'graal': 1, 'holi': 1,
                          'monthi': 1, 'python': 1},
                         self.article.simple_vector)

    @patch('jarr.lib.content_generator.Goose')
    @patch('jarr.lib.content_generator.ContentGenerator._from_goose_to_html')
    def test_reddit_original_enhancement(self, from_goose, goose):
        """Reddit self-posts (link == comments) are left untouched."""
        self.set_truncated_content(feed_type='reddit')
        self.actrl.update({'id': self.article.id},
                          {'comments': self.article.link})
        self.assertEqual(content_generator.RedditContentGenerator,
                         self.article.content_generator.__class__)
        Clusterizer().main(self.article)
        # No goose fetch and no generated content for self-posts.
        self.assertEqual(0, from_goose.call_count)
        self.assertEqual(0, goose.call_count)
        self.assertEqual({}, self.article.cluster.content)

    @patch('jarr.lib.content_generator.Goose')
    @patch('jarr.lib.content_generator.ContentGenerator._from_goose_to_html')
    def test_reddit_image_link_enhancement(self, from_goose, goose):
        """Reddit image links behave like plain image articles."""
        self.set_truncated_content(feed_type='reddit')
        self.actrl.update({'id': self.article.id},
                          {'article_type': 'image'})
        self.test_article_image_enhancement()
        self.assertEqual(content_generator.ImageContentGenerator,
                         self.article.content_generator.__class__)
        # Image handling bypasses goose entirely.
        self.assertEqual(0, from_goose.call_count)
        self.assertEqual(0, goose.call_count)

    def test_reddit_truncated_enhancement(self):
        """Reddit truncated feeds reuse the generic truncated scenario."""
        self.set_truncated_content(feed_type='reddit')
        self.test_article_truncated_enhancement(
            cg=content_generator.RedditContentGenerator)