def test_fetchable(self):
    fctrl = FeedController()
    total = fctrl.read().count()
    unix = datetime(1970, 1, 1).replace(tzinfo=timezone.utc)
    count = 0
    for fd in fctrl.list_late():
        count += 1
        self.assertEqual(unix, fd.last_retrieved)
        self.assertEqual(unix, fd.expires)
    self.assertEqual(total, count)
    fetchables = fctrl.list_fetchable()
    now = utc_now()
    for fd in fetchables:
        self.assert_in_range(now - timedelta(seconds=1),
                             fd.last_retrieved, now)
        self.assertEqual(unix, fd.expires)
    self.assert_late_count(
        0, "no late feed to report because all were just fetched")
    fctrl.update({}, {'expires': unix})
    now = utc_now()
    for fd in fctrl.read():
        # expires should be corrected back into the allowed window
        self.assert_in_range(
            now + timedelta(seconds=conf.feed.min_expires - 1),
            fd.expires,
            now + timedelta(seconds=conf.feed.min_expires + 1))
    lr_not_matter = timedelta(seconds=conf.feed.min_expires + 10)
    self.update_all_no_ctrl(expires=utc_now() - timedelta(seconds=1),
                            last_retrieved=utc_now() - lr_not_matter)
    self.assert_late_count(total, "all feeds just expired")
    self.update_all_no_ctrl(expires=utc_now() + timedelta(seconds=1))
    self.assert_late_count(
        0, "all feeds will expire in a second, none are expired yet")
def _test_unread_on_cluster(self, read_reason):
    ccontr = ClusterController()
    fcontr = FeedController()
    cluster = ccontr.read().first()
    clusterizer = Clusterizer()
    self.assertFalse(clusterizer.get_config(cluster, 'cluster_enabled'))
    self.assertTrue(clusterizer.get_config(cluster, 'cluster_wake_up'))
    ccontr.update({'id': cluster.id},
                  {'read': True, 'read_reason': read_reason})
    target_feed = fcontr.read(id__ne=cluster.main_article.feed_id,
                              user_id=cluster.user_id).first()
    clusterizer = Clusterizer()
    self.assertFalse(clusterizer.get_config(target_feed, 'cluster_enabled'))
    fcontr.update({'id__in': [f.id for f in cluster.feeds]
                             + [target_feed.id]},
                  {'cluster_wake_up': True, 'cluster_enabled': True})
    clusterizer = Clusterizer()
    self.assertTrue(clusterizer.get_config(cluster, 'cluster_enabled'))
    target_feed = fcontr.read(id__ne=cluster.main_article.feed_id,
                              user_id=cluster.user_id).first()
    article = self._clone_article(ArticleController(),
                                  cluster.main_article, target_feed)
    clusterizer = Clusterizer()
    self.assertTrue(clusterizer.get_config(article, 'cluster_wake_up'))
    ClusterController(cluster.user_id).clusterize_pending_articles()
    self.assertEqual(2, len(article.cluster.articles))
    self.assertInCluster(article, cluster)
    return ccontr.get(id=cluster.id)
def test_time(self):
    naive = dateutil.parser.parse('2016-11-17T16:18:02.727802')
    aware = dateutil.parser.parse('2016-11-17T16:18:02.727802+00:00')
    aware2 = dateutil.parser.parse('2016-11-17T16:18:02.727802+12:00')
    fctrl = FeedController()
    fctrl.update({'id': 1}, {'last_retrieved': naive})
    self.assertEqual(fctrl.read(id=1).first().last_retrieved, aware)
    fctrl.update({'id': 1}, {'last_retrieved': aware})
    self.assertEqual(fctrl.read(id=1).first().last_retrieved, aware)
    fctrl.update({'id': 1}, {'last_retrieved': aware2})
    self.assertEqual(fctrl.read(id=1).first().last_retrieved, aware2)
    self.assertEqual(fctrl.read(id=1).first().last_retrieved,
                     aware - timedelta(hours=12))
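# A minimal sketch of the normalization behaviour the test above pins down:
# naive datetimes are treated as UTC, aware ones are kept as-is (so the
# +12:00 value compares equal to the same instant expressed in UTC). The
# helper name `coerce_utc` is hypothetical, not part of the codebase.
from datetime import datetime, timezone


def coerce_utc(value: datetime) -> datetime:
    """Attach UTC to naive datetimes; leave aware datetimes untouched."""
    if value.tzinfo is None:
        return value.replace(tzinfo=timezone.utc)
    return value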
def scheduler():
    logger.warning("Running scheduler")
    start = datetime.now()
    fctrl = FeedController()
    # browsing feeds to fetch
    feeds = list(fctrl.list_fetchable(conf.crawler.batch_size))
    WORKER_BATCH.labels(worker_type='fetch-feed').observe(len(feeds))
    logger.info('%d to enqueue', len(feeds))
    for feed in feeds:
        logger.debug("%r: scheduling to be fetched", feed)
        process_feed.apply_async(args=[feed.id])
    # browsing feeds to delete
    feeds_to_delete = list(fctrl.read(status=FeedStatus.to_delete))
    if feeds_to_delete and REDIS_CONN.setnx(JARR_FEED_DEL_KEY, 'true'):
        REDIS_CONN.expire(JARR_FEED_DEL_KEY, LOCK_EXPIRE)
        logger.info('%d to delete, deleting one', len(feeds_to_delete))
        for feed in feeds_to_delete:
            logger.debug("%r: scheduling to be deleted", feed)
            feed_cleaner.apply_async(args=[feed.id])
            break  # only one at a time
    # applying clusterizer
    for user_id in ArticleController.get_user_id_with_pending_articles():
        if not UserController().get(id=user_id).effectivly_active:
            continue
        if REDIS_CONN.setnx(JARR_CLUSTERIZER_KEY % user_id, 'true'):
            REDIS_CONN.expire(JARR_CLUSTERIZER_KEY % user_id,
                              conf.crawler.clusterizer_delay)
            clusterizer.apply_async(args=[user_id])
    scheduler.apply_async(countdown=conf.crawler.idle_delay)
    WORKER.labels(method='scheduler').observe(
        (datetime.now() - start).total_seconds())
    update_slow_metrics.apply_async()
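# The setnx/expire pair above is a best-effort, non-blocking Redis lock:
# only the scheduler run that wins SETNX enqueues the job, and the key
# expires on its own so a crashed worker cannot hold the lock forever.
# A minimal standalone sketch of the pattern (key name and TTL below are
# illustrative, not taken from the project configuration):
import redis

conn = redis.Redis()


def acquire_once(key: str, ttl_seconds: int) -> bool:
    """Return True only for the single caller that wins the lock."""
    if conn.setnx(key, 'true'):        # atomic "set if not exists"
        conn.expire(key, ttl_seconds)  # make the lock self-releasing
        return True
    return False

# Note: a crash between setnx and expire would leave the key without a TTL;
# conn.set(key, 'true', nx=True, ex=ttl_seconds) performs both steps in one
# atomic command and is the usual hardening of this pattern.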
def test_delete_main_cluster_handling(self):
    suffix = 'suffix'
    clu = ClusterController().get(id=10)
    acontr = ArticleController(clu.user_id)
    fcontr = FeedController(clu.user_id)
    old_title = clu.main_title
    old_feed_title, old_art_id = clu.main_feed_title, clu.main_article_id
    for art_to_del in acontr.read(link=clu.main_article.link,
                                  id__ne=clu.main_article.id):
        acontr.delete(art_to_del.id)
    other_feed = fcontr.read(id__ne=clu.main_article.feed_id).first()
    update_on_all_objs(articles=[clu.main_article], feeds=[other_feed],
                       cluster_enabled=True)
    acontr.create(
        feed_id=other_feed.id,
        entry_id=clu.main_article.entry_id + suffix,
        link=clu.main_article.link,
        title=clu.main_article.title + suffix,
        content=clu.main_article.content + suffix,
        date=clu.main_article.date + timedelta(1),
        retrieved_date=clu.main_article.retrieved_date + timedelta(1),
    )
    ClusterController(clu.user_id).clusterize_pending_articles()
    clu = ClusterController().get(id=10)
    self.assertEqual(2, len(clu.articles))
    fcontr.delete(clu.main_article.feed_id)
    new_cluster = ClusterController(clu.user_id).get(id=clu.id)
    self.assertEqual(1, len(new_cluster.articles))
    self.assertNotEqual(old_title, new_cluster.main_title)
    self.assertNotEqual(old_feed_title, new_cluster.main_feed_title)
    self.assertNotEqual(old_art_id, new_cluster.main_article_id)
def test_adding_to_cluster_by_link(self):
    ccontr = ClusterController()
    cluster = ccontr.read().first()
    ccontr.update({'id': cluster.id},
                  {'read': True, 'read_reason': 'marked'})
    cluster = ccontr.get(id=cluster.id)
    self.assertTrue(cluster.read)
    article = cluster.articles[0]
    articles_count = len(cluster.articles)
    fcontr = FeedController(cluster.user_id)
    acontr = ArticleController(cluster.user_id)
    fcontr.update({'id': article.feed_id}, {'cluster_wake_up': True})
    feed = fcontr.read(id__ne=article.feed_id).first()
    update_on_all_objs(articles=[article], feeds=[feed],
                       cluster_enabled=True)
    self._clone_article(acontr, article, feed)
    ccontr.clusterize_pending_articles()
    cluster = ccontr.get(id=cluster.id)
    self.assertEqual(articles_count + 1, len(cluster.articles))
    self.assertFalse(cluster.read)
def test_scheduler(self):
    scheduler()
    UserController().update({}, {'last_connection': utc_now()})
    fctrl = FeedController()
    epoch = datetime(1970, 1, 1, tzinfo=timezone.utc)
    self.assertEqual(fctrl.read().count(),
                     self.process_feed_patch.apply_async.call_count)
    self.assertEqual(0, self.clusteriser_patch.apply_async.call_count)
    self.assertEqual(0, self.feed_cleaner_patch.apply_async.call_count)
    feed1, feed2, feed3 = list(FeedController().read().limit(3))
    FeedController().update({'id__in': [feed1.id, feed3.id]},
                            {'status': 'to_delete'})
    FeedController().update({'id': feed2.id},
                            {'last_retrieved': epoch, 'expires': epoch})
    self.assertEqual(1, len(list(fctrl.list_fetchable())))
    scheduler()
    self.assertEqual(fctrl.read().count(),
                     self.process_feed_patch.apply_async.call_count)
    self.assertEqual(0, self.clusteriser_patch.apply_async.call_count)
    self.assertEqual(1, self.feed_cleaner_patch.apply_async.call_count)
def post():
    opml_file = request.files['opml_file']
    try:
        subscriptions = opml.from_string(opml_file.read())
    except Exception as error:
        raise UnprocessableEntity("Couldn't parse OPML file (%r)" % error)
    ccontr = CategoryController(current_identity.id)
    fcontr = FeedController(current_identity.id)
    counts = {'created': 0, 'existing': 0, 'failed': 0, 'exceptions': []}
    categories = {cat.name: cat.id for cat in ccontr.read()}
    for line in subscriptions:
        try:
            link = line.xmlUrl
        except Exception as error:
            counts['failed'] += 1
            counts['exceptions'].append(str(error))
            continue
        # don't import twice
        if fcontr.read(link=link).count():
            counts['existing'] += 1
            continue
        # handling categories
        cat_id = None
        category = getattr(line, 'category', '').lstrip('/')
        if category:
            if category not in categories:
                new_category = ccontr.create(name=category)
                categories[new_category.name] = new_category.id
            cat_id = categories[category]
        fcontr.create(title=getattr(line, 'text', None),
                      category_id=cat_id,
                      description=getattr(line, 'description', None),
                      link=link,
                      site_link=getattr(line, 'htmlUrl', None))
        counts['created'] += 1
    code = 200
    if counts.get('created'):
        code = 201
    elif counts.get('failed'):
        code = 400
    return counts, code
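# A hedged usage sketch for the endpoint above: upload an OPML file as
# multipart form data under the 'opml_file' field. The base URL and the
# auth header are assumptions; adapt them to the actual deployment.
import requests

with open('subscriptions.opml', 'rb') as opml_fd:
    resp = requests.post(
        'http://localhost:8000/opml',                 # assumed base URL
        headers={'Authorization': 'Bearer <token>'},  # assumed auth scheme
        files={'opml_file': ('subscriptions.opml', opml_fd)})
# 201 => at least one feed was created, 200 => everything already existed,
# 400 => some outlines could not be parsed (see 'exceptions' in the body)
print(resp.status_code, resp.json())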
def _test_fetching_anti_herding_mech(self, now):
    fctrl = FeedController()
    total = fctrl.read().count()
    half = timedelta(seconds=conf.feed.min_expires / 2)
    twice = timedelta(seconds=conf.feed.min_expires * 2)
    long_ago = timedelta(seconds=conf.feed.max_expires * 2)

    self.update_all_no_ctrl(expires=now + half, last_retrieved=now)
    self.assert_late_count(0, "all just retrieved, none expired")

    self.update_all_no_ctrl(expires=now - twice, last_retrieved=now - half)
    self.assert_late_count(
        0, "all expired, but retrieved too recently to be late")

    self.update_all_no_ctrl(expires=now + twice,
                            last_retrieved=now - long_ago)
    self.assert_late_count(
        total, "none expired, but all retrieved long enough ago to be late")
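# One plausible reading of the "late" predicate these assertions pin down
# (a sketch inferred from the test cases, not the controller's actual
# query): a feed is refetched when it is expired AND was not retrieved
# within the last min_expires seconds, or unconditionally once its
# last_retrieved is older than max_expires.
from datetime import datetime, timedelta


def is_late(expires: datetime, last_retrieved: datetime, now: datetime,
            min_expires: int, max_expires: int) -> bool:
    recently_fetched = last_retrieved > now - timedelta(seconds=min_expires)
    too_old = last_retrieved < now - timedelta(seconds=max_expires)
    return too_old or (expires <= now and not recently_fetched)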
def get():
    """Construct a feed from (any) url.

    Returns
    -------
    feed:
        a dictionary with most of what's needed to construct a feed,
        plus alternative links found during parsing
    """
    code = 406
    url = url_parser.parse_args()['url']
    feed = FeedBuilderController(url).construct()
    if feed.get('link'):
        code = 200
        fctrl = FeedController(current_identity.id)
        feed['same_link_count'] = fctrl.read(link=feed.get('link')).count()
    return feed, code
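# A hedged usage sketch: ask the endpoint above to build a feed from an
# arbitrary URL. The endpoint path, base URL and auth header are
# assumptions, not taken from the project's routing.
import requests

resp = requests.get(
    'http://localhost:8000/feed/build',           # assumed path
    headers={'Authorization': 'Bearer <token>'},  # assumed auth scheme
    params={'url': 'https://example.com/blog'})
# 200 => a feed was built ('same_link_count' tells whether the user
# already subscribes to it), 406 => no feed link could be derived
print(resp.status_code, resp.json())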
def scheduler():
    logger.warning("Running scheduler")
    start = datetime.now()
    fctrl = FeedController()
    # browsing feeds to fetch
    queue = Queues.CRAWLING if conf.crawler.use_queues else Queues.DEFAULT
    feeds = list(fctrl.list_fetchable(conf.crawler.batch_size))
    WORKER_BATCH.labels(worker_type='fetch-feed').observe(len(feeds))
    logger.info('%d to enqueue', len(feeds))
    for feed in feeds:
        logger.debug("%r: scheduling to be fetched on queue:%r",
                     feed, queue.value)
        process_feed.apply_async(args=[feed.id], queue=queue.value)
    # browsing feeds to delete
    feeds_to_delete = list(fctrl.read(status=FeedStatus.to_delete))
    if feeds_to_delete and REDIS_CONN.setnx(JARR_FEED_DEL_KEY, 'true'):
        REDIS_CONN.expire(JARR_FEED_DEL_KEY, LOCK_EXPIRE)
        logger.info('%d to delete, deleting one', len(feeds_to_delete))
        for feed in feeds_to_delete:
            logger.debug("%r: scheduling to be deleted", feed)
            feed_cleaner.apply_async(args=[feed.id])
    # applying clusterizer
    queue = Queues.CLUSTERING if conf.crawler.use_queues else Queues.DEFAULT
    for user_id in ArticleController.get_user_id_with_pending_articles():
        if REDIS_CONN.setnx(JARR_CLUSTERIZER_KEY % user_id, 'true'):
            REDIS_CONN.expire(JARR_CLUSTERIZER_KEY % user_id,
                              conf.crawler.clusterizer_delay)
            logger.debug('Scheduling clusterizer for User(%d) on queue:%r',
                         user_id, queue.value)
            clusterizer.apply_async(args=[user_id], queue=queue.value)
    scheduler.apply_async(countdown=conf.crawler.idle_delay)
    metrics_users_any.apply_async()
    metrics_users_active.apply_async()
    metrics_users_long_term.apply_async()
    metrics_articles_unclustered.apply_async()
    observe_worker_result_since(start, 'scheduler', 'ok')
def _test_create_using_filters(self):
    # FIXME: awaiting the filters rework
    feed_ctr = FeedController(USER_ID)
    acontr = ArticleController(USER_ID)
    feed1, feed2, feed3 = list(feed_ctr.read())[:3]
    feed_ctr.update({'id': feed3.id},
                    {'cluster_enabled': True,
                     'filters': [{"type": "regex",
                                  "pattern": ".*(pattern1|pattern2).*",
                                  "action on": "no match",
                                  "action": "mark as favorite"},
                                 {"type": "simple match",
                                  "pattern": "pattern3",
                                  "action on": "match",
                                  "action": "mark as read"}]})
    feed_ctr.update({'id': feed1.id},
                    {'filters': [{"type": "simple match",
                                  "pattern": "pattern3",
                                  "action on": "match",
                                  "action": "mark as read"}]})
    feed_ctr.update({'id': feed2.id},
                    {'filters': [{"type": "tag match",
                                  "pattern": "pattern4",
                                  "action on": "match",
                                  "action": "skipped"},
                                 {"type": "tag contains",
                                  "pattern": "pattern5",
                                  "action on": "match",
                                  "action": "skipped"}]})
    art1 = acontr.create(entry_id="will be read and faved 1",
                         feed_id=feed1.id,
                         title="garbage pattern1 pattern3 garbage",
                         content="doesn't matter",
                         link="cluster1")
    art2 = acontr.create(entry_id="will be ignored 2",
                         feed_id=feed1.id,
                         title="garbage see pattern garbage",
                         content="doesn't matter2",
                         link="is ignored 2")
    art3 = acontr.create(entry_id="will be read 3", user_id=2,
                         feed_id=feed2.id,
                         title="garbage pattern3 garbage",
                         content="doesn't matter",
                         link="doesn't matter either3")
    art4 = acontr.create(entry_id="will be ignored 4", user_id=2,
                         feed_id=feed2.id,
                         title="garbage see pattern garbage",
                         content="doesn't matter2",
                         link="doesn't matter either4")
    art5 = acontr.create(entry_id="will be faved 5",
                         feed_id=feed3.id,
                         title="garbage anti-attern3 garbage",
                         content="doesn't matter",
                         link="cluster1")
    art6 = acontr.create(entry_id="will be faved 6",
                         feed_id=feed3.id,
                         title="garbage pattern1 garbage",
                         content="doesn't matter2",
                         link="doesn't matter 6")
    art7 = acontr.create(entry_id="will be read 7",
                         feed_id=feed3.id,
                         title="garbage pattern3 garbage",
                         content="doesn't matter3",
                         link="doesn't matter either7")
    art8 = acontr.create(entry_id="will be ignored",
                         feed_id=feed3.id,
                         title="garbage pattern4 garbage",
                         content="doesn't matter4-matter4_matter4",
                         lang='fa_ke',
                         link="doesn't matter either8")
    art9 = acontr.create(entry_id="unique9",
                         feed_id=feed2.id,
                         title="garbage",
                         tags=['garbage', 'pattern4'],
                         content="doesn't matterç",
                         link="doesn't matter either9")
    art10 = acontr.create(entry_id="will be ignored",
                          feed_id=feed2.id,
                          title="garbage",
                          tags=['pattern5 garbage', 'garbage'],
                          content="doesn't matter10",
                          link="doesn't matter either10")
    ClusterController(USER_ID).clusterize_pending_articles()
    self.assertTrue(acontr.get(id=art1.id).cluster.read)
    self.assertFalse(acontr.get(id=art1.id).cluster.liked)
    self.assertFalse(acontr.get(id=art2.id).cluster.read)
    self.assertFalse(acontr.get(id=art2.id).cluster.liked)
    self.assertFalse(acontr.get(id=art3.id).cluster.read)
    self.assertFalse(acontr.get(id=art3.id).cluster.liked)
    self.assertFalse(acontr.get(id=art4.id).cluster.read)
    self.assertFalse(acontr.get(id=art4.id).cluster.liked)
    self.assertTrue(art5.cluster.read,
                    "should be read because it clustered with art1")
    self.assertTrue(art5.cluster.liked)
    self.assertFalse(art6.cluster.read)
    self.assertFalse(art6.cluster.liked)
    self.assertTrue(art7.cluster.read)
    self.assertTrue(art7.cluster.liked)
    self.assertFalse(art8.cluster.read)
    self.assertTrue(art8.cluster.liked)
    self.assertIsNone(art9)
    self.assertEqual(0, acontr.read(entry_id='unique9').count())
    self.assertIsNone(art10)
    self.assertEqual(0, acontr.read(entry_id='unique10').count())
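# The filter schema exercised above, annotated (a reading of the test
# data, not a formal spec): each feed carries a list of filter dicts.
example_filter = {
    "type": "simple match",    # also seen: "regex", "tag match", "tag contains"
    "pattern": "pattern3",     # substring, regex or tag to look for
    "action on": "match",      # apply the action on "match" or "no match"
    "action": "mark as read",  # also seen: "mark as favorite", "skipped"
}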
def test_delete(self):
    feed_ctrl = FeedController()
    for feed in feed_ctrl.read():
        feed_ctrl.delete(feed.id)
    self.assertEqual(0, ClusterController(2).read().count())
    self.assertEqual(0, ArticleController(2).read().count())
class OPMLTest(JarrFlaskCommon):

    def setUp(self):
        super().setUp()
        login = '******'
        self.user = UserController().get(login=login)
        self.user2 = UserController().get(login='******')
        self.fctrl = FeedController(self.user.id)
        self.cctrl = CategoryController(self.user.id)
        self.uctrl = UserController()

    def test_opml_dump_and_restore(self):
        # downloading OPML export file
        resp = self.jarr_client('get', '/opml', user=self.user.login)
        self.assertStatusCode(200, resp)
        opml_dump = resp.data.decode()
        self.assertTrue(
            opml_dump.startswith('<?xml version="1.0" encoding="utf-8"'))
        self.assertTrue(opml_dump.endswith('</opml>'))

        # cleaning db
        actrl = ArticleController(self.user.id)
        for item in actrl.read():
            actrl.delete(item.id)
        self.assertEqual(0, ClusterController(self.user.id).read().count())
        self.assertEqual(0, ArticleController(self.user.id).read().count())
        no_category_feed = []
        existing_feeds = {}
        for feed in self.fctrl.read():
            if feed.category:
                if feed.category.name in existing_feeds:
                    existing_feeds[feed.category.name].append(feed.title)
                else:
                    existing_feeds[feed.category.name] = [feed.title]
            else:
                no_category_feed.append(feed.title)
            self.fctrl.delete(feed.id)
        for category in self.cctrl.read():
            self.cctrl.delete(category.id)

        # re-importing OPML
        import_resp = self.jarr_client(
            'post', 'opml', to_json=False,
            data={'opml_file': (BytesIO(resp.data), 'opml.xml')},
            headers=None, user=self.user.login)
        self.assertStatusCode(201, import_resp)
        self.assertEqual(0, import_resp.json['existing'])
        self.assertEqual(0, import_resp.json['failed'])
        self._check_opml_imported(existing_feeds, no_category_feed)

        # re-importing a second time: everything should already exist
        import_resp = self.jarr_client(
            'post', 'opml', to_json=False,
            data={'opml_file': (BytesIO(resp.data), 'opml.xml')},
            headers=None, user=self.user.login)
        self.assertStatusCode(200, import_resp)
        self.assertEqual(0, import_resp.json['created'])
        self.assertEqual(0, import_resp.json['failed'])

    def _check_opml_imported(self, existing_feeds, no_category_feed):
        self.assertEqual(
            sum(map(len, existing_feeds.values())) + len(no_category_feed),
            self.fctrl.read().count())
        self.assertEqual(len(existing_feeds), self.cctrl.read().count())
        for feed in self.fctrl.read():
            if feed.category:
                self.assertIn(feed.category.name, existing_feeds)
                self.assertIn(feed.title,
                              existing_feeds[feed.category.name])
            else:
                self.assertIn(feed.title, no_category_feed)