Example #1
    def test_fetchable(self):
        fctrl = FeedController()
        total = fctrl.read().count()
        unix = datetime(1970, 1, 1).replace(tzinfo=timezone.utc)
        count = 0
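        # fresh feeds default to epoch timestamps, so every one of them is late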
        for fd in fctrl.list_late():
            count += 1
            self.assertEqual(unix, fd.last_retrieved)
            self.assertEqual(unix, fd.expires)
        self.assertEqual(total, count)

        fetchables = fctrl.list_fetchable()
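        # list_fetchable() also bumps last_retrieved to the current time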
        now = utc_now()
        for fd in fetchables:
            self.assert_in_range(now - timedelta(seconds=1), fd.last_retrieved,
                                 now)
            self.assertEqual(unix, fd.expires)
        self.assert_late_count(
            0, "no late feed to report because all just fetched")
        fctrl.update({}, {'expires': unix})
        now = utc_now()
        for fd in fctrl.read():  # expires should be corrected
            self.assert_in_range(
                now + timedelta(seconds=conf.feed.min_expires - 1), fd.expires,
                now + timedelta(seconds=conf.feed.min_expires + 1))

        lr_not_matter = timedelta(seconds=conf.feed.min_expires + 10)
        self.update_all_no_ctrl(expires=utc_now() - timedelta(seconds=1),
                                last_retrieved=utc_now() - lr_not_matter)
        self.assert_late_count(total, "all feeds just expired")
        self.update_all_no_ctrl(expires=utc_now() + timedelta(seconds=1))
        self.assert_late_count(
            0, "all feeds will expire in a second, none are expired")
Example #2
    def _test_unread_on_cluster(self, read_reason):
        ccontr = ClusterController()
        fcontr = FeedController()
        cluster = ccontr.read().first()
        clusterizer = Clusterizer()
        self.assertFalse(clusterizer.get_config(cluster, 'cluster_enabled'))
        self.assertTrue(clusterizer.get_config(cluster, 'cluster_wake_up'))
        ccontr.update({'id': cluster.id}, {
            'read': True,
            'read_reason': read_reason
        })
        # pick a feed of the same user that doesn't hold the main article
        target_feed = fcontr.read(id__ne=cluster.main_article.feed_id,
                                  user_id=cluster.user_id).first()
        clusterizer = Clusterizer()
        self.assertFalse(clusterizer.get_config(target_feed,
                                                'cluster_enabled'))
        fcontr.update(
            {'id__in': [f.id for f in cluster.feeds] + [target_feed.id]}, {
                'cluster_wake_up': True,
                'cluster_enabled': True
            })
        clusterizer = Clusterizer()
        self.assertTrue(clusterizer.get_config(cluster, 'cluster_enabled'))
        target_feed = fcontr.read(id__ne=cluster.main_article.feed_id,
                                  user_id=cluster.user_id).first()
        # cloning the main article into target_feed creates a pending article
        article = self._clone_article(ArticleController(),
                                      cluster.main_article, target_feed)
        clusterizer = Clusterizer()
        self.assertTrue(clusterizer.get_config(article, 'cluster_wake_up'))
        ClusterController(cluster.user_id).clusterize_pending_articles()
        self.assertEqual(2, len(article.cluster.articles))
        self.assertInCluster(article, cluster)
        return ccontr.get(id=cluster.id)
Example #3
    def test_time(self):
        naive = dateutil.parser.parse('2016-11-17T16:18:02.727802')
        aware = dateutil.parser.parse('2016-11-17T16:18:02.727802+00:00')
        aware2 = dateutil.parser.parse('2016-11-17T16:18:02.727802+12:00')
        fctrl = FeedController()
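        # a naive datetime is assumed to be UTC once written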
        fctrl.update({'id': 1}, {'last_retrieved': naive})
        self.assertEqual(fctrl.read(id=1).first().last_retrieved, aware)
        fctrl.update({'id': 1}, {'last_retrieved': aware})
        self.assertEqual(fctrl.read(id=1).first().last_retrieved, aware)

        fctrl.update({'id': 1}, {'last_retrieved': aware2})
        self.assertEqual(fctrl.read(id=1).first().last_retrieved, aware2)
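        # 16:18:02 at +12:00 is the same instant as aware minus twelve hours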
        self.assertEqual(
            fctrl.read(id=1).first().last_retrieved,
            aware - timedelta(hours=12))
Example #4
def scheduler():
    logger.warning("Running scheduler")
    start = datetime.now()
    fctrl = FeedController()
    # browsing feeds to fetch
    feeds = list(fctrl.list_fetchable(conf.crawler.batch_size))
    WORKER_BATCH.labels(worker_type='fetch-feed').observe(len(feeds))
    logger.info('%d to enqueue', len(feeds))
    for feed in feeds:
        logger.debug("%r: scheduling to be fetched", feed)
        process_feed.apply_async(args=[feed.id])
    # browsing feeds to delete
    feeds_to_delete = list(fctrl.read(status=FeedStatus.to_delete))
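    # SETNX acts as a short-lived lock so concurrent runs don't both delete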
    if feeds_to_delete and REDIS_CONN.setnx(JARR_FEED_DEL_KEY, 'true'):
        REDIS_CONN.expire(JARR_FEED_DEL_KEY, LOCK_EXPIRE)
        logger.info('%d to delete, deleting one', len(feeds_to_delete))
        for feed in feeds_to_delete:
            logger.debug("%r: scheduling to be delete", feed)
            feed_cleaner.apply_async(args=[feed.id])
            break  # only one at a time
    # applying clusterizer
    for user_id in ArticleController.get_user_id_with_pending_articles():
        if not UserController().get(id=user_id).effectivly_active:
            continue
        if REDIS_CONN.setnx(JARR_CLUSTERIZER_KEY % user_id, 'true'):
            REDIS_CONN.expire(JARR_CLUSTERIZER_KEY % user_id,
                              conf.crawler.clusterizer_delay)
            clusterizer.apply_async(args=[user_id])
    scheduler.apply_async(countdown=conf.crawler.idle_delay)
    WORKER.labels(method='scheduler').observe(
        (datetime.now() - start).total_seconds())
    update_slow_metrics.apply_async()
Example #5
    def test_delete_main_cluster_handling(self):
        suffix = 'suffix'
        clu = ClusterController().get(id=10)
        acontr = ArticleController(clu.user_id)
        fcontr = FeedController(clu.user_id)
        old_title = clu.main_title
        old_feed_title, old_art_id = clu.main_feed_title, clu.main_article_id
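        # drop the other articles sharing the main article's link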
        for art_to_del in acontr.read(link=clu.main_article.link,
                                      id__ne=clu.main_article.id):
            acontr.delete(art_to_del.id)

        other_feed = fcontr.read(id__ne=clu.main_article.feed_id).first()
        update_on_all_objs(articles=[clu.main_article],
                           feeds=[other_feed],
                           cluster_enabled=True)
        acontr.create(
            feed_id=other_feed.id,
            entry_id=clu.main_article.entry_id + suffix,
            link=clu.main_article.link,
            title=clu.main_article.title + suffix,
            content=clu.main_article.content + suffix,
            date=clu.main_article.date + timedelta(1),
            retrieved_date=clu.main_article.retrieved_date + timedelta(1),
        )

        ClusterController(clu.user_id).clusterize_pending_articles()
        clu = ClusterController().get(id=10)
        self.assertEqual(2, len(clu.articles))
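        # deleting the main article's feed forces the cluster to elect
        # a new main article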
        fcontr.delete(clu.main_article.feed_id)
        new_cluster = ClusterController(clu.user_id).get(id=clu.id)
        self.assertEqual(1, len(new_cluster.articles))
        self.assertNotEqual(old_title, new_cluster.main_title)
        self.assertNotEqual(old_feed_title, new_cluster.main_feed_title)
        self.assertNotEqual(old_art_id, new_cluster.main_article_id)
Example #6
    def test_adding_to_cluster_by_link(self):
        ccontr = ClusterController()

        cluster = ccontr.read().first()
        ccontr.update({'id': cluster.id}, {
            'read': True,
            'read_reason': 'marked'
        })
        cluster = ccontr.get(id=cluster.id)
        self.assertTrue(cluster.read)
        article = cluster.articles[0]
        articles_count = len(cluster.articles)

        fcontr = FeedController(cluster.user_id)
        acontr = ArticleController(cluster.user_id)
        fcontr.update({'id': article.feed_id}, {'cluster_wake_up': True})
        feed = fcontr.read(id__ne=article.feed_id).first()
        update_on_all_objs(articles=[article],
                           feeds=[feed],
                           cluster_enabled=True)

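        # cloning the article into another feed should wake the read cluster up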
        self._clone_article(acontr, article, feed)
        ccontr.clusterize_pending_articles()

        cluster = ccontr.get(id=cluster.id)
        self.assertEqual(articles_count + 1, len(cluster.articles))
        self.assertFalse(cluster.read)
Example #7
    def test_scheduler(self):
        scheduler()
        UserController().update({}, {'last_connection': utc_now()})
        fctrl = FeedController()

        epoch = datetime(1970, 1, 1, tzinfo=timezone.utc)
        self.assertEqual(fctrl.read().count(),
                         self.process_feed_patch.apply_async.call_count)
        self.assertEqual(0, self.clusteriser_patch.apply_async.call_count)
        self.assertEqual(0, self.feed_cleaner_patch.apply_async.call_count)
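        # flag feed1 and feed3 for deletion, make feed2 immediately fetchable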
        feed1, feed2, feed3 = list(FeedController().read().limit(3))
        FeedController().update({'id__in': [feed1.id, feed3.id]},
                                {'status': 'to_delete'})
        FeedController().update({'id': feed2.id}, {
            'last_retrieved': epoch,
            'expires': epoch
        })
        self.assertEqual(1, len(list(fctrl.list_fetchable())))
        scheduler()
        self.assertEqual(fctrl.read().count(),
                         self.process_feed_patch.apply_async.call_count)
        self.assertEqual(0, self.clusteriser_patch.apply_async.call_count)
        self.assertEqual(1, self.feed_cleaner_patch.apply_async.call_count)
Example #8
    def post():
        opml_file = request.files['opml_file']

        try:
            subscriptions = opml.from_string(opml_file.read())
        except Exception as error:
            raise UnprocessableEntity("Couldn't parse OPML file (%r)" % error)

        ccontr = CategoryController(current_identity.id)
        fcontr = FeedController(current_identity.id)
        counts = {'created': 0, 'existing': 0, 'failed': 0, 'exceptions': []}
        categories = {cat.name: cat.id for cat in ccontr.read()}
        for line in subscriptions:
            try:
                link = line.xmlUrl
            except Exception as error:
                counts['failed'] += 1
                counts['exceptions'].append(str(error))
                continue

            # don't import twice
            if fcontr.read(link=link).count():
                counts['existing'] += 1
                continue

            # handling categories
            cat_id = None
            category = getattr(line, 'category', '').lstrip('/')
            if category:
                if category not in categories:
                    new_category = ccontr.create(name=category)
                    categories[new_category.name] = new_category.id
                cat_id = categories[category]

            fcontr.create(title=getattr(line, 'text', None),
                          category_id=cat_id,
                          description=getattr(line, 'description', None),
                          link=link,
                          site_link=getattr(line, 'htmlUrl', None))
            counts['created'] += 1
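        # 201 when anything was created, 400 when there were only failures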
        code = 200
        if counts.get('created'):
            code = 201
        elif counts.get('failed'):
            code = 400
        return counts, code
Example #9
    def _test_fetching_anti_herding_mech(self, now):
        fctrl = FeedController()
        total = fctrl.read().count()

        half = timedelta(seconds=conf.feed.min_expires / 2)
        twice = timedelta(seconds=conf.feed.min_expires * 2)
        long_ago = timedelta(seconds=conf.feed.max_expires * 2)

        self.update_all_no_ctrl(expires=now + half, last_retrieved=now)
        self.assert_late_count(0, "all have just been retrieved, none expired")
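        # expired but recently retrieved: anti-herding keeps them off the list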
        self.update_all_no_ctrl(expires=now - twice, last_retrieved=now - half)
        self.assert_late_count(0, "have been retrieved not too long ago")

        self.update_all_no_ctrl(expires=now + twice,
                                last_retrieved=now - long_ago)
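        # not expired yet, but retrieved long ago: all are considered late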
        self.assert_late_count(total,
                               "all retrieved some time ago, not expired")
Example #10
    def get():
        """
        Construct a feed from (any) url.

        Returns
        -------
        feed:
            a dictionary with most of what's needed to construct a feed
            plus alternative links found during parsing

        """
        code = 406
        url = url_parser.parse_args()['url']
        feed = FeedBuilderController(url).construct()
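        # only answer 200 when the builder actually found a feed link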
        if feed.get('link'):
            code = 200
            fctrl = FeedController(current_identity.id)
            feed['same_link_count'] = fctrl.read(link=feed.get('link')).count()
        return feed, code
Example #11
File: main.py Project: jaesivsm/JARR
def scheduler():
    logger.warning("Running scheduler")
    start = datetime.now()
    fctrl = FeedController()
    # browsing feeds to fetch
    queue = Queues.CRAWLING if conf.crawler.use_queues else Queues.DEFAULT
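    # route crawling jobs to a dedicated queue when queues are enabled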
    feeds = list(fctrl.list_fetchable(conf.crawler.batch_size))
    WORKER_BATCH.labels(worker_type='fetch-feed').observe(len(feeds))
    logger.info('%d to enqueue', len(feeds))
    for feed in feeds:
        logger.debug("%r: scheduling to be fetched on queue:%r",
                     feed, queue.value)
        process_feed.apply_async(args=[feed.id], queue=queue.value)
    # browsing feeds to delete
    feeds_to_delete = list(fctrl.read(status=FeedStatus.to_delete))
    if feeds_to_delete and REDIS_CONN.setnx(JARR_FEED_DEL_KEY, 'true'):
        REDIS_CONN.expire(JARR_FEED_DEL_KEY, LOCK_EXPIRE)
        logger.info('%d to delete, deleting one', len(feeds_to_delete))
        for feed in feeds_to_delete:
            logger.debug("%r: scheduling to be delete", feed)
            feed_cleaner.apply_async(args=[feed.id])
    # applying clusterizer
    queue = Queues.CLUSTERING if conf.crawler.use_queues else Queues.DEFAULT
    for user_id in ArticleController.get_user_id_with_pending_articles():
        if REDIS_CONN.setnx(JARR_CLUSTERIZER_KEY % user_id, 'true'):
            REDIS_CONN.expire(JARR_CLUSTERIZER_KEY % user_id,
                              conf.crawler.clusterizer_delay)
            logger.debug('Scheduling clusterizer for User(%d) on queue:%r',
                         user_id, queue.value)
            clusterizer.apply_async(args=[user_id], queue=queue.value)
    scheduler.apply_async(countdown=conf.crawler.idle_delay)
    metrics_users_any.apply_async()
    metrics_users_active.apply_async()
    metrics_users_long_term.apply_async()
    metrics_articles_unclustered.apply_async()
    observe_worker_result_since(start, 'scheduler', 'ok')
Example #12
    def _test_create_using_filters(self):
        # FIXME wait redo filters
        feed_ctr = FeedController(USER_ID)
        acontr = ArticleController(USER_ID)
        feed1, feed2, feed3 = [f for f in feed_ctr.read()][0:3]
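        # feed3: favorite anything not matching pattern1/pattern2,
        # and mark anything matching pattern3 as read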
        feed_ctr.update({'id': feed3.id}, {
            'cluster_enabled': True,
            'filters': [{
                "type": "regex",
                "pattern": ".*(pattern1|pattern2).*",
                "action on": "no match",
                "action": "mark as favorite"
            }, {
                "type": "simple match",
                "pattern": "pattern3",
                "action on": "match",
                "action": "mark as read"
            }]
        })
        feed_ctr.update({'id': feed1.id}, {
            'filters': [{
                "type": "simple match",
                "pattern": "pattern3",
                "action on": "match",
                "action": "mark as read"
            }]
        })
        feed_ctr.update({'id': feed2.id}, {
            'filters': [{
                "type": "tag match",
                "pattern": "pattern4",
                "action on": "match",
                "action": "skipped"
            }, {
                "type": "tag contains",
                "pattern": "pattern5",
                "action on": "match",
                "action": "skipped"
            }]
        })

        art1 = acontr.create(entry_id="will be read and faved 1",
                             feed_id=feed1.id,
                             title="garbage pattern1 pattern3 garbage",
                             content="doesn't matter",
                             link="cluster1")

        art2 = acontr.create(entry_id="will be ignored 2",
                             feed_id=feed1.id,
                             title="garbage see pattern garbage",
                             content="doesn't matter2",
                             link="is ignored 2")

        art3 = acontr.create(entry_id="will be read 3",
                             user_id=2,
                             feed_id=feed2.id,
                             title="garbage pattern3 garbage",
                             content="doesn't matter",
                             link="doesn't matter either3")

        art4 = acontr.create(entry_id="will be ignored 4",
                             user_id=2,
                             feed_id=feed2.id,
                             title="garbage see pattern garbage",
                             content="doesn't matter2",
                             link="doesn't matter either4")

        art5 = acontr.create(entry_id="will be faved 5",
                             feed_id=feed3.id,
                             title="garbage anti-attern3 garbage",
                             content="doesn't matter",
                             link="cluster1")
        art6 = acontr.create(entry_id="will be faved 6",
                             feed_id=feed3.id,
                             title="garbage pattern1 garbage",
                             content="doesn't matter2",
                             link="doesn't matter 6")
        art7 = acontr.create(entry_id="will be read 7",
                             feed_id=feed3.id,
                             title="garbage pattern3 garbage",
                             content="doesn't matter3",
                             link="doesn't matter either7")

        art8 = acontr.create(entry_id="will be ignored",
                             feed_id=feed3.id,
                             title="garbage pattern4 garbage",
                             content="doesn't matter4-matter4_matter4",
                             lang='fa_ke',
                             link="doesn't matter either8")

        art9 = acontr.create(entry_id="unique9",
                             feed_id=feed2.id,
                             title="garbage",
                             tags=['garbage', 'pattern4'],
                             content="doesn't matterç",
                             link="doesn't matter either9")

        art10 = acontr.create(entry_id="will be ignored",
                              feed_id=feed2.id,
                              title="garbage",
                              tags=['pattern5 garbage', 'garbage'],
                              content="doesn't matter10",
                              link="doesn't matter either10")

        ClusterController(USER_ID).clusterize_pending_articles()

        self.assertTrue(acontr.get(id=art1.id).cluster.read)
        self.assertFalse(acontr.get(id=art1.id).cluster.liked)
        self.assertFalse(acontr.get(id=art2.id).cluster.read)
        self.assertFalse(acontr.get(id=art2.id).cluster.liked)
        self.assertFalse(acontr.get(id=art3.id).cluster.read)
        self.assertFalse(acontr.get(id=art3.id).cluster.liked)
        self.assertFalse(acontr.get(id=art4.id).cluster.read)
        self.assertFalse(acontr.get(id=art4.id).cluster.liked)
        self.assertTrue(art5.cluster.read,
                        "should be read because it was clustered")
        self.assertTrue(art5.cluster.liked)
        self.assertFalse(art6.cluster.read)
        self.assertFalse(art6.cluster.liked)
        self.assertTrue(art7.cluster.read)
        self.assertTrue(art7.cluster.liked)
        self.assertFalse(art8.cluster.read)
        self.assertTrue(art8.cluster.liked)
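        # the "skipped" tag filters prevented art9 and art10 from being created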
        self.assertIsNone(art9)
        self.assertEqual(0, acontr.read(entry_id='unique9').count())
        self.assertIsNone(art10)
        self.assertEqual(0, acontr.read(entry_id='unique10').count())
Example #13
    def test_delete(self):
        feed_ctrl = FeedController()
        for feed in feed_ctrl.read():
            feed_ctrl.delete(feed.id)
        # deleting every feed cascades to clusters and articles
        self.assertEqual(0, ClusterController(2).read().count())
        self.assertEqual(0, ArticleController(2).read().count())
Example #14
class OPMLTest(JarrFlaskCommon):
    def setUp(self):
        super().setUp()
        login = '******'
        self.user = UserController().get(login=login)
        self.user2 = UserController().get(login='******')
        self.fctrl = FeedController(self.user.id)
        self.cctrl = CategoryController(self.user.id)
        self.uctrl = UserController()

    def test_opml_dump_and_restore(self):
        # downloading OPML export file
        resp = self.jarr_client('get', '/opml', user=self.user.login)
        self.assertStatusCode(200, resp)
        opml_dump = resp.data.decode()
        self.assertTrue(
            opml_dump.startswith('<?xml version="1.0" encoding="utf-8"'))
        self.assertTrue(opml_dump.endswith('</opml>'))
        # cleaning db
        actrl = ArticleController(self.user.id)
        for item in actrl.read():
            actrl.delete(item.id)
        self.assertEqual(0, ClusterController(self.user.id).read().count())
        self.assertEqual(0, ArticleController(self.user.id).read().count())
        no_category_feed = []
        existing_feeds = {}
        for feed in self.fctrl.read():
            if feed.category:
                if feed.category.name in existing_feeds:
                    existing_feeds[feed.category.name].append(feed.title)
                else:
                    existing_feeds[feed.category.name] = [feed.title]
            else:
                no_category_feed.append(feed.title)

            self.fctrl.delete(feed.id)
        for category in self.cctrl.read():
            self.cctrl.delete(category.id)
        # re-importing OPML
        import_resp = self.jarr_client(
            'post',
            'opml',
            to_json=False,
            data={'opml_file': (BytesIO(resp.data), 'opml.xml')},
            headers=None,
            user=self.user.login)
        self.assertStatusCode(201, import_resp)
        self.assertEqual(0, import_resp.json['existing'])
        self.assertEqual(0, import_resp.json['failed'])
        self._check_opml_imported(existing_feeds, no_category_feed)

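        # importing the same OPML a second time must not create anything new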
        import_resp = self.jarr_client(
            'post',
            'opml',
            to_json=False,
            data={'opml_file': (BytesIO(resp.data), 'opml.xml')},
            headers=None,
            user=self.user.login)
        self.assertStatusCode(200, import_resp)
        self.assertEqual(0, import_resp.json['created'])
        self.assertEqual(0, import_resp.json['failed'])

    def _check_opml_imported(self, existing_feeds, no_category_feed):
        self.assertEqual(
            sum(map(len, existing_feeds.values())) + len(no_category_feed),
            self.fctrl.read().count())
        self.assertEqual(len(existing_feeds), self.cctrl.read().count())
        for feed in self.fctrl.read():
            if feed.category:
                self.assertIn(feed.category.name, existing_feeds)
                self.assertIn(feed.title, existing_feeds[feed.category.name])
            else:
                self.assertIn(feed.title, no_category_feed)