Example #1
 def test_parsing_valid_feeds(self, feed):
     note(feed.feed)
     note(feed.items)
     with patch(
             "urllib.request.OpenerDirector.open",
             new=partial(self.patch_open, feed),
     ):
         contents, errors = feedergrabber(feed.feed["link"])
         if contents is None:
             note(errors)
             self.assertEqual(0, len(feed.items))
             self.assertEqual(1, len(errors))
             self.assertIn("Parsing methods not successful", errors[0])
         else:
             for i, (link, title, date, content) in enumerate(contents):
                 item = feed.items[i]
                 self.assertEqual(link, item["link"])
                 item_date = item.get("pubdate", item.get("updateddate"))
                 note(item_date)
                 note(date)
                 self.assertIsNotNone(date)
                 self.assertGreaterEqual(
                     datetime.datetime.now().utctimetuple(),
                     date.utctimetuple(),
                 )
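All of these snippets consume the same contract, spelled out in the crawler comments below: feedergrabber(url) returns a (contents, errors) pair, where contents is either None or a list of per-item tuples (a 4-tuple including content in the newer snippets, a 3-tuple without it in the older ones). A minimal usage sketch of that contract, with a placeholder URL that does not come from any of these projects:

    contents, errors = feedergrabber("http://example.com/feed.xml")
    if contents is None:
        # Nothing could be parsed; errors explains why.
        for message in errors:
            print(message)
    else:
        # Each entry corresponds to one feed item.
        for link, title, date, content in contents:
            print(link, title, date)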
Example #2
 def test_parsing_feeds_with_min_dates(self):
     with patch("urllib2.OpenerDirector.open", new=self.min_date_feed):
         contents, errors = feedergrabber("http://max.computer/index.html")
         self.assertIsNone(contents)
         self.assertEqual(2, len(errors))
         self.assertIn("Parsing methods not successful", errors[-1])
         self.assertIn("hugo page", errors[0])
Example #3
    def crawlblog(self, blog):
        # Feedergrabber returns ( [(link, title, date, content)], [errors])
        crawled, errors = feedergrabber27.feedergrabber(blog.feed_url)
        if not crawled:
            log.debug('\n'.join(errors))
            return

        log.debug('Crawled %s posts from %s', len(crawled), blog.feed_url)
        if errors:
            log.debug('\n'.join(errors))
        blog.last_crawled = timezone.now()
        blog.save(update_fields=['last_crawled'])
        created_count = 0
        for link, title, date, content in crawled:
            date = timezone.make_aware(date, timezone.get_default_timezone())
            # create the post instance if it doesn't already exist
            post, created = get_or_create_post(
                blog, title, link, date, content
            )
            if created:
                created_count += 1
                log.debug("Created '%s' from blog '%s'", title, blog.feed_url)
                # Throttle the amount of new posts that can be announced per
                # user per crawl.
                if created_count <= settings.MAX_POST_ANNOUNCE:
                    self.zulip_queue.append(post)
            else:
                update_post(post, title, link, content)
Example #4
    def crawlblog(self, blog):
        # Feedergrabber returns ( [(link, title, date, content)], [errors])
        print(f"Crawling {blog.feed_url} ...")
        crawled, errors = feedergrabber27.feedergrabber(blog.feed_url)
        if not crawled:
            log.debug("\n".join(errors))
            return

        log.debug("Crawled %s posts from %s", len(crawled), blog.feed_url)
        if errors:
            log.debug("\n".join(errors))
        blog.last_crawled = timezone.now()
        blog.save(update_fields=["last_crawled"])
        created_count = 0
        for link, title, date, content in crawled:
            date = timezone.make_aware(date, timezone.get_default_timezone())
            # create the post instance if it doesn't already exist
            post, created = get_or_create_post(blog, title, link, date,
                                               content)
            if created:
                created_count += 1
                log.debug("Created '%s' from blog '%s'", title, blog.feed_url)
                # Throttle the amount of new posts that can be announced per
                # user per crawl.
                if created_count <= settings.MAX_POST_ANNOUNCE:
                    self.zulip_queue.append(post)
            else:
                update_post(post, title, link, content)
Example #5
    def crawlblog(self, blog):

        # Feedergrabber returns ( [(link, title, date)], [errors])
        # We're ignoring the errors returned for right now
        crawled, errors = feedergrabber27.feedergrabber(blog.feed_url)

        if crawled:

            for link, title, date in crawled:

                date = timezone.make_aware(date, timezone.get_default_timezone())
                now = timezone.make_aware(datetime.datetime.now(), timezone.get_default_timezone())

                title = cleantitle(title)

                # create the post instance if it doesn't already exist
                post, created = Post.objects.get_or_create(
                    blog = blog,
                    url = link,
                    defaults = {
                        'title': title,
                        'date_updated': date,
                    }
                )

                if created:
                    print "Created '%s' from blog '%s'" % (title, blog.feed_url)
                    # Only post to zulip if the post was created recently
                    #   so that new accounts don't spam zulip with their entire post list
                    if (now - date) < max_zulip_age:
                        post_page = ROOT_URL + 'post/' + Post.objects.get(url=link).slug
                        send_message_zulip(user=blog.user, link=post_page, title=title)
                        
                    # subscribe the author to comment updates
                    # subscription, created = Comment_Subscription.objects.get_or_create(
                    #     user = blog.user,
                    #     post = post,
                    # )

                # if new info, update the posts
                if not created:
                    # print ".",
                    updated = False
                    if date != post.date_updated:
                        post.date_updated = date
                        updated = True
                    if title != post.title:
                        post.title = title
                        updated = True
                    if updated:
                        print "Updated %s in %s." % (title, blog.feed_url)
                        post.save()

        else:
            log.debug(str(errors))
Example #6
    def crawlblog(self, blog):

        print "** CRAWLING", blog.feed_url

        # Feedergrabber returns ( [(link, title, date)], [errors])
        # We're ignoring the errors returned for right now
        crawled, errors = feedergrabber27.feedergrabber(blog.feed_url)

        if crawled:

            for link, title, date in crawled:

                date = timezone.make_aware(date,
                                           timezone.get_default_timezone())
                now = timezone.make_aware(datetime.datetime.now(),
                                          timezone.get_default_timezone())

                title = cleantitle(title)

                # create the post instance if it doesn't already exist
                post, created = Post.objects.get_or_create(blog=blog,
                                                           url=link,
                                                           defaults={
                                                               'title':
                                                               title,
                                                               'date_updated':
                                                               date,
                                                           })

                if created:
                    print "Created", title
                    # Only post to humbug if the post was created in the last 2 days
                    #   so that new accounts don't spam humbug with their entire post list
                    if (now - date) < datetime.timedelta(days=2):
                        send_message_humbug(user=blog.user,
                                            link=link,
                                            title=title)

                # if new info, update the posts
                if not created:
                    print "Retrieved", title
                    updated = False
                    if date != post.date_updated:
                        post.date_updated = date
                        updated = True
                    if title != post.title:
                        post.title = title
                        updated = True
                    if updated:
                        print "Updated", title
                        post.save()

        else:
            log.debug(str(errors))
Example #7
    def crawlblog(self, blog):

        # Feedergrabber returns ( [(link, title, date)], [errors])
        # We're ignoring the errors returned for right now
        crawled, errors = feedergrabber27.feedergrabber(blog.feed_url)

        if crawled:

            for link, title, date in crawled:

                date = timezone.make_aware(date,
                                           timezone.get_default_timezone())
                now = timezone.make_aware(datetime.datetime.now(),
                                          timezone.get_default_timezone())

                title = cleantitle(title)

                # create the post instance if it doesn't already exist
                post, created = Post.objects.get_or_create(blog=blog,
                                                           url=link,
                                                           defaults={
                                                               'title':
                                                               title,
                                                               'date_updated':
                                                               date,
                                                           })

                if created:
                    print "Created '%s' from blog '%s'" % (title,
                                                           blog.feed_url)
                    # Only post to zulip if the post was created recently
                    #   so that new accounts don't spam zulip with their entire post list
                    if (now - date) < max_zulip_age:
                        post_page = ROOT_URL + 'post/' + Post.objects.get(
                            url=link).slug
                        self.enqueue_zulip(self.zulip_queue, blog.user,
                                           post_page, title, blog.stream)

                # if new info, update the posts
                if not created:
                    updated = False
                    if date != post.date_updated:
                        post.date_updated = date
                        updated = True
                    if title != post.title:
                        post.title = title
                        updated = True
                    if updated:
                        print "Updated %s in %s." % (title, blog.feed_url)
                        post.save()

        else:
            log.debug(str(errors))
Example #8
    def test_parsing_broken_feeds(self, feed):

        note(feed.feed)
        note(feed.items)

        with patch('urllib2.OpenerDirector.open', new=partial(self.patch_open_broken_feed, feed)):
            contents, errors = feedergrabber(feed.feed['link'])
            note(contents)
            note(errors)
            self.assertIsNone(contents)
            self.assertEqual(len(feed.items) + 1, len(errors))
            self.assertIn('Parsing methods not successful', errors[-1][0])
Example #9
 def test_parsing_broken_feeds(self, feed):
     note(feed.feed)
     note(feed.items)
     with patch(
             "urllib.request.OpenerDirector.open",
             new=partial(self.patch_open_broken_feed, feed),
     ):
         contents, errors = feedergrabber(feed.feed["link"])
         note(contents)
         note(errors)
         self.assertIsNone(contents)
         self.assertEqual(len(feed.items) + 1, len(errors))
         self.assertIn("Parsing methods not successful", errors[-1])
Example #10
    def test_parsing_broken_feeds(self, feed):

        note(feed.feed)
        note(feed.items)

        with patch('urllib2.OpenerDirector.open',
                   new=partial(self.patch_open_broken_feed, feed)):
            contents, errors = feedergrabber(feed.feed['link'])
            note(contents)
            note(errors)
            self.assertIsNone(contents)
            self.assertEqual(len(feed.items) + 1, len(errors))
            self.assertIn('Parsing methods not successful', errors[-1][0])
Example #11
    def crawlblog(self, blog):

        print "\n** CRAWLING", blog.feed_url

        # Feedergrabber returns ( [(link, title, date)], [errors])
        # We're ignoring the errors returned for right now
        crawled, errors = feedergrabber27.feedergrabber(blog.feed_url)

        if crawled:

            for link, title, date in crawled:

                date = timezone.make_aware(date, timezone.get_default_timezone())
                now = timezone.make_aware(datetime.datetime.now(), timezone.get_default_timezone())

                title = cleantitle(title)

                # create the post instance if it doesn't already exist
                post, created = Post.objects.get_or_create(
                    blog = blog,
                    url = link,
                    defaults = {
                        'title': title,
                        'date_updated': date,
                    }
                )

                if created:
                    print "Created", title
                    # Only post to humbug if the post was created in the last 2 days
                    #   so that new accounts don't spam humbug with their entire post list
                    if (now - date) < datetime.timedelta(days=2):
                        post_page = ROOT_URL + 'post/' + Post.objects.get(url=link).slug
                        send_message_humbug(user=blog.user, link=post_page, title=title)

                # if new info, update the posts
                if not created:
                    print ".",
                    updated = False
                    if date != post.date_updated:
                        post.date_updated = date
                        updated = True
                    if title != post.title:
                        post.title = title
                        updated = True
                    if updated:
                        print "Updated", title
                        post.save()

        else:
            log.debug(str(errors))
Example #12
    def crawlblog(self, blog):
        # Feedergrabber returns ( [(link, title, date)], [errors])
        # We're ignoring the errors returned for right now
        crawled, errors = feedergrabber27.feedergrabber(blog.feed_url)

        if crawled:
            post_count = 0
            for link, title, date in crawled:

                date = timezone.make_aware(date, timezone.get_default_timezone())
                now = timezone.make_aware(datetime.datetime.now(), timezone.get_default_timezone())

                title = cleantitle(title)

                # create the post instance if it doesn't already exist
                post, created = Post.objects.get_or_create(
                    blog = blog,
                    url = link,
                    defaults = {
                        'title': title,
                        'date_updated': date,
                    }
                )

                if created:
                    print "Created '%s' from blog '%s'" % (title, blog.feed_url)

                    # Throttle the amount of new posts that can be announced per user per crawl.
                    if post_count < MAX_POST_ANNOUNCE:
                        post_page = ROOT_URL + 'post/' + Post.objects.get(url=link).slug
                        self.enqueue_zulip(self.zulip_queue, blog.user, post_page, title, blog.stream)
                        post_count += 1

                # if new info, update the posts
                if not created:
                    updated = False
                    if date != post.date_updated:
                        post.date_updated = date
                        updated = True
                    if title != post.title:
                        post.title = title
                        updated = True
                    if updated:
                        print "Updated %s in %s." % (title, blog.feed_url)
                        post.save()

        else:
            log.debug(str(errors))
Example #13
    def handle_noargs(self, **options):

        for blog in Blog.objects.all():

            # Feedergrabber returns ( [(link, title, date)], [errors])
            # We're ignoring the errors returned for right now
            crawled, _ = feedergrabber27.feedergrabber(blog.feed_url)

            if crawled:

                for link, title, date in crawled:

                    date = timezone.make_aware(date, timezone.get_default_timezone())

                    # create the post instance if it doesn't already exist
                    post, created = Post.objects.get_or_create(
                        blog=blog, url=link, defaults={"title": title, "date_updated": date}
                    )

                    if created:
                        print "Created", title
                        send_message_hb(user=blog.user, link=link, title=title)

                    # if new info, update the posts
                    if not created:
                        print "Retrieved", title
                        updated = False
                        if date != post.date_updated:
                            post.date_updated = date
                            updated = True
                        if title != post.title:
                            post.title = title
                            updated = True
                        if updated:
                            print "Updated", title
                            post.save()

        if options["dry_run"]:
            transaction.rollback()
            print "\nDON'T FORGET TO RUN THIS FOR REAL\n"
        else:
            transaction.commit()
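This command handles the dry_run option with manual transaction control (transaction.rollback / transaction.commit). For comparison only, the same idea expressed with Django's transaction.atomic; the Command skeleton and the DryRunRollback exception are illustrative, not part of the project:

    from django.core.management.base import BaseCommand
    from django.db import transaction

    class DryRunRollback(Exception):
        """Raised only to abort the surrounding atomic block."""

    class Command(BaseCommand):
        def handle(self, **options):
            try:
                with transaction.atomic():
                    # ... crawl feeds and create/update Post rows here ...
                    if options.get("dry_run"):
                        raise DryRunRollback
            except DryRunRollback:
                print("\nDON'T FORGET TO RUN THIS FOR REAL\n")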
Example #14
    def crawlblog(self, blog):
        # Feedergrabber returns ( [(link, title, date)], [errors])
        # We're ignoring the errors returned for right now
        crawled, errors = feedergrabber27.feedergrabber(blog.feed_url)

        if not crawled:
            log.debug(str(errors))
            return

        log.debug('Crawled %s posts from %s', len(crawled), blog.feed_url)

        created_count = 0
        for link, title, date, content in crawled:
            date = timezone.make_aware(date, timezone.get_default_timezone())
            title = cleantitle(title)

            # create the post instance if it doesn't already exist
            post, created = get_or_create_post(blog, title, link, date, content)

            if created:
                created_count += 1
                log.debug("Created '%s' from blog '%s'", title, blog.feed_url)

                # Throttle the amount of new posts that can be announced per
                # user per crawl.
                if created_count <= settings.MAX_POST_ANNOUNCE:
                    post_page = ROOT_URL + 'post/' + post.slug
                    self.enqueue_zulip(blog.user, post_page, title, blog.stream)

            # if title changes, update the post
            elif title != post.title or content != post.content:
                post.title = title
                post.content = content
                print "Updated %s in %s." % (title, blog.feed_url)
                post.save()

            else:
                # Any other updates are ignored, as of now
                pass
Example #15
    def test_parsing_valid_feeds(self, feed):

        note(feed.feed)
        note(feed.items)

        with patch('urllib2.OpenerDirector.open', new=partial(self.patch_open, feed)):
            contents, errors = feedergrabber(feed.feed['link'])
            if contents is None:
                note(errors)
                self.assertEqual(0, len(feed.items))
                self.assertEqual(1, len(errors))
                self.assertIn('Parsing methods not successful', errors[0][0])

            else:
                for i, (link, title, date, content) in enumerate(contents):
                    item = feed.items[i]
                    self.assertEqual(link, item['link'])
                    item_date = item.get('pubdate', item.get('updateddate'))
                    note(item_date)
                    note(date)
                    self.assertIsNotNone(date)
                    self.assertGreaterEqual(
                        datetime.datetime.now().utctimetuple(), date.utctimetuple()
                    )