Exemplo n.º 1
0
 def test_format_5(self):
     """parse_date accepts an ISO 8601 timestamp ending in 'Z'."""
     expected = '2011-10-21 17:30:00'
     parsed = feedlib.parse_date('2011-10-21T17:30:00Z')
     self.assertEqual(str(parsed), expected)
Exemplo n.º 2
0
 def _test_format_3(self):
     """parse_date on an RFC 822-style date with a numeric '+0000' offset.

     NOTE(review): the leading underscore means a name-based unittest
     loader will not pick this method up -- confirm that is intended.
     """
     expected = '2011-10-22 08:22:53'
     parsed = feedlib.parse_date('Sat, 22 Oct 2011 08:22:53 +0000')
     self.assertEqual(str(parsed), expected)
Exemplo n.º 3
0
 def test_format_4(self):
     """parse_date accepts ISO 8601 with milliseconds and a '+02:00' offset.

     The expected string carries the time exactly as written in the
     input, without the fractional seconds and without the offset.
     """
     expected = '2011-10-13 22:09:45'
     parsed = feedlib.parse_date('2011-10-13T22:09:45.314+02:00')
     self.assertEqual(str(parsed), expected)
Exemplo n.º 4
0
 def _test_format_1(self):
     """Print what parse_date returns for a date with a 'UTC' zone name.

     This method makes no assertion; it only prints the parsed value.

     NOTE(review): the leading underscore means a name-based unittest
     loader will not pick this method up -- confirm that is intended.
     """
     # named `raw` rather than `str` so the builtin is not shadowed
     raw = "Sun Jan 16 15:55:53 UTC 2011"
     date = feedlib.parse_date(raw)
     # single-argument call form prints the same on Python 2 and 3
     print(date)
Exemplo n.º 5
0
 def test_format_2(self):
     """parse_date accepts a full month name and no timezone."""
     expected = '2011-01-16 07:13:33'
     parsed = feedlib.parse_date("Sun, 16 January 2011 07:13:33")
     self.assertEqual(str(parsed), expected)
Exemplo n.º 6
0
 def test_format_5(self):
     """parse_date accepts an ISO 8601 timestamp ending in 'Z'."""
     expected = '2011-10-21 17:30:00'
     parsed = feedlib.parse_date('2011-10-21T17:30:00Z')
     self.assertEqual(str(parsed), expected)
Exemplo n.º 7
0
class ParseFeed:
    """Parses a feed file from FEED_CACHE and stores the items it has not
    seen before."""

    def invoke(self, feedid, lastmod):
        """Parse the cached file for feed ``feedid`` and save its new items.

        feedid  -- feed id; converted with int().
        lastmod -- last-modified value in which '%' stands for a space;
                   the literal string "None" means there is no value.

        Returns None in all cases.  If reading the feed fails, the error
        text is stored on the feed and nothing else is updated.  The cache
        file is removed whether or not parsing succeeded.
        """
        feedid = int(feedid)
        lastmod = lastmod.replace('%', ' ')
        if lastmod == "None":
            lastmod = None

        feed = dbimpl.feeddb.get_feed_by_id(feedid)
        if not feed: # might have been gc-ed in the meantime
            return

        # links of the items we already have (so we can check for new ones)
        known_links = set(item.get_link() for item in feed.get_items())

        # read xml
        path = os.path.join(FEED_CACHE, "feed-%s.rss" % feedid)
        try:
            site = rsslib.read_feed(path, rsslib.DefaultFactory(),
                                    rsslib.urllib_loader)
        except Exception as e:
            # we failed, so record the failure and move on
            #traceback.print_exc()
            feed.set_error(str(e))
            feed.save()
            # the file may be exactly what was missing, so removal must
            # not blow up on a file that is not there
            self._remove_cache_file(path)
            return

        feed.set_error(None)
        self._remove_cache_file(path)

        # store all new items
        newposts = False
        for newitem in site.get_items():
            if newitem.get_link() in known_links:
                continue

            parsed_date = feedlib.parse_date(newitem.get_pubdate())

            # some sites give their articles a published date up to a
            # week in the future. these articles take a week before
            # they begin to age, staying on top of the feed for too
            # long.  we solve that by setting their time to _now_
            now = datetime.datetime.utcnow()
            if parsed_date > now:
                parsed_date = now

            newposts = True
            itemobj = dbimpl.Item(None, newitem.get_title(),
                                  newitem.get_link(), newitem.get_description(),
                                  parsed_date, newitem.get_author(), feed, None)
            # this sends MinHash message to create a minhash for the item
            itemobj.save() # FIXME: we could use batch updates here, too

        # update feed row
        feed.set_title(site.get_title())
        feed.set_link(site.get_link())
        feed.set_max_posts(feedlib.compute_max_posts(site))
        feed.set_last_modified(lastmod)
        feed.is_read_now()
        feed.save()

        # recalc all subs on this feed (if new posts, that is)
        if newposts:
            dbimpl.cur.execute("""select username from subscriptions where
                               feed = %s""", (feed.get_local_id(), ))
            for (user, ) in dbimpl.cur.fetchall():
                # 0 means we don't recalculate old posts. scores
                # haven't changed.  the only thing that's changed is
                # that we have new posts, so we only calculate those.
                dbimpl.mqueue.send("RecalculateSubscription %s %s 0" %
                                   (feed.get_local_id(), user))

    def _remove_cache_file(self, path):
        """Delete ``path``; a file that is already gone is not an error.

        Any other OSError (e.g. a permission problem) is re-raised.
        """
        # imported locally so this class does not rely on the file's
        # import section
        import errno
        try:
            os.unlink(path)
        except OSError as e:
            if e.errno != errno.ENOENT:
                raise
Exemplo n.º 8
0
 def test_format_4(self):
     """parse_date accepts ISO 8601 with milliseconds and a '+02:00' offset.

     The expected string carries the time exactly as written in the
     input, without the fractional seconds and without the offset.
     """
     expected = '2011-10-13 22:09:45'
     parsed = feedlib.parse_date('2011-10-13T22:09:45.314+02:00')
     self.assertEqual(str(parsed), expected)
Exemplo n.º 9
0
 def _test_format_3(self):
     """parse_date on an RFC 822-style date with a numeric '+0000' offset.

     NOTE(review): the leading underscore means a name-based unittest
     loader will not pick this method up -- confirm that is intended.
     """
     expected = '2011-10-22 08:22:53'
     parsed = feedlib.parse_date('Sat, 22 Oct 2011 08:22:53 +0000')
     self.assertEqual(str(parsed), expected)
Exemplo n.º 10
0
 def test_format_2(self):
     """parse_date accepts a full month name and no timezone."""
     expected = '2011-01-16 07:13:33'
     parsed = feedlib.parse_date("Sun, 16 January 2011 07:13:33")
     self.assertEqual(str(parsed), expected)
Exemplo n.º 11
0
 def _test_format_1(self):
     """Print what parse_date returns for a date with a 'UTC' zone name.

     This method makes no assertion; it only prints the parsed value.

     NOTE(review): the leading underscore means a name-based unittest
     loader will not pick this method up -- confirm that is intended.
     """
     # named `raw` rather than `str` so the builtin is not shadowed
     raw = "Sun Jan 16 15:55:53 UTC 2011"
     date = feedlib.parse_date(raw)
     # single-argument call form prints the same on Python 2 and 3
     print(date)
Exemplo n.º 12
0
class AppEngineController(feedlib.Controller):
    """Controller that keeps feeds, posts, subscriptions and ratings in
    the datastore via ``db``.

    The ``queue_*`` methods called below are not defined in this class;
    presumably they come from ``feedlib.Controller`` -- TODO confirm.
    """

    def in_appengine(self):
        """Always True for this controller."""
        return True

    def is_single_user(self):
        """Always False for this controller."""
        return False

    def add_feed(self, url):
        """Subscribe the current user to the feed at ``url``.

        A GAEFeed is created if none has this xmlurl; otherwise its
        subscriber count is incremented -- but only when the user is not
        subscribed yet, so calling this twice does not inflate the count.
        A feed nobody had before is queued for checking; for a feed that
        already existed only the new subscription is queued for
        recalculation.
        """
        user = users.get_current_user()

        # first check if the feed is in the database at all
        result = db.GqlQuery(
            """
         select * from GAEFeed where xmlurl = :1""", url)

        if not result.count():  # it's not there
            feed = GAEFeed()
            feed.xmlurl = url
            feed.subscribers = 1
        else:
            feed = result[0]
            if self._has_subscription(user, feed):
                # we're subscribed already, so never mind. this check
                # comes before the increment so the count stays correct.
                return
            feed.subscribers += 1
        feed.put()

        # now add a subscription for this user
        sub = GAESubscription()
        sub.user = user
        sub.feed = feed
        sub.up = 0
        sub.down = 0
        sub.put()

        if feed.subscribers > 1:
            # the feed was already there and loaded, so just calculate ratings
            # for this user
            self.queue_recalculate_subscription(sub)
        else:
            # we didn't have this feed from before, so let's start by
            # downloading it. this in turn will trigger a recalculation
            self.queue_check_feed(feed)

    def _has_subscription(self, user, feed):
        """Return True if a GAESubscription exists for ``user`` on ``feed``."""
        result = db.GqlQuery(
            """
         select * from GAESubscription where user = :1 and feed = :2
        """, user, feed)
        return result.count() > 0

    def recalculate_subscription(self, key):
        """Recompute post ratings for the subscription with datastore key
        ``key`` (given in the form accepted by ``db.Key``).

        Posts the user has seen are skipped and their stored ratings are
        deleted.  Ratings calculated after the user's ``lastupdate`` are
        left alone.  A rating is written when it is new or when its points
        moved by more than 0.5.
        """
        sub = db.get(db.Key(key))
        if not sub:
            # the subscription has been deleted in the meantime, so there
            # is nothing to do (same policy as check_feed)
            logging.info("Tried to recalculate non-existent subscription "
                         "with key %s", key)
            return

        feeddb.set_user(sub.user)  # no current user...
        result = db.GqlQuery("select * from GAEUser where user = :1", sub.user)
        if result.count() > 0:
            userobj = result[0]
            lastupdate = userobj.lastupdate or datetime.datetime.now()
        else:
            lastupdate = datetime.datetime.now()

        # get all seen posts for this subscription
        seen = {}  # post.key -> seen
        for s in db.GqlQuery(
                """
          select * from GAESeenPost where feed = :1 and user = :2""", sub.feed,
                sub.user):
            seen[s.post.key()] = s

        # get all existing ratings for this subscription
        ratings = {}  # post.key -> rating
        for rating in db.GqlQuery(
                """
          select * from GAEPostRating where feed = :1 and user = :2""",
                sub.feed, sub.user):
            # distinct name: `key` must keep meaning the subscription key
            rated_key = rating.post.key()
            if rated_key in seen:
                rating.delete()  # means we've already seen this post
            else:
                ratings[rated_key] = rating

        thefeed = FeedWrapper(sub)

        # evaluate each post to see what to do
        count = 0
        total = 0
        toupdate = []
        for post_key in db.GqlQuery(
                """
          select __key__ from GAEPost where feed = :1""", sub.feed):
            total += 1

            if post_key in seen:
                continue  # we've seen this post before, so move on

            rating = ratings.get(post_key)
            if rating and rating.calculated > lastupdate:
                continue  # this rating is up to date, so ignore it

            post = db.get(post_key)
            if not rating:
                rating = GAEPostRating()
                rating.post = post
                rating.user = sub.user
                rating.feed = post.feed
                rating.postdate = post.pubdate
                oldpoints = 0
                newrating = True
            else:
                oldpoints = rating.points
                newrating = False

            thepost = PostWrapper(post, thefeed)
            thepost.recalculate()
            newpoints = thepost.get_points()
            if newrating or abs(oldpoints - newpoints) > 0.5:
                rating.prob = thepost.get_overall_probability()
                rating.calculated = datetime.datetime.now()
                rating.points = thepost.get_points()
                toupdate.append(rating)
            count += 1

        if toupdate:
            db.put(toupdate)
        logging.info("Recalculated %s posts (of %s; %s stored) for key %s",
                     count, total, len(toupdate), key)

    def recalculate_all_posts(self):
        """Queue a recalculation for every subscription of the current user."""
        user = users.get_current_user()  # not a task, so it's OK
        for sub in db.GqlQuery(
                "select __key__ from GAESubscription where user = :1", user):
            self.queue_recalculate_subscription(sub)

    def age_posts(self):
        """Queue an aging pass for every subscription in the datastore."""
        for sub in db.GqlQuery("select __key__ from GAESubscription"):
            self.queue_age_subscription(sub)

    def age_subscription(self, key):
        """Recompute the points of every rating of the subscription with
        datastore key ``key`` from its stored probability and post date.

        Only ratings whose points moved by more than 0.5 are written back.
        """
        sub = db.get(db.Key(key))
        if not sub:
            # the subscription has been deleted in the meantime, so there
            # is nothing to do (same policy as check_feed)
            logging.info("Tried to age non-existent subscription "
                         "with key %s", key)
            return

        count = 0
        toupdate = []
        for rating in db.GqlQuery(
                """select * from GAEPostRating
                                  where user = :1 and feed = :2""", sub.user,
                sub.feed):
            oldpoints = rating.points
            newpoints = feedlib.calculate_points(rating.prob, rating.postdate)
            count += 1
            if abs(oldpoints - newpoints) > 0.5:
                rating.points = newpoints
                toupdate.append(rating)

        if toupdate:
            db.put(toupdate)  # batch put is faster
        logging.info("Aged %s posts (really %s) for key %s",
                     count, len(toupdate), key)

    def start_feed_reader(self):
        """Do nothing."""
        pass

    def find_feeds_to_check(self):
        """Queue a check for every feed that has never been checked or
        whose last check is older than ``feedlib.TIME_TO_WAIT`` seconds."""
        now = datetime.datetime.now()
        delta = datetime.timedelta(seconds=feedlib.TIME_TO_WAIT)
        checktime = now - delta

        # FIXME: we can easily get rid of this query, thus saving cputime
        result = db.GqlQuery("""
         select __key__ from GAEFeed where lastcheck = NULL""")

        for key in result:
            self.queue_check_feed(key)

        result = db.GqlQuery(
            """
         select __key__ from GAEFeed where lastcheck < :1""", checktime)

        for key in result:
            self.queue_check_feed(key)

    # --- specific to GAE (FIXME: but should it be?)

    def check_feed(self, key):
        """Download the feed with datastore key ``key`` and store its posts.

        If the download or parse fails, the error text and time are saved
        on the feed and nothing else changes.  Otherwise the feed's
        metadata is updated, each item is saved as a GAEPost (existing
        posts are matched by URL and overwritten), and, if any post was
        new, every subscription on the feed is queued for recalculation.
        """
        feed = db.get(db.Key(key))
        if not feed:
            # this means the feed has been deleted, and so the task is no
            # longer necessary. we should just return and let the task die.
            logging.info("Tried to check non-existent feed with key %s", key)
            return

        try:
            site = rsslib.read_feed(feed.xmlurl, data_loader=gae_loader)
        except Exception as e:
            # we failed, so record the failure and move on
            traceback.print_exc()
            feed.error = str(e)
            feed.lasterror = datetime.datetime.now()
            feed.put()
            return

        feed.title = site.get_title()
        feed.htmlurl = site.get_link()
        feed.lastcheck = datetime.datetime.now()
        feed.error = None
        feed.lasterror = None
        feed.maxposts = feedlib.compute_max_posts(site)

        feed.put()

        # url -> post for everything already stored on this feed
        post_map = {}
        current_posts = db.GqlQuery(
            """
          select * from GAEPost where feed = :1""", feed)
        for post in current_posts:
            post_map[str(post.url)] = post

        newposts = False
        for item in site.get_items():
            post = post_map.get(item.get_link())
            if not post:
                post = GAEPost()
                post.url = item.get_link()
                post.feed = feed
                newposts = True

            post.title = item.get_title()
            post.author = item.get_author()
            post.content = db.Text(item.get_description())
            post.pubdate = feedlib.parse_date(item.get_pubdate())
            post.put()

        if newposts:
            # recalculate all subscriptions on this feed
            for sub in db.GqlQuery(
                    """
              select __key__ from GAESubscription where feed = :1""", feed):
                self.queue_recalculate_subscription(sub)