def test_format_5(self):
    """ISO 8601 timestamp with a trailing 'Z' (UTC designator)."""
    parsed = feedlib.parse_date('2011-10-21T17:30:00Z')
    self.assertEqual(str(parsed), '2011-10-21 17:30:00')
def _test_format_3(self):
    """Disabled (leading underscore): RFC 822 date with a numeric UTC offset."""
    parsed = feedlib.parse_date('Sat, 22 Oct 2011 08:22:53 +0000')
    self.assertEqual(str(parsed), '2011-10-22 08:22:53')
def test_format_4(self):
    """ISO 8601 timestamp with fractional seconds and a +02:00 offset."""
    parsed = feedlib.parse_date('2011-10-13T22:09:45.314+02:00')
    self.assertEqual(str(parsed), '2011-10-13 22:09:45')
def _test_format_1(self):
    """Disabled (leading underscore): unix `date`-style string with a named
    time zone. Only prints the result instead of asserting, which is
    presumably why it was disabled."""
    # renamed from `str`, which shadowed the builtin
    datestr = "Sun Jan 16 15:55:53 UTC 2011"
    date = feedlib.parse_date(datestr)
    # parenthesized form works under both Python 2 and 3
    print(date)
def test_format_2(self):
    """RFC 822-ish date with a fully spelled-out month name."""
    parsed = feedlib.parse_date("Sun, 16 January 2011 07:13:33")
    self.assertEqual(str(parsed), '2011-01-16 07:13:33')
class ParseFeed:
    """Queue task handler: parse a cached RSS file and sync its posts into
    the database."""

    def invoke(self, feedid, lastmod):
        """Parse the cached feed file for `feedid` and store any new items.

        lastmod: the feed's Last-Modified value with spaces encoded as '%'
        (queue-message escaping), or the literal string "None".
        """
        feedid = int(feedid)
        lastmod = lastmod.replace('%', ' ')  # undo queue-message escaping
        if lastmod == "None":
            lastmod = None

        feed = dbimpl.feeddb.get_feed_by_id(feedid)
        if not feed:
            # might have been gc-ed in the meantime
            return

        # url -> item, so we can check which downloaded items are new
        items = {}
        for item in feed.get_items():
            items[item.get_link()] = item

        # read xml from the cache file
        # (renamed from `file`, which shadowed the builtin)
        path = os.path.join(FEED_CACHE, "feed-%s.rss" % feedid)
        try:
            site = rsslib.read_feed(path, rsslib.DefaultFactory(),
                                    rsslib.urllib_loader)
            feed.set_error(None)
            os.unlink(path)
        except Exception as e:
            # we failed, so record the failure and move on
            #traceback.print_exc()
            feed.set_error(str(e))
            feed.save()
            # the cache file may be the very thing that's missing, so guard
            # the cleanup; an unconditional unlink here could raise and mask
            # the recorded error
            if os.path.exists(path):
                os.unlink(path)
            return

        # store all new items
        newposts = False
        for newitem in site.get_items():
            if newitem.get_link() in items:
                continue  # we already have this post

            parsed_date = feedlib.parse_date(newitem.get_pubdate())
            # some sites give their articles a published date up to a
            # week in the future. these articles take a week before
            # they begin to age, staying on top of the feed for too
            # long. we solve that by setting their time to _now_
            if parsed_date > datetime.datetime.utcnow():
                parsed_date = datetime.datetime.utcnow()

            newposts = True
            itemobj = dbimpl.Item(None, newitem.get_title(),
                                  newitem.get_link(),
                                  newitem.get_description(), parsed_date,
                                  newitem.get_author(), feed, None)
            # this sends MinHash message to create a minhash for the item
            itemobj.save()  # FIXME: we could use batch updates here, too

        # update feed row
        feed.set_title(site.get_title())
        feed.set_link(site.get_link())
        feed.set_max_posts(feedlib.compute_max_posts(site))
        feed.set_last_modified(lastmod)
        feed.is_read_now()
        feed.save()

        # recalc all subs on this feed (if new posts, that is)
        if newposts:
            dbimpl.cur.execute("""select username from subscriptions where
                                  feed = %s""", (feed.get_local_id(), ))
            for (user, ) in dbimpl.cur.fetchall():
                # 0 means we don't recalculate old posts. scores
                # haven't changed. the only thing that's changed is
                # that we have new posts, so we only calculate those.
                dbimpl.mqueue.send("RecalculateSubscription %s %s 0" %
                                   (feed.get_local_id(), user))
class AppEngineController(feedlib.Controller):
    """feedlib Controller backed by the Google App Engine datastore.

    Subscriptions, posts, and per-user ratings live in GAE entities
    (GAEFeed, GAESubscription, GAEPost, GAEPostRating, GAESeenPost);
    heavier work is deferred via the queue_* helpers inherited from
    feedlib.Controller.
    """

    def in_appengine(self):
        # running inside App Engine
        return True

    def is_single_user(self):
        # multi-user deployment
        return False

    def add_feed(self, url):
        """Subscribe the current user to the feed at `url`, creating the
        feed entity on first subscription."""
        # first check if the feed is in the database at all
        result = db.GqlQuery(
            """ select * from GAEFeed where xmlurl = :1""", url)
        if not result.count():
            # it's not there
            feed = GAEFeed()
            feed.xmlurl = url
            feed.subscribers = 1
        else:
            feed = result[0]
            feed.subscribers += 1
        feed.put()

        # now add a subscription for this user
        user = users.get_current_user()
        result = db.GqlQuery(
            """ select * from GAESubscription where user = :1 and feed = :2 """,
            user, feed)
        if result.count():
            # we're subscribed already, so never mind
            return

        sub = GAESubscription()
        sub.user = user
        sub.feed = feed
        sub.up = 0        # vote counters start at zero
        sub.down = 0
        sub.put()

        if feed.subscribers > 1:
            # the feed was already there and loaded, so just calculate ratings
            # for this user
            self.queue_recalculate_subscription(sub)
        else:
            # we didn't have this feed from before, so let's start by
            # downloading it. this in turn will trigger a recalculation
            self.queue_check_feed(feed)

    def recalculate_subscription(self, key):
        """Task body: recompute point ratings for every unseen post of the
        subscription identified by datastore key string `key`."""
        sub = db.get(db.Key(key))
        feeddb.set_user(sub.user) # no current user...

        # find when this user's ratings were last brought up to date;
        # fall back to "now" if there is no stored timestamp
        result = db.GqlQuery("select * from GAEUser where user = :1",
                             sub.user)
        if result.count() > 0:
            userobj = result[0]
            lastupdate = userobj.lastupdate or datetime.datetime.now()
        else:
            lastupdate = datetime.datetime.now()

        # get all seen posts for this subscription
        seen = {} # post.key -> seen
        for s in db.GqlQuery(
            """ select * from GAESeenPost where feed = :1 and user = :2""",
            sub.feed, sub.user):
            seen[s.post.key()] = s

        # get all existing ratings for this subscription
        ratings = {} # post.key -> rating
        for rating in db.GqlQuery(
            """ select * from GAEPostRating where feed = :1 and user = :2""",
            sub.feed, sub.user):
            key = rating.post.key()
            if seen.has_key(key):
                rating.delete() # means we've already seen this post
            else:
                ratings[key] = rating

        thefeed = FeedWrapper(sub)

        # evaluate each post to see what to do
        count = 0       # posts actually recalculated
        total = 0       # posts examined
        toupdate = []   # ratings whose points changed enough to store
        for key in db.GqlQuery(
            """ select __key__ from GAEPost where feed = :1""", sub.feed):
            total += 1
            if seen.has_key(key):
                continue # we've seen this post before, so move on

            rating = ratings.get(key)
            if rating and rating.calculated > lastupdate:
                continue # this rating is up to date, so ignore it

            post = db.get(key)
            if not rating:
                # first time we rate this post for this user
                rating = GAEPostRating()
                rating.post = post
                rating.user = sub.user
                rating.feed = post.feed
                rating.postdate = post.pubdate
                oldpoints = 0
                newrating = True
            else:
                oldpoints = rating.points
                newrating = False

            thepost = PostWrapper(post, thefeed)
            thepost.recalculate()
            newpoints = thepost.get_points()

            # only write back when new, or when the change is big enough
            # to matter (> 0.5 points), to save datastore writes
            if newrating or abs(oldpoints - newpoints) > 0.5:
                rating.prob = thepost.get_overall_probability()
                rating.calculated = datetime.datetime.now()
                rating.points = thepost.get_points()
                toupdate.append(rating)
            # NOTE(review): indentation reconstructed — count appears to
            # track recalculated posts (distinct from len(toupdate), the
            # stored ones, per the log line below); confirm against history
            count += 1

        if toupdate:
            db.put(toupdate)
        logging.info("Recalculated %s posts (of %s; %s stored) for key %s" %
                     (count, total, len(toupdate), key))

    def recalculate_all_posts(self):
        """Queue a rating recalculation for every subscription of the
        current user."""
        user = users.get_current_user() # not a task, so it's OK
        for sub in db.GqlQuery(
            "select __key__ from GAESubscription where user = :1", user):
            self.queue_recalculate_subscription(sub)

    def age_posts(self):
        """Queue an aging pass over every subscription in the system."""
        for sub in db.GqlQuery("select __key__ from GAESubscription"):
            self.queue_age_subscription(sub)

    def age_subscription(self, key):
        """Task body: decay the points of each rating in the subscription
        identified by datastore key string `key` as its post gets older."""
        sub = db.get(db.Key(key))
        count = 0
        toupdate = []
        for rating in db.GqlQuery(
            """select * from GAEPostRating where user = :1 and feed = :2""",
            sub.user, sub.feed):
            oldpoints = rating.points
            newpoints = feedlib.calculate_points(rating.prob, rating.postdate)
            count += 1
            # skip writes for changes too small to matter
            if abs(oldpoints - newpoints) > 0.5:
                rating.points = newpoints
                toupdate.append(rating)
        if toupdate:
            db.put(toupdate) # batch put is faster
        logging.info("Aged %s posts (really %s) for key %s" %
                     (count, len(toupdate), key))

    def start_feed_reader(self):
        # no background reader process on GAE; feeds are checked via tasks
        pass

    def find_feeds_to_check(self):
        """Queue a check task for every feed never checked, or not checked
        within the last TIME_TO_WAIT seconds."""
        now = datetime.datetime.now()
        delta = datetime.timedelta(seconds=feedlib.TIME_TO_WAIT)
        checktime = now - delta

        # FIXME: we can easily get rid of this query, thus saving cputime
        result = db.GqlQuery("""
          select __key__ from GAEFeed where lastcheck = NULL""")
        for key in result:
            self.queue_check_feed(key)

        result = db.GqlQuery(
            """ select __key__ from GAEFeed where lastcheck < :1""",
            checktime)
        for key in result:
            self.queue_check_feed(key)

    # --- specific to GAE (FIXME: but should it be?)

    def check_feed(self, key):
        """Task body: download and parse the feed identified by datastore
        key string `key`, refresh its posts, and queue subscription
        recalculations when new posts appear."""
        feed = db.get(db.Key(key))
        if not feed:
            # this means the feed has been deleted, and so the task is no
            # longer necessary. we should just return and let the task die.
            logging.info("Tried to check non-existent feed with key " + key)
            return

        try:
            site = rsslib.read_feed(feed.xmlurl, data_loader=gae_loader)
        except Exception, e:
            # we failed, so record the failure and move on
            traceback.print_exc()
            feed.error = str(e)
            feed.lasterror = datetime.datetime.now()
            feed.put()
            return

        # the download succeeded: refresh feed metadata and clear any
        # previously recorded error
        feed.title = site.get_title()
        feed.htmlurl = site.get_link()
        feed.lastcheck = datetime.datetime.now()
        feed.error = None
        feed.lasterror = None
        feed.maxposts = feedlib.compute_max_posts(site)
        feed.put()

        # url -> post entity, so we can tell new posts from known ones
        post_map = {}
        current_posts = db.GqlQuery(
            """ select * from GAEPost where feed = :1""", feed)
        for post in current_posts:
            post_map[str(post.url)] = post

        newposts = False
        for item in site.get_items():
            post = post_map.get(item.get_link())
            if not post:
                post = GAEPost()
                post.url = item.get_link()
                post.feed = feed
                newposts = True

            # refresh content for both new and already-known posts
            post.title = item.get_title()
            post.author = item.get_author()
            post.content = db.Text(item.get_description())
            post.pubdate = feedlib.parse_date(item.get_pubdate())
            post.put()

        if newposts:
            # recalculate all subscriptions on this feed
            for sub in db.GqlQuery(
                """ select __key__ from GAESubscription where feed = :1""",
                feed):
                self.queue_recalculate_subscription(sub)