def test_load_feeds__gawker(self):
    self.client.login(username='******', password='******')

    management.call_command('loaddata', 'gawker1.json', verbosity=0)

    feed = Feed.objects.get(feed_link__contains='gawker')
    stories = MStory.objects(story_feed_id=feed.pk)
    self.assertEquals(stories.count(), 0)

    feed.update(force=True)

    stories = MStory.objects(story_feed_id=feed.pk)
    self.assertEquals(stories.count(), 38)

    management.call_command('loaddata', 'gawker2.json', verbosity=0)
    feed.update(force=True)

    # Test: 1 changed char in content
    stories = MStory.objects(story_feed_id=feed.pk)
    self.assertEquals(stories.count(), 38)

    url = reverse('load-single-feed', kwargs=dict(feed_id=1))
    response = self.client.get(url)
    feed = json.decode(response.content)
    self.assertEquals(len(feed['stories']), 6)
def original_text(request):
    story_id = request.REQUEST.get('story_id')
    feed_id = request.REQUEST.get('feed_id')
    story_hash = request.REQUEST.get('story_hash', None)
    force = request.REQUEST.get('force', False)
    debug = request.REQUEST.get('debug', False)

    if story_hash:
        story, _ = MStory.find_story(story_hash=story_hash)
    else:
        story, _ = MStory.find_story(story_id=story_id, story_feed_id=feed_id)

    if not story:
        logging.user(request, "~FYFetching ~FGoriginal~FY story text: ~FRstory not found")
        return {'code': -1, 'message': 'Story not found.', 'original_text': None, 'failed': True}

    original_text = story.fetch_original_text(force=force, request=request, debug=debug)

    return {
        'feed_id': story.story_feed_id,
        'story_hash': story.story_hash,
        'story_id': story.story_guid,
        'image_urls': story.image_urls,
        'secure_image_urls': Feed.secure_image_urls(story.image_urls),
        'original_text': original_text,
        'failed': not original_text or len(original_text) < 100,
    }
def test_load_feeds__gothamist(self):
    self.client.login(username='******', password='******')

    management.call_command('loaddata', 'gothamist_aug_2009_1.json', verbosity=0)
    feed = Feed.objects.get(feed_link__contains='gothamist')
    stories = MStory.objects(story_feed_id=feed.pk)
    self.assertEquals(stories.count(), 0)

    management.call_command('refresh_feed', force=1, feed=4, single_threaded=True, daemonize=False)
    stories = MStory.objects(story_feed_id=feed.pk)
    self.assertEquals(stories.count(), 42)

    url = reverse('load-single-feed', kwargs=dict(feed_id=4))
    response = self.client.get(url)
    content = json.decode(response.content)
    self.assertEquals(len(content['stories']), 6)

    management.call_command('loaddata', 'gothamist_aug_2009_2.json', verbosity=0)
    management.call_command('refresh_feed', force=1, feed=4, single_threaded=True, daemonize=False)
    stories = MStory.objects(story_feed_id=feed.pk)
    self.assertEquals(stories.count(), 42)

    url = reverse('load-single-feed', kwargs=dict(feed_id=4))
    response = self.client.get(url)
    # print [c['story_title'] for c in json.decode(response.content)]
    content = json.decode(response.content)

    # Test: 1 changed char in title
    self.assertEquals(len(content['stories']), 6)
def reindex_stories():
    count = MStory.objects().count()
    print "Mongo DB stories: %s" % count

    p = 0.0
    i = 0
    feeds = Feed.objects.all().order_by('-average_stories_per_month')
    feed_count = feeds.count()
    f = 0
    for feed in feeds:
        f += 1
        print "%s/%s: %s" % (f, feed_count, feed,)
        sys.stdout.flush()

        for story in MStory.objects(story_feed_id=feed.pk):
            i += 1.0
            if round(i / count * 100) != p:
                p = round(i / count * 100)
                print '%s%%' % p
            if isinstance(story.id, unicode) and story.id:
                story.story_guid = story.id
                story.id = pymongo.objectid.ObjectId()
                try:
                    story.save()
                except mongoengine.queryset.OperationError:
                    print 'Dupe!'
                    continue
def test_load_feeds__slashdot(self):
    self.client.login(username='******', password='******')

    management.call_command('loaddata', 'slashdot1.json', verbosity=0)
    feed = Feed.objects.get(feed_link__contains='slashdot')
    stories = MStory.objects(story_feed_id=feed.pk)
    self.assertEquals(stories.count(), 0)

    management.call_command('refresh_feed', force=1, feed=5, single_threaded=True, daemonize=False)
    stories = MStory.objects(story_feed_id=feed.pk)
    self.assertEquals(stories.count(), 38)

    management.call_command('loaddata', 'slashdot2.json', verbosity=0)
    management.call_command('refresh_feed', force=1, feed=5, single_threaded=True, daemonize=False)
    stories = MStory.objects(story_feed_id=feed.pk)
    self.assertEquals(stories.count(), 38)

    response = self.client.post('/reader/feed', {"feed_id": 5})
    # pprint([c['story_title'] for c in json.decode(response.content)])
    feed = json.decode(response.content)

    # Test: 1 changed char in title
    self.assertEquals(len(feed['stories']), 30)
def mark_story_as_unread(request):
    story_id = request.POST['story_id']
    feed_id = int(request.POST['feed_id'])

    usersub = UserSubscription.objects.select_related('feed').get(user=request.user, feed=feed_id)

    if not usersub.needs_unread_recalc:
        usersub.needs_unread_recalc = True
        usersub.save()

    data = dict(code=0, payload=dict(story_id=story_id))
    logging.user(request, "~FY~SBUnread~SN story in feed: %s" % (usersub.feed))

    story = MStory.objects(story_feed_id=feed_id, story_guid=story_id)[0]

    if story.story_date < usersub.mark_read_date:
        # Story is outside the mark as read range, so invert all stories before.
        newer_stories = MStory.objects(story_feed_id=story.story_feed_id,
                                       story_date__gte=story.story_date,
                                       story_date__lte=usersub.mark_read_date
                                       ).only('story_guid')
        newer_stories = [s.story_guid for s in newer_stories]
        usersub.mark_read_date = story.story_date - datetime.timedelta(minutes=1)
        usersub.needs_unread_recalc = True
        usersub.save()

        # Mark stories as read only after the mark_read_date has been moved, otherwise
        # these would be ignored.
        data = usersub.mark_story_ids_as_read(newer_stories, request=request)

    m = MUserStory.objects(story_id=story_id, user_id=request.user.pk, feed_id=feed_id)
    m.delete()

    return data
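# The inversion above, in miniature: to unread a story older than mark_read_date,
# pull mark_read_date back to just before that story, then explicitly re-mark the
# in-between stories as read, since the cutoff no longer covers them. Dates here
# are illustrative only.
import datetime

story_date = datetime.datetime(2012, 5, 1)
mark_read_date = datetime.datetime(2012, 6, 1)
assert story_date < mark_read_date  # the branch above only runs in this case
new_mark_read_date = story_date - datetime.timedelta(minutes=1)
assert new_mark_read_date < story_date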
def bootstrap_stories():
    print "Mongo DB stories: %s" % MStory.objects().count()
    # db.stories.drop()
    print "Dropped! Mongo DB stories: %s" % MStory.objects().count()

    print "Stories: %s" % Story.objects.all().count()
    pprint(db.stories.index_information())

    feeds = Feed.objects.all().order_by('-average_stories_per_month')
    feed_count = feeds.count()
    i = 0
    for feed in feeds:
        i += 1
        print "%s/%s: %s (%s stories)" % (i, feed_count, feed, Story.objects.filter(story_feed=feed).count())
        sys.stdout.flush()

        stories = Story.objects.filter(story_feed=feed).values()
        for story in stories:
            # story['story_tags'] = [tag.name for tag in Tag.objects.filter(story=story['id'])]
            try:
                story['story_tags'] = json.decode(story['story_tags'])
            except:
                continue
            del story['id']
            del story['story_author_id']
            try:
                MStory(**story).save()
            except:
                continue

    print "\nMongo DB stories: %s" % MStory.objects().count()
def reindex_stories():
    db = pymongo.Connection().newsblur
    count = MStory.objects().count()
    print "Mongo DB stories: %s" % count

    p = 0.0
    i = 0
    feeds = Feed.objects.all().order_by('-average_stories_per_month')
    feed_count = feeds.count()
    f = 0
    for feed in feeds:
        f += 1
        print "%s/%s: %s" % (f, feed_count, feed,)
        sys.stdout.flush()

        for story in MStory.objects(story_feed_id=feed.pk):
            i += 1.0
            if round(i / count * 100) != p:
                p = round(i / count * 100)
                print '%s%%' % p
            if isinstance(story.id, unicode):
                story.story_guid = story.id
                story.id = pymongo.objectid.ObjectId()
                try:
                    story.save()
                except OperationError, e:
                    print " ***> OperationError: %s" % e
                except Exception, e:
                    print ' ***> Unknown Error: %s' % e
                db.stories.remove({"_id": story.story_guid})
def receive_newsletter(self, params):
    user = self.user_from_email(params['recipient'])
    if not user:
        return

    sender_name, sender_username, sender_domain = self.split_sender(params['from'])
    feed_address = self.feed_address(user, "%s@%s" % (sender_username, sender_domain))

    usf = UserSubscriptionFolders.objects.get(user=user)
    usf.add_folder('', 'Newsletters')

    try:
        feed = Feed.objects.get(feed_address=feed_address)
    except Feed.DoesNotExist:
        feed = Feed.objects.create(feed_address=feed_address,
                                   feed_link='http://' + sender_domain,
                                   feed_title=sender_name,
                                   fetched_once=True,
                                   known_good=True)
        feed.update()
        logging.user(user, "~FCCreating newsletter feed: ~SB%s" % (feed))
        r = redis.Redis(connection_pool=settings.REDIS_PUBSUB_POOL)
        r.publish(user.username, 'reload:%s' % feed.pk)

    try:
        usersub = UserSubscription.objects.get(user=user, feed=feed)
    except UserSubscription.DoesNotExist:
        _, _, usersub = UserSubscription.add_subscription(
            user=user,
            feed_address=feed_address,
            folder='Newsletters'
        )

    story_hash = MStory.ensure_story_hash(params['signature'], feed.pk)
    story_params = {
        "story_feed_id": feed.pk,
        "story_date": datetime.datetime.fromtimestamp(int(params['timestamp'])),
        "story_title": params['subject'],
        "story_content": self.get_content(params),
        "story_author_name": escape(params['from']),
        "story_permalink": reverse('newsletter-story', kwargs={'story_hash': story_hash}),
        "story_guid": params['signature'],
    }
    try:
        story = MStory.objects.get(story_hash=story_hash)
    except MStory.DoesNotExist:
        story = MStory(**story_params)
        story.save()

    usersub.needs_unread_recalc = True
    usersub.save()

    self.publish_to_subscribers(feed)

    MFetchHistory.add(feed_id=feed.pk, fetch_type='push')
    logging.user(user, "~FCNewsletter feed story: ~SB%s~SN / ~SB%s" % (story.story_title, feed))

    return story
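# Hedged sketch of what split_sender() above might do with a From header such
# as 'NewsBlur <hello@newsblur.com>'; the real helper may differ, this is just
# the standard-library way to get the same three pieces.
from email.utils import parseaddr

def split_sender(from_header):
    name, address = parseaddr(from_header)
    username, domain = address.split('@')
    return name or username, username, domain

assert split_sender('NewsBlur <hello@newsblur.com>') == ('NewsBlur', 'hello', 'newsblur.com')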
def count_unreads_for_subscribers(self, feed):
    user_subs = UserSubscription.objects.filter(
        feed=feed, active=True, user__profile__last_seen_on__gte=feed.unread_cutoff
    ).order_by("-last_read_date")

    if not user_subs.count():
        return

    for sub in user_subs:
        if not sub.needs_unread_recalc:
            sub.needs_unread_recalc = True
            sub.save()

    if self.options["compute_scores"]:
        r = redis.Redis(connection_pool=settings.REDIS_STORY_HASH_POOL)
        stories = MStory.objects(story_feed_id=feed.pk, story_date__gte=feed.unread_cutoff)
        stories = Feed.format_stories(stories, feed.pk)
        story_hashes = r.zrangebyscore(
            "zF:%s" % feed.pk,
            int(feed.unread_cutoff.strftime("%s")),
            int(time.time() + 60 * 60 * 24)
        )
        missing_story_hashes = set(story_hashes) - set([s["story_hash"] for s in stories])
        if missing_story_hashes:
            missing_stories = MStory.objects(
                story_feed_id=feed.pk, story_hash__in=missing_story_hashes
            ).read_preference(pymongo.ReadPreference.PRIMARY)
            missing_stories = Feed.format_stories(missing_stories, feed.pk)
            stories = missing_stories + stories
            logging.debug(
                u" ---> [%-30s] ~FYFound ~SB~FC%s(of %s)/%s~FY~SN un-secondaried stories while computing scores"
                % (feed.title[:30], len(missing_stories), len(missing_story_hashes), len(stories))
            )
        cache.set("S:%s" % feed.pk, stories, 60)
        logging.debug(
            u" ---> [%-30s] ~FYComputing scores: ~SB%s stories~SN with ~SB%s subscribers ~SN(%s/%s/%s)"
            % (
                feed.title[:30],
                len(stories),
                user_subs.count(),
                feed.num_subscribers,
                feed.active_subscribers,
                feed.premium_subscribers,
            )
        )
        self.calculate_feed_scores_with_stories(user_subs, stories)
    elif self.options.get("mongodb_replication_lag"):
        logging.debug(
            u" ---> [%-30s] ~BR~FYSkipping computing scores: ~SB%s seconds~SN of mongodb lag"
            % (feed.title[:30], self.options.get("mongodb_replication_lag"))
        )
def mark_feed_read(self):
    now = datetime.datetime.now()
    if MStory.objects(story_feed_id=self.feed.pk).first():
        latest_story_date = MStory.objects(story_feed_id=self.feed.pk).order_by('-story_date')[0].story_date \
                            + datetime.timedelta(minutes=1)
    else:
        latest_story_date = now

    self.last_read_date = max(now, latest_story_date)
    self.mark_read_date = max(now, latest_story_date)
    self.unread_count_negative = 0
    self.unread_count_positive = 0
    self.unread_count_neutral = 0
    self.unread_count_updated = max(now, latest_story_date)
    self.needs_unread_recalc = False
    self.save()
def mark_read(cls, user_id, story_feed_id, story_hash, r=None, r2=None):
    if not r:
        r = redis.Redis(connection_pool=settings.REDIS_STORY_HASH_POOL)
    if not r2:
        r2 = redis.Redis(connection_pool=settings.REDIS_STORY_HASH_POOL2)

    story_hash = MStory.ensure_story_hash(story_hash, story_feed_id=story_feed_id)
    if not story_hash:
        return

    now = int(time.time())

    all_read_stories_key = 'RS:%s' % (user_id)
    r.sadd(all_read_stories_key, story_hash)
    r2.sadd(all_read_stories_key, story_hash)
    r2.zadd('z' + all_read_stories_key, story_hash, now)
    r.expire(all_read_stories_key, settings.DAYS_OF_UNREAD*24*60*60)
    r2.expire(all_read_stories_key, settings.DAYS_OF_UNREAD*24*60*60)
    r2.expire('z' + all_read_stories_key, settings.DAYS_OF_UNREAD*24*60*60)

    read_story_key = 'RS:%s:%s' % (user_id, story_feed_id)
    r.sadd(read_story_key, story_hash)
    r2.sadd(read_story_key, story_hash)
    r2.zadd('z' + read_story_key, story_hash, now)
    r.expire(read_story_key, settings.DAYS_OF_UNREAD*24*60*60)
    r2.expire(read_story_key, settings.DAYS_OF_UNREAD*24*60*60)
    r2.expire('z' + read_story_key, settings.DAYS_OF_UNREAD*24*60*60)
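# A minimal sketch (helper name is hypothetical, not from the codebase) of the
# redis key layout mark_read() writes to: a per-user read set, a per-user
# per-feed read set, and "z"-prefixed sorted-set twins scored by read time.
def read_story_keys(user_id, story_feed_id):
    all_read_key = 'RS:%s' % user_id                       # every hash the user has read
    feed_read_key = 'RS:%s:%s' % (user_id, story_feed_id)  # reads within one feed
    return [all_read_key, 'z' + all_read_key, feed_read_key, 'z' + feed_read_key]

assert read_story_keys(42, 1701) == ['RS:42', 'zRS:42', 'RS:42:1701', 'zRS:42:1701']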
def story_hash(cls, story_id, story_feed_id):
    if not cls.RE_STORY_HASH.match(story_id):
        story, _ = MStory.find_story(story_feed_id=story_feed_id, story_id=story_id)
        if not story:
            return
        story_id = story.story_hash

    return story_id
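# Hedged sketch of the "feed_id:guid_hash" story hash format that RE_STORY_HASH
# and find_story() above imply; the exact digest (a short sha1 prefix here) and
# the regex are assumptions for illustration only.
import hashlib
import re

RE_STORY_HASH = re.compile(r"^(\d{1,10}):(\w{6})$")

def make_story_hash(story_feed_id, story_guid):
    guid_hash = hashlib.sha1(story_guid).hexdigest()[:6]
    return "%s:%s" % (story_feed_id, guid_hash)

assert RE_STORY_HASH.match(make_story_hash(1701, 'http://example.com/post/1'))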
def count_unreads_for_subscribers(self, feed):
    UNREAD_CUTOFF = datetime.datetime.utcnow() - datetime.timedelta(days=settings.DAYS_OF_UNREAD)
    user_subs = UserSubscription.objects.filter(
        feed=feed, active=True, user__profile__last_seen_on__gte=UNREAD_CUTOFF
    ).order_by("-last_read_date")
    logging.debug(
        u" ---> [%-30s] Computing scores: %s (%s/%s/%s) subscribers"
        % (
            unicode(feed)[:30],
            user_subs.count(),
            feed.num_subscribers,
            feed.active_subscribers,
            feed.premium_subscribers,
        )
    )

    stories_db = MStory.objects(story_feed_id=feed.pk, story_date__gte=UNREAD_CUTOFF)

    for sub in user_subs:
        cache.delete("usersub:%s" % sub.user_id)
        sub.needs_unread_recalc = True
        sub.save()

    if self.options["compute_scores"]:
        for sub in user_subs:
            silent = False if self.options["verbose"] >= 2 else True
            sub.calculate_feed_scores(silent=silent, stories_db=stories_db)
def count_unreads_for_subscribers(self, feed):
    UNREAD_CUTOFF = datetime.datetime.utcnow() - datetime.timedelta(days=settings.DAYS_OF_UNREAD)
    user_subs = UserSubscription.objects.filter(
        feed=feed, active=True, user__profile__last_seen_on__gte=UNREAD_CUTOFF
    ).order_by("-last_read_date")

    for sub in user_subs:
        if not sub.needs_unread_recalc:
            sub.needs_unread_recalc = True
            sub.save()

    if self.options["compute_scores"]:
        stories_db = MStory.objects(story_feed_id=feed.pk, story_date__gte=UNREAD_CUTOFF)
        logging.debug(
            u" ---> [%-30s] ~FYComputing scores: ~SB%s stories~SN with ~SB%s subscribers ~SN(%s/%s/%s)"
            % (
                feed.title[:30],
                stories_db.count(),
                user_subs.count(),
                feed.num_subscribers,
                feed.active_subscribers,
                feed.premium_subscribers,
            )
        )
        self.calculate_feed_scores_with_stories(user_subs, stories_db)
    elif self.options.get("mongodb_replication_lag"):
        logging.debug(
            u" ---> [%-30s] ~BR~FYSkipping computing scores: ~SB%s seconds~SN of mongodb lag"
            % (feed.title[:30], self.options.get("mongodb_replication_lag"))
        )
def handle(self, *args, **options):
    if options['daemonize']:
        daemonize()

    settings.LOG_TO_STREAM = True
    r = redis.Redis(connection_pool=settings.REDIS_FEED_POOL)

    if options['initialize']:
        feeds = Feed.objects.filter(num_subscribers__gte=1).order_by('?')
        print 'Query feeds done with num of feeds', len(feeds)
        r.ltrim('freeze_feeds', 1, 0)
        pipeline = r.pipeline()
        for feed in feeds:
            pipeline.rpush('freeze_feeds', feed.pk)
        pipeline.execute()
        print 'Initialize freeze_feeds done'

    feed_id = r.lpop('freeze_feeds')
    while feed_id:
        try:
            frozen_num = MStory.freeze_feed(int(feed_id))
            if frozen_num > 0:
                r.rpush('freeze_feeds', feed_id)
        except Exception, e:
            logging.error(str(e) +
                          traceback.format_exc() + '\n' +
                          'Error from: freeze_feeds\n')
        feed_id = r.lpop('freeze_feeds')
def mark_story_as_read(request):
    story_ids = request.REQUEST.getlist("story_id")
    feed_id = int(request.REQUEST["feed_id"])

    try:
        usersub = UserSubscription.objects.select_related("feed").get(user=request.user, feed=feed_id)
    except Feed.DoesNotExist:
        duplicate_feed = DuplicateFeed.objects.filter(duplicate_feed_id=feed_id)
        if duplicate_feed:
            try:
                usersub = UserSubscription.objects.get(user=request.user, feed=duplicate_feed[0].feed)
            except Feed.DoesNotExist:
                return dict(code=-1)
        else:
            # No subscription and no duplicate feed to fall back on.
            return dict(code=-1)

    if not usersub.needs_unread_recalc:
        usersub.needs_unread_recalc = True
        usersub.save()

    data = dict(code=0, payload=story_ids)

    if len(story_ids) > 1:
        logging.info(" ---> [%s] ~FYRead %s stories in feed: %s" % (request.user, len(story_ids), usersub.feed))
    else:
        logging.info(" ---> [%s] ~FYRead story in feed: %s" % (request.user, usersub.feed))

    for story_id in story_ids:
        story = MStory.objects(story_feed_id=feed_id, story_guid=story_id)[0]
        now = datetime.datetime.utcnow()
        m = MUserStory(story=story, user_id=request.user.pk, feed_id=feed_id, read_date=now)
        try:
            m.save()
        except OperationError:
            logging.info(" ---> [%s] ~BRMarked story as read: Duplicate Story -> %s" % (request.user, story_id))

    return data
def count_unreads_for_subscribers(self, feed):
    UNREAD_CUTOFF = datetime.datetime.utcnow() - datetime.timedelta(days=settings.DAYS_OF_UNREAD)
    user_subs = UserSubscription.objects.filter(feed=feed,
                                                active=True,
                                                user__profile__last_seen_on__gte=UNREAD_CUTOFF)\
                                        .order_by('-last_read_date')

    if not user_subs.count():
        return

    for sub in user_subs:
        if not sub.needs_unread_recalc:
            sub.needs_unread_recalc = True
            sub.save()

    if self.options['compute_scores']:
        stories = MStory.objects(story_feed_id=feed.pk, story_date__gte=UNREAD_CUTOFF)\
                        .read_preference(pymongo.ReadPreference.PRIMARY)
        stories = Feed.format_stories(stories, feed.pk)
        logging.debug(u' ---> [%-30s] ~FYComputing scores: ~SB%s stories~SN with ~SB%s subscribers ~SN(%s/%s/%s)' % (
            feed.title[:30], len(stories), user_subs.count(),
            feed.num_subscribers, feed.active_subscribers, feed.premium_subscribers))
        self.calculate_feed_scores_with_stories(user_subs, stories)
    elif self.options.get('mongodb_replication_lag'):
        logging.debug(u' ---> [%-30s] ~BR~FYSkipping computing scores: ~SB%s seconds~SN of mongodb lag' % (
            feed.title[:30], self.options.get('mongodb_replication_lag')))
def bootstrap_userstories():
    print "Mongo DB userstories: %s" % MUserStory.objects().count()
    # db.userstories.drop()
    print "Dropped! Mongo DB userstories: %s" % MUserStory.objects().count()

    print "UserStories: %s" % UserStory.objects.all().count()
    pprint(db.userstories.index_information())

    userstories = UserStory.objects.all().values()
    for userstory in userstories:
        try:
            story = Story.objects.get(pk=userstory['story_id'])
        except Story.DoesNotExist:
            continue
        try:
            userstory['story'] = MStory.objects(story_feed_id=story.story_feed.pk, story_guid=story.story_guid)[0]
        except:
            print '!',
            continue
        print '.',
        del userstory['id']
        del userstory['opinion']
        del userstory['story_id']
        try:
            MUserStory(**userstory).save()
        except:
            print '\n\n!\n\n'
            continue

    print "\nMongo DB userstories: %s" % MUserStory.objects().count()
def mark_feed_read(self):
    if (self.unread_count_negative == 0
        and self.unread_count_neutral == 0
        and self.unread_count_positive == 0
        and not self.needs_unread_recalc):
        return

    now = datetime.datetime.utcnow()

    # Use the latest story to get last read time.
    latest_story = MStory.objects(story_feed_id=self.feed.pk).order_by('-story_date').only('story_date').limit(1)
    if latest_story and len(latest_story) >= 1:
        latest_story_date = latest_story[0]['story_date'] \
                            + datetime.timedelta(seconds=1)
    else:
        latest_story_date = now

    self.last_read_date = latest_story_date
    self.mark_read_date = latest_story_date
    self.unread_count_negative = 0
    self.unread_count_positive = 0
    self.unread_count_neutral = 0
    self.unread_count_updated = now
    self.oldest_unread_story_date = now
    self.needs_unread_recalc = False

    self.save()

    return True
def mark_feed_read(self, cutoff_date=None):
    if (self.unread_count_negative == 0
        and self.unread_count_neutral == 0
        and self.unread_count_positive == 0
        and not self.needs_unread_recalc):
        return

    recount = True
    # Use the latest story to get last read time.
    if cutoff_date:
        cutoff_date = cutoff_date + datetime.timedelta(seconds=1)
    else:
        latest_story = MStory.objects(story_feed_id=self.feed.pk).order_by('-story_date').only('story_date').limit(1)
        if latest_story and len(latest_story) >= 1:
            cutoff_date = (latest_story[0]['story_date']
                           + datetime.timedelta(seconds=1))
        else:
            cutoff_date = datetime.datetime.utcnow()
        recount = False

    self.last_read_date = cutoff_date
    self.mark_read_date = cutoff_date
    self.oldest_unread_story_date = cutoff_date
    if not recount:
        self.unread_count_negative = 0
        self.unread_count_positive = 0
        self.unread_count_neutral = 0
        self.unread_count_updated = datetime.datetime.utcnow()
        self.needs_unread_recalc = False
    else:
        self.needs_unread_recalc = True

    self.save()

    return True
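# The two paths above, in miniature: an explicit cutoff_date keeps recount=True
# and defers counts to a later recalculation, while the latest-story path zeroes
# the counts immediately. The one-second pad makes the cutoff strictly newer
# than the newest story it covers, so story_date <= cutoff checks include it.
# Dates here are illustrative only.
import datetime

latest_story_date = datetime.datetime(2013, 4, 1, 12, 0, 0)
cutoff_date = latest_story_date + datetime.timedelta(seconds=1)
assert cutoff_date > latest_story_date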
def mark_read(cls, user_id, story_feed_id, story_hash, social_user_ids=None, r=None):
    if not r:
        r = redis.Redis(connection_pool=settings.REDIS_STORY_HASH_POOL)
    # if not r2:
    #     r2 = redis.Redis(connection_pool=settings.REDIS_STORY_HASH_POOL2)

    story_hash = MStory.ensure_story_hash(story_hash, story_feed_id=story_feed_id)
    if not story_hash:
        return

    def redis_commands(key):
        r.sadd(key, story_hash)
        # r2.sadd(key, story_hash)
        r.expire(key, settings.DAYS_OF_STORY_HASHES*24*60*60)
        # r2.expire(key, settings.DAYS_OF_STORY_HASHES*24*60*60)

    all_read_stories_key = 'RS:%s' % (user_id)
    redis_commands(all_read_stories_key)

    read_story_key = 'RS:%s:%s' % (user_id, story_feed_id)
    redis_commands(read_story_key)

    if social_user_ids:
        for social_user_id in social_user_ids:
            social_read_story_key = 'RS:%s:B:%s' % (user_id, social_user_id)
            redis_commands(social_read_story_key)
def mark_feed_read(self):
    now = datetime.datetime.utcnow()

    # Use the latest story to get last read time.
    latest_story = MStory.objects(story_feed_id=self.feed.pk).order_by('-story_date').only('story_date').limit(1)
    if latest_story and len(latest_story) >= 1:
        latest_story_date = latest_story[0]['story_date'] \
                            + datetime.timedelta(seconds=1)
    else:
        latest_story_date = now

    self.last_read_date = latest_story_date
    self.mark_read_date = latest_story_date
    self.unread_count_negative = 0
    self.unread_count_positive = 0
    self.unread_count_neutral = 0
    self.unread_count_updated = now
    self.oldest_unread_story_date = now
    self.needs_unread_recalc = False

    # No longer removing old user read stories, since they're needed for social,
    # and they get cleaned up automatically when new stories come in.
    # MUserStory.delete_old_stories(self.user_id, self.feed_id)

    self.save()
def count_unreads_for_subscribers(self, feed):
    UNREAD_CUTOFF = datetime.datetime.utcnow() - datetime.timedelta(days=settings.DAYS_OF_UNREAD)
    user_subs = UserSubscription.objects.filter(feed=feed,
                                                active=True,
                                                user__profile__last_seen_on__gte=UNREAD_CUTOFF)\
                                        .order_by('-last_read_date')
    logging.debug(u' ---> [%-30s] Computing scores: %s (%s/%s/%s) subscribers' % (
        unicode(feed)[:30], user_subs.count(),
        feed.num_subscribers, feed.active_subscribers, feed.premium_subscribers))

    if self.options['slave_db']:
        slave_db = self.options['slave_db']
        stories_db_orig = slave_db.stories.find({
            "story_feed_id": feed.pk,
            "story_date": {
                "$gte": UNREAD_CUTOFF,
            },
        })
        stories_db = []
        for story in stories_db_orig:
            stories_db.append(bunch(story))
    else:
        stories_db = MStory.objects(story_feed_id=feed.pk, story_date__gte=UNREAD_CUTOFF)

    for sub in user_subs:
        cache.delete('usersub:%s' % sub.user_id)
        sub.needs_unread_recalc = True
        sub.save()

    if self.options['compute_scores']:
        for sub in user_subs:
            silent = False if self.options['verbose'] >= 2 else True
            sub.calculate_feed_scores(silent=silent, stories_db=stories_db)
def mark_story_hashes_read(cls, user_id, story_hashes, r=None, s=None):
    if not r:
        r = redis.Redis(connection_pool=settings.REDIS_STORY_HASH_POOL)
    if not s:
        s = redis.Redis(connection_pool=settings.REDIS_POOL)
    # if not r2:
    #     r2 = redis.Redis(connection_pool=settings.REDIS_STORY_HASH_POOL2)

    p = r.pipeline()
    # p2 = r2.pipeline()
    feed_ids = set()
    friend_ids = set()

    if not isinstance(story_hashes, list):
        story_hashes = [story_hashes]

    for story_hash in story_hashes:
        feed_id, _ = MStory.split_story_hash(story_hash)
        feed_ids.add(feed_id)

        # Find other social feeds with this story to update their counts
        friend_key = "F:%s:F" % (user_id)
        share_key = "S:%s" % (story_hash)
        friends_with_shares = [int(f) for f in s.sinter(share_key, friend_key)]
        friend_ids.update(friends_with_shares)

        cls.mark_read(user_id, feed_id, story_hash, social_user_ids=friends_with_shares, r=p)

    p.execute()
    # p2.execute()

    return list(feed_ids), list(friend_ids)
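# Sketch of the hash-splitting step in mark_story_hashes_read(): group incoming
# hashes by feed id before any per-feed bookkeeping. Pure-python and illustrative
# only; the real split_story_hash() lives on MStory.
def group_hashes_by_feed(story_hashes):
    by_feed = {}
    for story_hash in story_hashes:
        feed_id, _ = story_hash.split(':')
        by_feed.setdefault(int(feed_id), []).append(story_hash)
    return by_feed

assert group_hashes_by_feed(['42:a1b2c3', '42:d4e5f6', '7:abcdef']) == \
    {42: ['42:a1b2c3', '42:d4e5f6'], 7: ['7:abcdef']}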
def imagesearch(request, page_num):
    if page_num == '':
        page_num = '1'
    page_num = int(page_num)
    num_per_page = 15

    q = request.GET.get('q', None)
    if q:
        image_server = settings.FDFS_HTTP_SERVER
        index_stories = SearchStory.query(q)[:500]
        response_images = []
        for index_story in index_stories:
            story = MStory.objects(id=index_story['db_id']).first()
            if story and story.image_ids:
                for image_id in story.image_ids:
                    if len(image_id) > 20:
                        # print image_id
                        image = MImage.objects(id=image_id).first()
                        imagedict = dict(
                            image_url=image_server + image.image_remote_id,
                            story_url=story.story_guid,
                            story_title=story.story_title,
                        )
                        response_images.append(imagedict)
                        if len(response_images) >= 50:
                            return render(request, 'imagesearch.html', locals())

    return render(request, 'imagesearch.html', locals())
def mark_story_as_read(request):
    story_ids = request.REQUEST.getlist('story_id')
    feed_id = int(request.REQUEST['feed_id'])

    usersub = UserSubscription.objects.select_related('feed').get(user=request.user, feed=feed_id)

    if not usersub.needs_unread_recalc:
        usersub.needs_unread_recalc = True
        usersub.save()

    data = dict(code=0, payload=story_ids)

    if len(story_ids) > 1:
        logging.debug(" ---> [%s] Read %s stories in feed: %s" % (request.user, len(story_ids), usersub.feed))
    else:
        logging.debug(" ---> [%s] Read story in feed: %s" % (request.user, usersub.feed))

    for story_id in story_ids:
        story = MStory.objects(story_feed_id=feed_id, story_guid=story_id)[0]
        now = datetime.datetime.utcnow()
        m = MUserStory(story=story, user_id=request.user.pk, feed_id=feed_id, read_date=now)
        try:
            m.save()
        except OperationError:
            logging.info(' ---> [%s] *** Marked story as read: Duplicate Story -> %s' % (request.user, story_id))

    return data
def mark_story_ids_as_read(self, story_ids, request=None):
    data = dict(code=0, payload=story_ids)

    if not request:
        request = self.user

    if not self.needs_unread_recalc:
        self.needs_unread_recalc = True
        self.save()

    if len(story_ids) > 1:
        logging.user(request, "~FYRead %s stories in feed: %s" % (len(story_ids), self.feed))
    else:
        logging.user(request, "~FYRead story in feed: %s" % (self.feed))

    for story_id in set(story_ids):
        story, _ = MStory.find_story(story_feed_id=self.feed_id, story_id=story_id)
        if not story:
            continue
        now = datetime.datetime.utcnow()
        date = now if now > story.story_date else story.story_date  # For handling future stories
        m, _ = MUserStory.objects.get_or_create(story_id=story_id,
                                                user_id=self.user_id,
                                                feed_id=self.feed_id,
                                                defaults={
                                                    'read_date': date,
                                                    'story': story,
                                                    'story_date': story.story_date,
                                                })

    return data
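# The read_date clamp above, pulled out on its own: stories dated in the future
# are recorded as read "at" their story_date, so they cannot flip back to unread
# once that date arrives. Hypothetical helper, for illustration only.
import datetime

def read_date_for(story_date, now=None):
    now = now or datetime.datetime.utcnow()
    return now if now > story_date else story_date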
def mark_story_as_unread(request):
    story_id = request.POST['story_id']
    feed_id = int(request.POST['feed_id'])

    try:
        usersub = UserSubscription.objects.select_related('feed').get(user=request.user, feed=feed_id)
    except Feed.DoesNotExist:
        duplicate_feed = DuplicateFeed.objects.filter(duplicate_feed_id=feed_id)
        if duplicate_feed:
            try:
                usersub = UserSubscription.objects.get(user=request.user, feed=duplicate_feed[0].feed)
            except Feed.DoesNotExist:
                return dict(code=-1)
        else:
            # No subscription and no duplicate feed to fall back on.
            return dict(code=-1)

    if not usersub.needs_unread_recalc:
        usersub.needs_unread_recalc = True
        usersub.save()

    data = dict(code=0, payload=dict(story_id=story_id))
    logging.user(request, "~FY~SBUnread~SN story in feed: %s" % (usersub.feed))

    story = MStory.objects(story_feed_id=feed_id, story_guid=story_id)[0]
    m = MUserStory.objects(story=story, user_id=request.user.pk, feed_id=feed_id)
    m.delete()

    return data
def switch_feed(cls, user_id, old_feed_id, new_feed_id):
    r = redis.Redis(connection_pool=settings.REDIS_STORY_HASH_POOL)
    # r2 = redis.Redis(connection_pool=settings.REDIS_STORY_HASH_POOL2)
    p = r.pipeline()
    # p2 = r2.pipeline()
    story_hashes = cls.get_stories(user_id, old_feed_id, r=r)

    for story_hash in story_hashes:
        _, hash_story = MStory.split_story_hash(story_hash)
        new_story_hash = "%s:%s" % (new_feed_id, hash_story)

        read_feed_key = "RS:%s:%s" % (user_id, new_feed_id)
        p.sadd(read_feed_key, new_story_hash)
        # p2.sadd(read_feed_key, new_story_hash)
        p.expire(read_feed_key, settings.DAYS_OF_STORY_HASHES*24*60*60)
        # p2.expire(read_feed_key, settings.DAYS_OF_STORY_HASHES*24*60*60)

        read_user_key = "RS:%s" % (user_id)
        p.sadd(read_user_key, new_story_hash)
        # p2.sadd(read_user_key, new_story_hash)
        p.expire(read_user_key, settings.DAYS_OF_STORY_HASHES*24*60*60)
        # p2.expire(read_user_key, settings.DAYS_OF_STORY_HASHES*24*60*60)

    p.execute()
    # p2.execute()

    if len(story_hashes) > 0:
        logging.info(" ---> %s read stories" % len(story_hashes))
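# Minimal sketch of the hash rewrite switch_feed() performs when feeds are
# merged: keep the guid half of "feed_id:guid_hash" and swap the feed half.
# Hypothetical helper, not part of the codebase.
def rewrite_story_hash(story_hash, new_feed_id):
    _, guid_hash = story_hash.split(':')
    return "%s:%s" % (new_feed_id, guid_hash)

assert rewrite_story_hash('42:a1b2c3', 99) == '99:a1b2c3'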
def process(self, first_run=True):
    """ Downloads and parses a feed.
    """
    self.refresh_feed()

    ret_values = {
        ENTRY_NEW: 0,
        ENTRY_UPDATED: 0,
        ENTRY_SAME: 0,
        ENTRY_ERR: 0,
    }

    # logging.debug(u' ---> [%d] Processing %s' % (self.feed.id, self.feed.feed_title))

    self.feed.fetched_once = True
    self.feed.last_update = datetime.datetime.utcnow()

    if hasattr(self.fpf, 'status'):
        if self.options['verbose']:
            logging.debug(u' ---> [%-30s] Fetched feed, HTTP status %d: %s (bozo: %s)' % (
                unicode(self.feed)[:30],
                self.fpf.status,
                self.feed.feed_address,
                self.fpf.bozo))
            if self.fpf.bozo and self.fpf.status != 304:
                logging.debug(u' ---> [%-30s] BOZO exception: %s (%s entries)' % (
                    unicode(self.feed)[:30],
                    self.fpf.bozo_exception,
                    len(self.fpf.entries)))

        if self.fpf.status == 304:
            self.feed.save()
            self.feed.save_feed_history(304, "Not modified")
            return FEED_SAME, ret_values

        if self.fpf.status in (302, 301):
            if not self.fpf.href.endswith('feedburner.com/atom.xml'):
                self.feed.feed_address = self.fpf.href
            if first_run:
                self.feed.schedule_feed_fetch_immediately()
            if not self.fpf.entries:
                self.feed.save()
                self.feed.save_feed_history(self.fpf.status, "HTTP Redirect")
                return FEED_ERRHTTP, ret_values

        if self.fpf.status >= 400:
            logging.debug(" ---> [%-30s] HTTP Status code: %s. Checking address..." % (
                unicode(self.feed)[:30], self.fpf.status))
            fixed_feed = self.feed.check_feed_address_for_feed_link()
            if not fixed_feed:
                self.feed.save_feed_history(self.fpf.status, "HTTP Error")
            else:
                self.feed.schedule_feed_fetch_immediately()
            self.feed.save()
            return FEED_ERRHTTP, ret_values

    if self.fpf.bozo and isinstance(self.fpf.bozo_exception, feedparser.NonXMLContentType):
        if not self.fpf.entries:
            logging.debug(" ---> [%-30s] Feed is Non-XML. %s entries. Checking address..." % (
                unicode(self.feed)[:30], len(self.fpf.entries)))
            fixed_feed = self.feed.check_feed_address_for_feed_link()
            if not fixed_feed:
                self.feed.save_feed_history(502, 'Non-xml feed', self.fpf.bozo_exception)
            else:
                self.feed.schedule_feed_fetch_immediately()
            self.feed.save()
            return FEED_ERRPARSE, ret_values
    elif self.fpf.bozo and isinstance(self.fpf.bozo_exception, xml.sax._exceptions.SAXException):
        logging.debug(" ---> [%-30s] Feed is Bad XML (SAX). %s entries. Checking address..." % (
            unicode(self.feed)[:30], len(self.fpf.entries)))
        if not self.fpf.entries:
            fixed_feed = self.feed.check_feed_address_for_feed_link()
            if not fixed_feed:
                self.feed.save_feed_history(503, 'SAX Exception', self.fpf.bozo_exception)
            else:
                self.feed.schedule_feed_fetch_immediately()
            self.feed.save()
            return FEED_ERRPARSE, ret_values

    # The feed has changed (or it is the first time we parse it),
    # so save the etag and last_modified fields.
    self.feed.etag = self.fpf.get('etag')
    if self.feed.etag:
        self.feed.etag = self.feed.etag[:255]
    # some times this is None (it never should) *sigh*
    if self.feed.etag is None:
        self.feed.etag = ''

    try:
        self.feed.last_modified = mtime(self.fpf.modified)
    except:
        pass

    self.fpf.entries = self.fpf.entries[:50]

    self.feed.feed_title = self.fpf.feed.get('title', self.feed.feed_title)
    tagline = self.fpf.feed.get('tagline', self.feed.data.feed_tagline)
    if tagline:
        self.feed.data.feed_tagline = utf8encode(tagline)
        self.feed.data.save()
    self.feed.feed_link = self.fpf.feed.get('link') or self.fpf.feed.get('id') or self.feed.feed_link

    self.feed.last_update = datetime.datetime.utcnow()

    guids = []
    for entry in self.fpf.entries:
        if entry.get('id', ''):
            guids.append(entry.get('id', ''))
        elif entry.get('link'):
            guids.append(entry.link)
        elif entry.get('title'):
            guids.append(entry.title)
    self.feed.save()

    # Compare new stories to existing stories, adding and updating
    start_date = datetime.datetime.utcnow()
    # end_date = datetime.datetime.utcnow()
    story_guids = []
    for entry in self.fpf.entries:
        story = pre_process_story(entry)
        if story.get('published') < start_date:
            start_date = story.get('published')
        # if story.get('published') > end_date:
        #     end_date = story.get('published')
        story_guids.append(story.get('guid') or story.get('link'))

    existing_stories = list(MStory.objects(
        # story_guid__in=story_guids,
        story_date__gte=start_date,
        story_feed_id=self.feed.pk
    ).limit(len(story_guids)))
    # MStory.objects(
    #     (Q(story_date__gte=start_date) & Q(story_date__lte=end_date))
    #     | (Q(story_guid__in=story_guids)),
    #     story_feed=self.feed
    # ).order_by('-story_date')

    ret_values = self.feed.add_update_stories(self.fpf.entries, existing_stories)

    logging.debug(u' ---> [%-30s] Parsed Feed: %s' % (
        unicode(self.feed)[:30],
        u' '.join(u'%s=%d' % (self.entry_trans[key], ret_values[key]) for key in self.entry_keys),
    ))
    self.feed.update_all_statistics()
    self.feed.trim_feed()
    self.feed.save_feed_history(200, "OK")

    return FEED_OK, ret_values
def calculate_feed_scores(self, silent=False, stories=None, force=False):
    # now = datetime.datetime.strptime("2009-07-06 22:30:03", "%Y-%m-%d %H:%M:%S")
    now = datetime.datetime.now()
    UNREAD_CUTOFF = now - datetime.timedelta(days=settings.DAYS_OF_UNREAD)

    if self.user.profile.last_seen_on < UNREAD_CUTOFF and not force:
        # if not silent:
        #     logging.info(' ---> [%s] SKIPPING Computing scores: %s (1 week+)' % (self.user, self.feed))
        return
    # if not self.feed.fetched_once:
    #     if not silent:
    #         logging.info(' ---> [%s] NOT Computing scores: %s' % (self.user, self.feed))
    #     self.needs_unread_recalc = False
    #     self.save()
    #     return

    feed_scores = dict(negative=0, neutral=0, positive=0)

    # Two weeks in age. If mark_read_date is older, mark old stories as read.
    date_delta = UNREAD_CUTOFF
    if date_delta < self.mark_read_date:
        date_delta = self.mark_read_date
    else:
        self.mark_read_date = date_delta

    if not stories:
        stories = cache.get('S:%s' % self.feed_id)

    unread_story_hashes = self.get_stories(read_filter='unread', limit=500, hashes_only=True)

    if not stories:
        stories_db = MStory.objects(story_hash__in=unread_story_hashes)
        stories = Feed.format_stories(stories_db, self.feed_id)

    oldest_unread_story_date = now
    unread_stories = []
    for story in stories:
        if story['story_date'] < date_delta:
            continue
        if story['story_hash'] in unread_story_hashes:
            unread_stories.append(story)
            if story['story_date'] < oldest_unread_story_date:
                oldest_unread_story_date = story['story_date']

    # if not silent:
    #     logging.info(' ---> [%s] Format stories: %s' % (self.user, datetime.datetime.now() - now))

    classifier_feeds = list(MClassifierFeed.objects(user_id=self.user_id, feed_id=self.feed_id, social_user_id=0))
    classifier_authors = list(MClassifierAuthor.objects(user_id=self.user_id, feed_id=self.feed_id))
    classifier_titles = list(MClassifierTitle.objects(user_id=self.user_id, feed_id=self.feed_id))
    classifier_tags = list(MClassifierTag.objects(user_id=self.user_id, feed_id=self.feed_id))

    # if not silent:
    #     logging.info(' ---> [%s] Classifiers: %s (%s)' % (self.user, datetime.datetime.now() - now, classifier_feeds.count() + classifier_authors.count() + classifier_tags.count() + classifier_titles.count()))

    scores = {
        'feed': apply_classifier_feeds(classifier_feeds, self.feed),
    }

    for story in unread_stories:
        scores.update({
            'author': apply_classifier_authors(classifier_authors, story),
            'tags': apply_classifier_tags(classifier_tags, story),
            'title': apply_classifier_titles(classifier_titles, story),
        })

        max_score = max(scores['author'], scores['tags'], scores['title'])
        min_score = min(scores['author'], scores['tags'], scores['title'])
        if max_score > 0:
            feed_scores['positive'] += 1
        elif min_score < 0:
            feed_scores['negative'] += 1
        else:
            if scores['feed'] > 0:
                feed_scores['positive'] += 1
            elif scores['feed'] < 0:
                feed_scores['negative'] += 1
            else:
                feed_scores['neutral'] += 1

    # if not silent:
    #     logging.info(' ---> [%s] End classifiers: %s' % (self.user, datetime.datetime.now() - now))

    self.unread_count_positive = feed_scores['positive']
    self.unread_count_neutral = feed_scores['neutral']
    self.unread_count_negative = feed_scores['negative']
    self.unread_count_updated = datetime.datetime.now()
    self.oldest_unread_story_date = oldest_unread_story_date
    self.needs_unread_recalc = False

    self.save()

    if (self.unread_count_positive == 0 and
        self.unread_count_neutral == 0):
        self.mark_feed_read()

    if not silent:
        logging.user(self.user, '~FC~SNComputing scores: %s (~SB%s~SN/~SB%s~SN/~SB%s~SN)' % (
            self.feed, feed_scores['negative'], feed_scores['neutral'], feed_scores['positive']))

    return self
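# The score bucketing used above, isolated: any story-level classifier
# (author/tag/title) wins outright; the feed-level score only breaks ties.
# Hypothetical helper, for illustration only.
def bucket(feed_score, author, tags, title):
    if max(author, tags, title) > 0:
        return 'positive'
    if min(author, tags, title) < 0:
        return 'negative'
    if feed_score > 0:
        return 'positive'
    if feed_score < 0:
        return 'negative'
    return 'neutral'

assert bucket(0, 1, 0, -1) == 'positive'   # a positive story classifier wins
assert bucket(1, 0, 0, 0) == 'positive'    # the feed score only breaks the tie
assert bucket(-1, 0, 0, 0) == 'negative'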
def get_stories(self, offset=0, limit=6, order='newest', read_filter='all',
                withscores=False, hashes_only=False):
    r = redis.Redis(connection_pool=settings.REDIS_STORY_HASH_POOL)
    ignore_user_stories = False

    stories_key = 'F:%s' % (self.feed_id)
    read_stories_key = 'RS:%s:%s' % (self.user_id, self.feed_id)
    unread_stories_key = 'U:%s:%s' % (self.user_id, self.feed_id)

    unread_ranked_stories_key = 'z%sU:%s:%s' % ('h' if hashes_only else '',
                                                self.user_id, self.feed_id)
    if offset and not withscores and r.exists(unread_ranked_stories_key):
        pass
    else:
        r.delete(unread_ranked_stories_key)
        if not r.exists(stories_key):
            print " ---> No stories on feed: %s" % self
            return []
        elif read_filter != 'unread' or not r.exists(read_stories_key):
            ignore_user_stories = True
            unread_stories_key = stories_key
        else:
            r.sdiffstore(unread_stories_key, stories_key, read_stories_key)
        sorted_stories_key = 'zF:%s' % (self.feed_id)
        r.zinterstore(unread_ranked_stories_key, [sorted_stories_key, unread_stories_key])

    current_time = int(time.time() + 60*60*24)
    if order == 'oldest':
        byscorefunc = r.zrangebyscore
        if read_filter == 'unread':
            min_score = int(time.mktime(self.mark_read_date.timetuple())) + 1
        else:
            now = datetime.datetime.now()
            two_weeks_ago = now - datetime.timedelta(days=settings.DAYS_OF_UNREAD)
            min_score = int(time.mktime(two_weeks_ago.timetuple())) - 1000
        max_score = current_time
    else:
        byscorefunc = r.zrevrangebyscore
        min_score = current_time
        if read_filter == 'unread':
            # +1 for the intersection b/w zF and F, which carries an implicit score of 1.
            max_score = int(time.mktime(self.mark_read_date.timetuple())) + 1
        else:
            max_score = 0

    if settings.DEBUG:
        debug_stories = r.zrevrange(unread_ranked_stories_key, 0, -1, withscores=True)
        print " ---> Unread all stories (%s - %s) %s stories: %s" % (
            min_score, max_score, len(debug_stories), debug_stories)

    story_ids = byscorefunc(unread_ranked_stories_key, min_score, max_score,
                            start=offset, num=500, withscores=withscores)[:limit]
    r.expire(unread_ranked_stories_key, 24*60*60)
    if not ignore_user_stories:
        r.delete(unread_stories_key)

    if withscores or hashes_only:
        return story_ids
    elif story_ids:
        story_date_order = "%sstory_date" % ('' if order == 'oldest' else '-')
        mstories = MStory.objects(story_hash__in=story_ids).order_by(story_date_order)
        stories = Feed.format_stories(mstories)
        return stories
    else:
        return []
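# Pure-python sketch of the redis set algebra above: unread = F - RS
# (sdiffstore), then intersect with the scored zF (zinterstore) to recover
# story timestamps for range queries. Keys and values are illustrative.
feed_hashes = set(['42:aaaaaa', '42:bbbbbb', '42:cccccc'])      # F:<feed_id>
read_hashes = set(['42:bbbbbb'])                                # RS:<user>:<feed_id>
scores = {'42:aaaaaa': 1338500000, '42:bbbbbb': 1338510000,
          '42:cccccc': 1338520000}                              # zF:<feed_id>

unread = feed_hashes - read_hashes
ranked = sorted(unread, key=lambda h: scores[h], reverse=True)  # order == 'newest'
assert ranked == ['42:cccccc', '42:aaaaaa']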
#!/usr/bin/env python

from utils.munin.base import MuninGraph
from apps.rss_feeds.models import MStory
from apps.reader.models import MUserStory

graph_config = {
    'graph_category': 'NewsBlur',
    'graph_title': 'NewsBlur Stories',
    'graph_vlabel': 'Stories',
    'stories.label': 'stories',
    'tags.label': 'tags',
    'authors.label': 'authors',
    'read_stories.label': 'read_stories',
}

metrics = {
    'stories': MStory.objects().count(),
    'read_stories': MUserStory.objects().count(),
}

if __name__ == '__main__':
    MuninGraph(graph_config, metrics).run()
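# Assuming MuninGraph follows the standard munin plugin protocol, running the
# script prints one "<metric>.value" line per entry in metrics (the script
# name and numbers below are illustrative, not from the codebase):
#
#   $ ./newsblur_stories
#   stories.value 12345678
#   read_stories.value 2345678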
class UserSubscription(models.Model):
    """
    A feed which a user has subscribed to. Carries all of the cached information
    about the subscription, including unread counts of the three primary scores.

    Also has a dirty flag (needs_unread_recalc) which means that the unread counts
    are not accurate and need to be calculated with `self.calculate_feed_scores()`.
    """

    UNREAD_CUTOFF = datetime.datetime.utcnow() - datetime.timedelta(days=settings.DAYS_OF_UNREAD)

    user = models.ForeignKey(User, related_name='subscriptions')
    feed = models.ForeignKey(Feed, related_name='subscribers')
    user_title = models.CharField(max_length=255, null=True, blank=True)
    active = models.BooleanField(default=False)
    last_read_date = models.DateTimeField(default=UNREAD_CUTOFF)
    mark_read_date = models.DateTimeField(default=UNREAD_CUTOFF)
    unread_count_neutral = models.IntegerField(default=0)
    unread_count_positive = models.IntegerField(default=0)
    unread_count_negative = models.IntegerField(default=0)
    unread_count_updated = models.DateTimeField(default=datetime.datetime.now)
    oldest_unread_story_date = models.DateTimeField(default=datetime.datetime.now)
    needs_unread_recalc = models.BooleanField(default=False)
    feed_opens = models.IntegerField(default=0)
    is_trained = models.BooleanField(default=False)

    objects = UserSubscriptionManager()

    def __unicode__(self):
        return '[%s (%s): %s (%s)] ' % (self.user.username, self.user.pk,
                                        self.feed.feed_title, self.feed.pk)

    class Meta:
        unique_together = ("user", "feed")

    def canonical(self, full=False, include_favicon=True, classifiers=None):
        feed = self.feed.canonical(full=full, include_favicon=include_favicon)
        feed['feed_title'] = self.user_title or feed['feed_title']
        feed['ps'] = self.unread_count_positive
        feed['nt'] = self.unread_count_neutral
        feed['ng'] = self.unread_count_negative
        feed['active'] = self.active
        feed['feed_opens'] = self.feed_opens
        feed['subscribed'] = True
        if classifiers:
            feed['classifiers'] = classifiers

        if not self.active and self.user.profile.is_premium:
            feed['active'] = True
            self.active = True
            self.save()

        return feed

    def save(self, *args, **kwargs):
        user_title_max = self._meta.get_field('user_title').max_length
        if self.user_title and len(self.user_title) > user_title_max:
            self.user_title = self.user_title[:user_title_max]
        if not self.active and self.user.profile.is_premium:
            self.active = True
        try:
            super(UserSubscription, self).save(*args, **kwargs)
        except IntegrityError:
            duplicate_feeds = DuplicateFeed.objects.filter(duplicate_feed_id=self.feed_id)
            for duplicate_feed in duplicate_feeds:
                already_subscribed = UserSubscription.objects.filter(user=self.user, feed=duplicate_feed.feed)
                if not already_subscribed:
                    self.feed = duplicate_feed.feed
                    super(UserSubscription, self).save(*args, **kwargs)
                    break
            else:
                self.delete()

    @classmethod
    def sync_all_redis(cls, user_id, skip_feed=False):
        us = cls.objects.filter(user=user_id)
        for sub in us:
            print " ---> Syncing usersub: %s" % sub
            sub.sync_redis(skip_feed=skip_feed)

    def sync_redis(self, skip_feed=False):
        r = redis.Redis(connection_pool=settings.REDIS_STORY_POOL)

        if not skip_feed:
            self.feed.sync_redis()

        userstories = MUserStory.objects.filter(feed_id=self.feed_id, user_id=self.user_id)
        for userstory in userstories:
            userstory.sync_redis(r=r)

    def get_stories(self, offset=0, limit=6, order='newest', read_filter='all', withscores=False):
        r = redis.Redis(connection_pool=settings.REDIS_STORY_POOL)
        ignore_user_stories = False

        stories_key = 'F:%s' % (self.feed_id)
        read_stories_key = 'RS:%s:%s' % (self.user_id, self.feed_id)
        unread_stories_key = 'U:%s:%s' % (self.user_id, self.feed_id)

        if not r.exists(stories_key):
            print " ---> No stories on feed: %s" % self
            return []
        elif read_filter != 'unread' or not r.exists(read_stories_key):
            ignore_user_stories = True
            unread_stories_key = stories_key
        else:
            r.sdiffstore(unread_stories_key, stories_key, read_stories_key)

        sorted_stories_key = 'zF:%s' % (self.feed_id)
        unread_ranked_stories_key = 'zU:%s:%s' % (self.user_id, self.feed_id)
        r.zinterstore(unread_ranked_stories_key, [sorted_stories_key, unread_stories_key])

        current_time = int(time.time() + 60 * 60 * 24)
        if order == 'oldest':
            byscorefunc = r.zrangebyscore
            if read_filter == 'unread' or True:
                min_score = int(time.mktime(self.mark_read_date.timetuple())) + 1
            else:
                now = datetime.datetime.now()
                two_weeks_ago = now - datetime.timedelta(days=settings.DAYS_OF_UNREAD)
                min_score = int(time.mktime(two_weeks_ago.timetuple())) - 1000
            max_score = current_time
        else:
            byscorefunc = r.zrevrangebyscore
            min_score = current_time
            # +1 for the intersection b/w zF and F, which carries an implicit score of 1.
            max_score = int(time.mktime(self.mark_read_date.timetuple())) + 1

        if settings.DEBUG:
            print " ---> Unread all stories: %s" % r.zrevrange(unread_ranked_stories_key, 0, -1)

        story_ids = byscorefunc(unread_ranked_stories_key, min_score, max_score,
                                start=offset, num=limit, withscores=withscores)
        r.expire(unread_ranked_stories_key, 24 * 60 * 60)
        if not ignore_user_stories:
            r.delete(unread_stories_key)

        # XXX TODO: Remove below line after combing redis for these None's.
        story_ids = [s for s in story_ids if s and s != 'None']  # ugh, hack

        return story_ids

    @classmethod
    def feed_stories(cls, user_id, feed_ids, offset=0, limit=6, order='newest', read_filter='all'):
        r = redis.Redis(connection_pool=settings.REDIS_STORY_POOL)

        if order == 'oldest':
            range_func = r.zrange
        else:
            range_func = r.zrevrange

        if not isinstance(feed_ids, list):
            feed_ids = [feed_ids]

        unread_ranked_stories_keys = 'zU:%s' % (user_id)
        if offset and r.exists(unread_ranked_stories_keys):
            story_guids = range_func(unread_ranked_stories_keys, offset, limit)
            return story_guids
        else:
            r.delete(unread_ranked_stories_keys)

        for feed_id in feed_ids:
            try:
                us = cls.objects.get(user=user_id, feed=feed_id)
            except cls.DoesNotExist:
                continue
            story_guids = us.get_stories(offset=0, limit=200,
                                         order=order, read_filter=read_filter,
                                         withscores=True)
            if story_guids:
                r.zadd(unread_ranked_stories_keys, **dict(story_guids))

        story_guids = range_func(unread_ranked_stories_keys, offset, limit)
        r.expire(unread_ranked_stories_keys, 24 * 60 * 60)

        return story_guids

    @classmethod
    def add_subscription(cls, user, feed_address, folder=None, bookmarklet=False,
                         auto_active=True, skip_fetch=False):
        feed = None
        us = None

        logging.user(user, "~FRAdding URL: ~SB%s (in %s) %s" % (
            feed_address, folder, "~FCAUTO-ADD" if not auto_active else ""))

        feed = Feed.get_feed_from_url(feed_address)

        if not feed:
            code = -1
            if bookmarklet:
                message = "This site does not have an RSS feed. Nothing is linked to from this page."
            else:
                message = "This address does not point to an RSS feed or a website with an RSS feed."
        else:
            us, subscription_created = cls.objects.get_or_create(
                feed=feed,
                user=user,
                defaults={
                    'needs_unread_recalc': True,
                    'active': auto_active,
                }
            )
            code = 1
            message = ""

        if us:
            user_sub_folders_object, created = UserSubscriptionFolders.objects.get_or_create(
                user=user,
                defaults={'folders': '[]'}
            )
            if created:
                user_sub_folders = []
            else:
                user_sub_folders = json.decode(user_sub_folders_object.folders)
            user_sub_folders = add_object_to_folder(feed.pk, folder, user_sub_folders)
            user_sub_folders_object.folders = json.encode(user_sub_folders)
            user_sub_folders_object.save()

            if auto_active or user.profile.is_premium:
                us.active = True
                us.save()

            if not skip_fetch and feed.last_update < datetime.datetime.utcnow() - datetime.timedelta(days=1):
                feed = feed.update()

            from apps.social.models import MActivity
            MActivity.new_feed_subscription(user_id=user.pk, feed_id=feed.pk, feed_title=feed.title)

            feed.setup_feed_for_premium_subscribers()

        return code, message, us

    @classmethod
    def feeds_with_updated_counts(cls, user, feed_ids=None, check_fetch_status=False):
        feeds = {}

        # Get subscriptions for user
        user_subs = cls.objects.select_related('feed').filter(user=user, active=True)
        feed_ids = [f for f in feed_ids if f and not f.startswith('river')]
        if feed_ids:
            user_subs = user_subs.filter(feed__in=feed_ids)

        UNREAD_CUTOFF = datetime.datetime.utcnow() - datetime.timedelta(days=settings.DAYS_OF_UNREAD)

        for i, sub in enumerate(user_subs):
            # Count unreads if subscription is stale.
            if (sub.needs_unread_recalc or
                sub.unread_count_updated < UNREAD_CUTOFF or
                sub.oldest_unread_story_date < UNREAD_CUTOFF):
                sub = sub.calculate_feed_scores(silent=True)
            if not sub:
                continue  # TODO: Figure out the correct sub and give it a new feed_id

            feed_id = sub.feed_id
            feeds[feed_id] = {
                'ps': sub.unread_count_positive,
                'nt': sub.unread_count_neutral,
                'ng': sub.unread_count_negative,
                'id': feed_id,
            }
            if not sub.feed.fetched_once or check_fetch_status:
                feeds[feed_id]['fetched_once'] = sub.feed.fetched_once
                feeds[feed_id]['not_yet_fetched'] = not sub.feed.fetched_once  # Legacy. Dammit.
            if sub.feed.favicon_fetching:
                feeds[feed_id]['favicon_fetching'] = True
            if sub.feed.has_feed_exception or sub.feed.has_page_exception:
                feeds[feed_id]['has_exception'] = True
                feeds[feed_id]['exception_type'] = 'feed' if sub.feed.has_feed_exception else 'page'
                feeds[feed_id]['feed_address'] = sub.feed.feed_address
                feeds[feed_id]['exception_code'] = sub.feed.exception_code

        return feeds

    def mark_feed_read(self):
        now = datetime.datetime.utcnow()

        # Use the latest story to get last read time.
        latest_story = MStory.objects(story_feed_id=self.feed.pk).order_by('-story_date').only('story_date').limit(1)
        if latest_story and len(latest_story) >= 1:
            latest_story_date = latest_story[0]['story_date'] \
                                + datetime.timedelta(seconds=1)
        else:
            latest_story_date = now

        self.last_read_date = latest_story_date
        self.mark_read_date = latest_story_date
        self.unread_count_negative = 0
        self.unread_count_positive = 0
        self.unread_count_neutral = 0
        self.unread_count_updated = now
        self.oldest_unread_story_date = now
        self.needs_unread_recalc = False

        # No longer removing old user read stories, since they're needed for social,
        # and they get cleaned up automatically when new stories come in.
        # MUserStory.delete_old_stories(self.user_id, self.feed_id)

        self.save()

    def mark_story_ids_as_read(self, story_ids, request=None):
        data = dict(code=0, payload=story_ids)

        if not request:
            request = self.user

        if not self.needs_unread_recalc:
            self.needs_unread_recalc = True
            self.save()

        if len(story_ids) > 1:
            logging.user(request, "~FYRead %s stories in feed: %s" % (len(story_ids), self.feed))
        else:
            logging.user(request, "~FYRead story in feed: %s" % (self.feed))

        for story_id in set(story_ids):
            try:
                story = MStory.objects.get(story_feed_id=self.feed_id, story_guid=story_id)
            except MStory.DoesNotExist:
                # Story has been deleted, probably by feed_fetcher.
                continue
            except MStory.MultipleObjectsReturned:
                story = MStory.objects.filter(story_feed_id=self.feed_id, story_guid=story_id)[0]
            now = datetime.datetime.utcnow()
            date = now if now > story.story_date else story.story_date  # For handling future stories
            m, _ = MUserStory.objects.get_or_create(story=story,
                                                    user_id=self.user_id,
                                                    feed_id=self.feed_id,
                                                    defaults={
                                                        'read_date': date,
                                                        'story_id': story_id,
                                                        'story_date': story.story_date,
                                                    })

        return data

    def calculate_feed_scores(self, silent=False, stories_db=None):
        # now = datetime.datetime.strptime("2009-07-06 22:30:03", "%Y-%m-%d %H:%M:%S")
        now = datetime.datetime.now()
        UNREAD_CUTOFF = now - datetime.timedelta(days=settings.DAYS_OF_UNREAD)

        if self.user.profile.last_seen_on < UNREAD_CUTOFF:
            # if not silent:
            #     logging.info(' ---> [%s] SKIPPING Computing scores: %s (1 week+)' % (self.user, self.feed))
            return

        if not self.feed.fetched_once:
            if not silent:
                logging.info(' ---> [%s] NOT Computing scores: %s' % (self.user, self.feed))
            self.needs_unread_recalc = False
            self.save()
            return

        feed_scores = dict(negative=0, neutral=0, positive=0)

        # Two weeks in age. If mark_read_date is older, mark old stories as read.
        date_delta = UNREAD_CUTOFF
        if date_delta < self.mark_read_date:
            date_delta = self.mark_read_date
        else:
            self.mark_read_date = date_delta

        read_stories = MUserStory.objects(user_id=self.user_id,
                                          feed_id=self.feed_id,
                                          read_date__gte=self.mark_read_date)
        # if not silent:
        #     logging.info(' ---> [%s] Read stories: %s' % (self.user, datetime.datetime.now() - now))
        read_stories_ids = [us.story_id for us in read_stories]

        stories_db = stories_db or MStory.objects(story_feed_id=self.feed_id,
                                                  story_date__gte=date_delta)
        # if not silent:
        #     logging.info(' ---> [%s] MStory: %s' % (self.user, datetime.datetime.now() - now))
        oldest_unread_story_date = now
        unread_stories_db = []
        for story in stories_db:
            if story.story_date < date_delta:
                continue
            if hasattr(story, 'story_guid') and story.story_guid not in read_stories_ids:
                unread_stories_db.append(story)
                if story.story_date < oldest_unread_story_date:
                    oldest_unread_story_date = story.story_date
        stories = Feed.format_stories(unread_stories_db, self.feed_id)
        # if not silent:
        #     logging.info(' ---> [%s] Format stories: %s' % (self.user, datetime.datetime.now() - now))

        classifier_feeds = list(MClassifierFeed.objects(user_id=self.user_id,
                                                        feed_id=self.feed_id,
                                                        social_user_id=0))
        classifier_authors = list(MClassifierAuthor.objects(user_id=self.user_id, feed_id=self.feed_id))
        classifier_titles = list(MClassifierTitle.objects(user_id=self.user_id, feed_id=self.feed_id))
        classifier_tags = list(MClassifierTag.objects(user_id=self.user_id, feed_id=self.feed_id))

        # if not silent:
        #     logging.info(' ---> [%s] Classifiers: %s (%s)' % (self.user, datetime.datetime.now() - now, classifier_feeds.count() + classifier_authors.count() + classifier_tags.count() + classifier_titles.count()))

        scores = {
            'feed': apply_classifier_feeds(classifier_feeds, self.feed),
        }

        for story in stories:
            scores.update({
                'author': apply_classifier_authors(classifier_authors, story),
                'tags': apply_classifier_tags(classifier_tags, story),
                'title': apply_classifier_titles(classifier_titles, story),
            })

            max_score = max(scores['author'], scores['tags'], scores['title'])
            min_score = min(scores['author'], scores['tags'], scores['title'])
            if max_score > 0:
                feed_scores['positive'] += 1
            elif min_score < 0:
                feed_scores['negative'] += 1
            else:
                if scores['feed'] > 0:
                    feed_scores['positive'] += 1
                elif scores['feed'] < 0:
                    feed_scores['negative'] += 1
                else:
                    feed_scores['neutral'] += 1

        # if not silent:
        #     logging.info(' ---> [%s] End classifiers: %s' % (self.user, datetime.datetime.now() - now))

        self.unread_count_positive = feed_scores['positive']
        self.unread_count_neutral = feed_scores['neutral']
        self.unread_count_negative = feed_scores['negative']
        self.unread_count_updated = datetime.datetime.now()
        self.oldest_unread_story_date = oldest_unread_story_date
        self.needs_unread_recalc = False

        self.save()

        if (self.unread_count_positive == 0 and
            self.unread_count_neutral == 0 and
            self.unread_count_negative == 0):
            self.mark_feed_read()

        if not silent:
            logging.info(' ---> [%s] Computing scores: %s (%s/%s/%s)' % (
                self.user, self.feed,
                feed_scores['negative'], feed_scores['neutral'], feed_scores['positive']))

        return self

    def switch_feed(self, new_feed, old_feed):
        # Rewrite feed in subscription folders
        try:
            user_sub_folders = UserSubscriptionFolders.objects.get(user=self.user)
        except Exception, e:
            logging.info(" *** ---> UserSubscriptionFolders error: %s" % e)
            return

        # Switch to original feed for the user subscription
        logging.info(" ===> %s " % self.user)
        self.feed = new_feed
        self.needs_unread_recalc = True
        try:
            self.save()
            user_sub_folders.rewrite_feed(new_feed, old_feed)
        except (IntegrityError, OperationError):
            logging.info(" !!!!> %s already subscribed" % self.user)
            self.delete()
            return

        # Switch read stories
        user_stories = MUserStory.objects(user_id=self.user_id, feed_id=old_feed.pk)
        if user_stories.count() > 0:
            logging.info(" ---> %s read stories" % user_stories.count())

        for user_story in user_stories:
            user_story.feed_id = new_feed.pk
            duplicate_story = user_story.story
            story_guid = duplicate_story.story_guid if hasattr(duplicate_story, 'story_guid') \
                         else duplicate_story.id
            original_story = MStory.objects(story_feed_id=new_feed.pk, story_guid=story_guid)

            if original_story:
                user_story.story = original_story[0]
                try:
                    user_story.save()
                except OperationError:
                    # User read the story in the original feed, too. Ugh, just ignore it.
                    pass
            else:
                logging.info(" ***> Can't find original story: %s" % duplicate_story.id)
                user_story.delete()

        def switch_feed_for_classifier(model):
            duplicates = model.objects(feed_id=old_feed.pk, user_id=self.user_id)
            if duplicates.count():
                logging.info(" ---> Switching %s %s" % (duplicates.count(), model))
            for duplicate in duplicates:
                duplicate.feed_id = new_feed.pk
                try:
                    duplicate.save()
                    pass
                except (IntegrityError, OperationError):
                    logging.info(" !!!!> %s already exists" % duplicate)
                    duplicate.delete()

        switch_feed_for_classifier(MClassifierTitle)
        switch_feed_for_classifier(MClassifierAuthor)
        switch_feed_for_classifier(MClassifierFeed)
        switch_feed_for_classifier(MClassifierTag)
def calculate_feed_scores(self, silent=False, stories_db=None):
    now = datetime.datetime.utcnow()
    UNREAD_CUTOFF = now - datetime.timedelta(days=settings.DAYS_OF_UNREAD)

    if self.user.profile.last_seen_on < UNREAD_CUTOFF:
        # if not silent:
        #     logging.info('   ---> [%s] SKIPPING Computing scores: %s (1 week+)' % (self.user, self.feed))
        return

    if not self.feed.fetched_once:
        if not silent:
            logging.info('   ---> [%s] NOT Computing scores: %s' % (self.user, self.feed))
        self.needs_unread_recalc = False
        self.save()
        return

    if not silent:
        logging.info('   ---> [%s] Computing scores: %s' % (self.user, self.feed))

    feed_scores = dict(negative=0, neutral=0, positive=0)

    # Two weeks in age. If mark_read_date is older, mark old stories as read.
    date_delta = UNREAD_CUTOFF
    if date_delta < self.mark_read_date:
        date_delta = self.mark_read_date
    else:
        self.mark_read_date = date_delta

    read_stories = MUserStory.objects(user_id=self.user.pk, feed_id=self.feed.pk, read_date__gte=self.mark_read_date)
    # if not silent:
    #     logging.info('   ---> [%s] Read stories: %s' % (self.user, datetime.datetime.now() - now))
    read_stories_ids = []
    for us in read_stories:
        if hasattr(us.story, 'story_guid') and isinstance(us.story.story_guid, unicode):
            read_stories_ids.append(us.story.story_guid)
        elif hasattr(us.story, 'id') and isinstance(us.story.id, unicode):
            read_stories_ids.append(us.story.id)  # TODO: Remove me after migration from story.id->guid
    stories_db = stories_db or MStory.objects(story_feed_id=self.feed.pk, story_date__gte=date_delta)
    # if not silent:
    #     logging.info('   ---> [%s] MStory: %s' % (self.user, datetime.datetime.now() - now))
    oldest_unread_story_date = now
    unread_stories_db = []
    for story in stories_db:
        if story.story_date < date_delta:
            continue
        if hasattr(story, 'story_guid') and story.story_guid not in read_stories_ids:
            unread_stories_db.append(story)
            if story.story_date < oldest_unread_story_date:
                oldest_unread_story_date = story.story_date
    stories = Feed.format_stories(unread_stories_db, self.feed.pk)
    # if not silent:
    #     logging.info('   ---> [%s] Format stories: %s' % (self.user, datetime.datetime.now() - now))

    classifier_feeds = list(MClassifierFeed.objects(user_id=self.user.pk, feed_id=self.feed.pk))
    classifier_authors = list(MClassifierAuthor.objects(user_id=self.user.pk, feed_id=self.feed.pk))
    classifier_titles = list(MClassifierTitle.objects(user_id=self.user.pk, feed_id=self.feed.pk))
    classifier_tags = list(MClassifierTag.objects(user_id=self.user.pk, feed_id=self.feed.pk))

    # if not silent:
    #     logging.info('   ---> [%s] Classifiers: %s (%s)' % (self.user, datetime.datetime.now() - now, classifier_feeds.count() + classifier_authors.count() + classifier_tags.count() + classifier_titles.count()))

    scores = {
        'feed': apply_classifier_feeds(classifier_feeds, self.feed),
    }

    for story in stories:
        scores.update({
            'author': apply_classifier_authors(classifier_authors, story),
            'tags': apply_classifier_tags(classifier_tags, story),
            'title': apply_classifier_titles(classifier_titles, story),
        })

        max_score = max(scores['author'], scores['tags'], scores['title'])
        min_score = min(scores['author'], scores['tags'], scores['title'])
        if max_score > 0:
            feed_scores['positive'] += 1
        elif min_score < 0:
            feed_scores['negative'] += 1
        else:
            if scores['feed'] > 0:
                feed_scores['positive'] += 1
            elif scores['feed'] < 0:
                feed_scores['negative'] += 1
            else:
                feed_scores['neutral'] += 1

    # if not silent:
    #     logging.info('   ---> [%s] End classifiers: %s' % (self.user, datetime.datetime.now() - now))

    self.unread_count_positive = feed_scores['positive']
    self.unread_count_neutral = feed_scores['neutral']
    self.unread_count_negative = feed_scores['negative']
    self.unread_count_updated = datetime.datetime.now()
    self.oldest_unread_story_date = oldest_unread_story_date
    self.needs_unread_recalc = False

    self.save()

    # if (self.unread_count_positive == 0 and
    #     self.unread_count_neutral == 0):
    #     self.mark_feed_read()

    cache.delete('usersub:%s' % self.user.id)

    return
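The per-story bucketing above reduces to a small pure function: any positive story-level classifier score (author, tag, or title) wins, any negative one loses, and the feed-level score only breaks ties. A minimal sketch of just that decision, extracted for clarity:

def bucket_story(author, tags, title, feed):
    # Mirrors calculate_feed_scores: story-level classifiers take
    # precedence over the feed-level classifier.
    if max(author, tags, title) > 0:
        return 'positive'
    if min(author, tags, title) < 0:
        return 'negative'
    if feed > 0:
        return 'positive'
    if feed < 0:
        return 'negative'
    return 'neutral'

assert bucket_story(1, 0, 0, -1) == 'positive'   # story signal beats feed signal
assert bucket_story(0, 0, 0, -1) == 'negative'   # feed signal breaks the tie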
def api_unread_story(request, trigger_slug=None):
    user = request.user
    body = request.body_json
    after = body.get('after', None)
    before = body.get('before', None)
    limit = body.get('limit', 50)
    fields = body.get('triggerFields')
    feed_or_folder = fields['feed_or_folder']
    entries = []

    if isinstance(feed_or_folder, int) or feed_or_folder.isdigit():
        feed_id = int(feed_or_folder)
        try:
            usersub = UserSubscription.objects.get(user=user, feed_id=feed_id)
        except UserSubscription.DoesNotExist:
            return dict(data=[])
        found_feed_ids = [feed_id]
        found_trained_feed_ids = [feed_id] if usersub.is_trained else []
        stories = usersub.get_stories(order="newest", read_filter="unread",
                                      offset=0, limit=limit,
                                      default_cutoff_date=user.profile.unread_cutoff)
    else:
        folder_title = feed_or_folder
        if folder_title == "Top Level":
            folder_title = " "
        usf = UserSubscriptionFolders.objects.get(user=user)
        flat_folders = usf.flatten_folders()
        feed_ids = None
        if folder_title != "all":
            feed_ids = flat_folders.get(folder_title)
        usersubs = UserSubscription.subs_for_feeds(user.pk, feed_ids=feed_ids, read_filter="unread")
        feed_ids = [sub.feed_id for sub in usersubs]
        params = {
            "user_id": user.pk,
            "feed_ids": feed_ids,
            "offset": 0,
            "limit": limit,
            "order": "newest",
            "read_filter": "unread",
            "usersubs": usersubs,
            "cutoff_date": user.profile.unread_cutoff,
        }
        story_hashes, unread_feed_story_hashes = UserSubscription.feed_stories(**params)
        mstories = MStory.objects(story_hash__in=story_hashes).order_by('-story_date')
        stories = Feed.format_stories(mstories)
        found_feed_ids = list(set([story['story_feed_id'] for story in stories]))
        trained_feed_ids = [sub.feed_id for sub in usersubs if sub.is_trained]
        found_trained_feed_ids = list(set(trained_feed_ids) & set(found_feed_ids))

    if found_trained_feed_ids:
        classifier_feeds = list(MClassifierFeed.objects(user_id=user.pk, feed_id__in=found_trained_feed_ids))
        classifier_authors = list(MClassifierAuthor.objects(user_id=user.pk, feed_id__in=found_trained_feed_ids))
        classifier_titles = list(MClassifierTitle.objects(user_id=user.pk, feed_id__in=found_trained_feed_ids))
        classifier_tags = list(MClassifierTag.objects(user_id=user.pk, feed_id__in=found_trained_feed_ids))

    feeds = dict([(f.pk, {
        "title": f.feed_title,
        "website": f.feed_link,
        "address": f.feed_address,
    }) for f in Feed.objects.filter(pk__in=found_feed_ids)])

    for story in stories:
        if before and int(story['story_date'].strftime("%s")) > before:
            continue
        if after and int(story['story_date'].strftime("%s")) < after:
            continue
        score = 0
        if found_trained_feed_ids and story['story_feed_id'] in found_trained_feed_ids:
            score = compute_story_score(story, classifier_titles=classifier_titles,
                                        classifier_authors=classifier_authors,
                                        classifier_tags=classifier_tags,
                                        classifier_feeds=classifier_feeds)
            if score < 0:
                continue
            if trigger_slug == "new-unread-focus-story" and score < 1:
                continue
        feed = feeds.get(story['story_feed_id'], None)
        entries.append({
            "StoryTitle": story['story_title'],
            "StoryContent": story['story_content'],
            "StoryURL": story['story_permalink'],
            "StoryAuthor": story['story_authors'],
            "PublishedAt": story['story_date'].strftime("%Y-%m-%dT%H:%M:%SZ"),
            "StoryScore": score,
            "Site": feed and feed['title'],
            "SiteURL": feed and feed['website'],
            "SiteRSS": feed and feed['address'],
            "meta": {
                "id": story['story_hash'],
                "timestamp": int(story['story_date'].strftime("%s"))
            },
        })

    if after:
        entries = sorted(entries, key=lambda s: s['meta']['timestamp'])

    logging.user(request, "~FYChecking unread%s stories with ~SB~FCIFTTT~SN~FY: ~SB%s~SN - ~SB%s~SN stories"
                 % (" ~SBfocus~SN" if trigger_slug == "new-unread-focus-story" else "", feed_or_folder, len(entries)))

    return {"data": entries[:limit]}
        self._state[doc][position] = new_topic

    def change_count(self, doc, word, topic, delta):
        self._docs[doc].inc(topic, delta)
        self._topics[topic].inc(word, delta)

    def sample(self, iterations=100, hyper_delay=10):
        assert self._state
        for ii in xrange(iterations):
            for dd in self._data:
                for ww in xrange(len(self._data[dd])):
                    self.sample_word(dd, ww)
            print("Iteration %i %f" % (ii, self.lhood(self._alpha, self._lambda)))
            if hyper_delay >= 0 and ii % hyper_delay == 0:
                self.optimize_hyperparameters()

    def print_topics(self, num_words=15):
        for ii in self._topics:
            print("%i:%s\n" % (ii, "\t".join(self._topics[ii].keys()[:num_words])))

if __name__ == "__main__":
    stories = MStory.objects(story_feed_id=199)
    d = create_data(stories, doc_limit=250, delimiter="")
    lda = LdaSampler(5)
    lda.initialize(d)
    lda.sample(50)
    lda.print_topics()
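This class is a collapsed Gibbs sampler for LDA over story titles: `sample_word` (defined earlier) redraws one token's topic in proportion to (doc-topic count + alpha) * (topic-word count + lambda) / (topic total + V * lambda). A minimal, standalone sketch of that conditional draw, assuming hypothetical count structures (`doc_topic`, `topic_word`, `topic_total`) rather than the class's internals; alpha and lam should be floats:

import random

def sample_topic(doc_topic, topic_word, topic_total, word, alpha, lam, vocab_size):
    # Unnormalized conditional P(z = k | all other assignments) per topic k.
    weights = [(doc_topic[k] + alpha) * (topic_word[k].get(word, 0) + lam)
               / (topic_total[k] + vocab_size * lam)
               for k in range(len(doc_topic))]
    # Draw a topic index in proportion to the weights.
    r = random.random() * sum(weights)
    for k, w in enumerate(weights):
        r -= w
        if r <= 0:
            return k
    return len(weights) - 1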
def api_share_new_story(request):
    user = request.user
    body = request.body_json
    fields = body.get('actionFields')
    story_url = urlnorm.normalize(fields['story_url'])
    story_content = fields.get('story_content', "")
    story_title = fields.get('story_title', "")
    story_author = fields.get('story_author', "")
    comments = fields.get('comments', None)

    logging.user(request.user, "~FBFinding feed (api_share_new_story): %s" % story_url)
    original_feed = Feed.get_feed_from_url(story_url, create=True, fetch=True)
    story_hash = MStory.guid_hash_unsaved(story_url)
    feed_id = (original_feed and original_feed.pk or 0)
    if not user.profile.is_premium and MSharedStory.feed_quota(user.pk, story_hash, feed_id=feed_id):
        return {"errors": [{
            'message': 'Only premium users can share multiple stories per day from the same site.'
        }]}
    quota = 3
    if MSharedStory.feed_quota(user.pk, story_hash, quota=quota):
        logging.user(request, "~BM~FRNOT ~FYSharing story from ~SB~FCIFTTT~FY, over quota: ~SB%s: %s" % (story_url, comments))
        return {"errors": [{
            'message': 'You can only share %s stories per day.' % quota
        }]}

    if not story_content or not story_title:
        ti = TextImporter(feed=original_feed, story_url=story_url, request=request)
        original_story = ti.fetch(return_document=True)
        if original_story:
            story_url = original_story['url']
            if not story_content:
                story_content = original_story['content']
            if not story_title:
                story_title = original_story['title']

    if story_content:
        story_content = lxml.html.fromstring(story_content)
        story_content.make_links_absolute(story_url)
        story_content = lxml.html.tostring(story_content)

    shared_story = MSharedStory.objects.filter(user_id=user.pk,
                                               story_feed_id=original_feed and original_feed.pk or 0,
                                               story_guid=story_url).limit(1).first()
    if not shared_story:
        title_max = MSharedStory._fields['story_title'].max_length
        story_db = {
            "story_guid": story_url,
            "story_permalink": story_url,
            "story_title": story_title and story_title[:title_max] or "[Untitled]",
            "story_feed_id": original_feed and original_feed.pk or 0,
            "story_content": story_content,
            "story_author_name": story_author,
            "story_date": datetime.datetime.now(),
            "user_id": user.pk,
            "comments": comments,
            "has_comments": bool(comments),
        }
        try:
            shared_story = MSharedStory.objects.create(**story_db)
            socialsubs = MSocialSubscription.objects.filter(subscription_user_id=user.pk)
            for socialsub in socialsubs:
                socialsub.needs_unread_recalc = True
                socialsub.save()
            logging.user(request, "~BM~FYSharing story from ~SB~FCIFTTT~FY: ~SB%s: %s" % (story_url, comments))
        except NotUniqueError:
            logging.user(request, "~BM~FY~SBAlready~SN shared story from ~SB~FCIFTTT~FY: ~SB%s: %s" % (story_url, comments))
    else:
        logging.user(request, "~BM~FY~SBAlready~SN shared story from ~SB~FCIFTTT~FY: ~SB%s: %s" % (story_url, comments))

    try:
        socialsub = MSocialSubscription.objects.get(user_id=user.pk, subscription_user_id=user.pk)
    except MSocialSubscription.DoesNotExist:
        socialsub = None

    if socialsub and shared_story:
        socialsub.mark_story_ids_as_read([shared_story.story_hash],
                                         shared_story.story_feed_id,
                                         request=request)
    elif shared_story:
        RUserStory.mark_read(user.pk, shared_story.story_feed_id, shared_story.story_hash)

    if shared_story:
        shared_story.publish_update_to_subscribers()

    return {"data": [{
        "id": shared_story and shared_story.story_guid,
        "url": shared_story and shared_story.blurblog_permalink()
    }]}
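Before the shared content is saved, relative links are rewritten against the story URL so they survive being displayed on another domain; lxml does the heavy lifting. A small demonstration of the same three calls used above:

import lxml.html

html = '<p><a href="/about">About</a></p>'
doc = lxml.html.fromstring(html)
doc.make_links_absolute('https://example.com/post/1')
print(lxml.html.tostring(doc))  # href becomes https://example.com/about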
def receive_newsletter(self, params):
    user = self._user_from_email(params['recipient'])
    if not user:
        return

    sender_name, sender_username, sender_domain = self._split_sender(params['from'])
    feed_address = self._feed_address(user, "%s@%s" % (sender_username, sender_domain))

    try:
        usf = UserSubscriptionFolders.objects.get(user=user)
    except UserSubscriptionFolders.DoesNotExist:
        logging.user(user, "~FRUser does not have a USF, ignoring newsletter.")
        return
    usf.add_folder('', 'Newsletters')

    # First look for the email address
    try:
        feed = Feed.objects.get(feed_address=feed_address)
    except Feed.MultipleObjectsReturned:
        feeds = Feed.objects.filter(feed_address=feed_address)[:1]
        if feeds.count():
            feed = feeds[0]
    except Feed.DoesNotExist:
        feed = None

    # If not found, check among titles user has subscribed to
    if not feed:
        newsletter_subs = UserSubscription.objects.filter(user=user, feed__feed_address__contains="newsletter:").only('feed')
        newsletter_feed_ids = [us.feed.pk for us in newsletter_subs]
        feeds = Feed.objects.filter(feed_title__iexact=sender_name, pk__in=newsletter_feed_ids)
        if feeds.count():
            feed = feeds[0]

    # Create a new feed if it doesn't exist by sender name or email
    if not feed:
        feed = Feed.objects.create(feed_address=feed_address,
                                   feed_link='http://' + sender_domain,
                                   feed_title=sender_name,
                                   fetched_once=True,
                                   known_good=True)
        feed.update()
        logging.user(user, "~FCCreating newsletter feed: ~SB%s" % (feed))
        r = redis.Redis(connection_pool=settings.REDIS_PUBSUB_POOL)
        r.publish(user.username, 'reload:%s' % feed.pk)
        self._check_if_first_newsletter(user)

    feed.last_update = datetime.datetime.now()
    feed.last_story_date = datetime.datetime.now()
    feed.save()

    if feed.feed_title != sender_name:
        feed.feed_title = sender_name
        feed.save()

    try:
        usersub = UserSubscription.objects.get(user=user, feed=feed)
    except UserSubscription.DoesNotExist:
        _, _, usersub = UserSubscription.add_subscription(user=user, feed_address=feed_address, folder='Newsletters')
        r = redis.Redis(connection_pool=settings.REDIS_PUBSUB_POOL)
        r.publish(user.username, 'reload:feeds')

    story_hash = MStory.ensure_story_hash(params['signature'], feed.pk)
    story_content = self._get_content(params)
    plain_story_content = self._get_content(params, force_plain=True)
    if len(plain_story_content) > len(story_content):
        story_content = plain_story_content
    story_content = self._clean_content(story_content)

    story_params = {
        "story_feed_id": feed.pk,
        "story_date": datetime.datetime.fromtimestamp(int(params['timestamp'])),
        "story_title": params['subject'],
        "story_content": story_content,
        "story_author_name": params['from'],
        "story_permalink": "https://%s%s" % (Site.objects.get_current().domain,
                                             reverse('newsletter-story', kwargs={'story_hash': story_hash})),
        "story_guid": params['signature'],
    }

    try:
        story = MStory.objects.get(story_hash=story_hash)
    except MStory.DoesNotExist:
        story = MStory(**story_params)
        story.save()

    usersub.needs_unread_recalc = True
    usersub.save()

    self._publish_to_subscribers(feed, story.story_hash)

    MFetchHistory.add(feed_id=feed.pk, fetch_type='push')
    logging.user(user, "~FCNewsletter feed story: ~SB%s~SN / ~SB%s" % (story.story_title, feed))

    return story
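The email's signature is hashed into a `story_hash` before the lookup, which makes the save idempotent: a redelivered message maps to the same document instead of creating a duplicate. A minimal sketch of that get-or-create pattern against the document's `MStory` model (the helper name is hypothetical):

def get_or_create_story(story_hash, story_params):
    # Returns (story, created). Redelivery hits the first branch.
    try:
        return MStory.objects.get(story_hash=story_hash), False
    except MStory.DoesNotExist:
        story = MStory(**story_params)
        story.save()
        return story, True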
def load_river_stories(request):
    limit = 18
    offset = int(request.REQUEST.get('offset', 0))
    start = time.time()
    user = get_user(request)
    feed_ids = [int(feed_id) for feed_id in request.REQUEST.getlist('feeds') if feed_id]
    original_feed_ids = list(feed_ids)
    page = int(request.REQUEST.get('page', 1))
    read_stories_count = int(request.REQUEST.get('read_stories_count', 0))
    days_to_keep_unreads = datetime.timedelta(days=settings.DAYS_OF_UNREAD)

    if not feed_ids:
        logging.user(request, "~FCLoading empty river stories: page %s" % (page))
        return dict(stories=[])

    # Fetch all stories at and before the page number.
    # Not a single page, because reading stories can move them up in the unread order.
    # `read_stories_count` is an optimization, works best when all 25 stories before have been read.
    offset = (page - 1) * limit - read_stories_count
    limit = page * limit - read_stories_count

    # Read stories to exclude
    read_stories = MUserStory.objects(user_id=user.pk, feed_id__in=feed_ids).only('story_id')
    read_stories = [rs.story_id for rs in read_stories]

    # Determine mark_as_read dates for all feeds to ignore all stories before this date.
    feed_counts = {}
    feed_last_reads = {}
    for feed_id in feed_ids:
        try:
            usersub = UserSubscription.objects.get(feed__pk=feed_id, user=user)
        except UserSubscription.DoesNotExist:
            continue
        if not usersub:
            continue
        feed_counts[feed_id] = (usersub.unread_count_negative * 1 +
                                usersub.unread_count_neutral * 10 +
                                usersub.unread_count_positive * 20)
        feed_last_reads[feed_id] = int(time.mktime(usersub.mark_read_date.timetuple()))

    feed_counts = sorted(feed_counts.items(), key=itemgetter(1))[:40]
    feed_ids = [f[0] for f in feed_counts]
    feed_last_reads = dict([(str(feed_id), feed_last_reads[feed_id]) for feed_id in feed_ids
                            if feed_id in feed_last_reads])
    feed_counts = dict(feed_counts)

    # After excluding read stories, all that's left are stories
    # past the mark_read_date. Everything returned is guaranteed to be unread.
    mstories = MStory.objects(
        story_guid__nin=read_stories,
        story_feed_id__in=feed_ids,
        # story_date__gte=start - days_to_keep_unreads
    ).map_reduce("""function() {
            var d = feed_last_reads[this[~story_feed_id]];
            if (this[~story_date].getTime()/1000 > d) {
                emit(this[~id], this);
            }
        }""",
        """function(key, values) {
            return values[0];
        }""",
        output='inline',
        scope={'feed_last_reads': feed_last_reads})
    try:
        mstories = [story.value for story in mstories if story and story.value]
    except OperationFailure, e:
        raise e
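The inline JavaScript relies on mongoengine's map/reduce field substitution: `this[~story_feed_id]` is rewritten to the document's actual Mongo field name before the job is sent to the server, and `feed_last_reads` is injected via `scope`. For clarity, the same filter expressed in plain Python (a sketch, not a drop-in replacement; `feed_last_reads` maps feed id strings to read-cutoff epochs):

import time

def unread_after_cutoff(stories, feed_last_reads):
    # Keep a story only if it was published after its feed's mark_read_date.
    for story in stories:
        cutoff = feed_last_reads.get(str(story.story_feed_id), 0)
        if time.mktime(story.story_date.timetuple()) > cutoff:
            yield story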
class ProcessFeed:
    def __init__(self, feed_id, fpf, options, raw_feed=None):
        self.feed_id = feed_id
        self.options = options
        self.fpf = fpf
        self.raw_feed = raw_feed

    def refresh_feed(self):
        self.feed = Feed.get_by_id(self.feed_id)
        if self.feed_id != self.feed.pk:
            logging.debug(" ***> Feed has changed: from %s to %s" % (self.feed_id, self.feed.pk))
            self.feed_id = self.feed.pk

    def process(self):
        """ Downloads and parses a feed.
        """
        start = time.time()
        self.refresh_feed()

        ret_values = dict(new=0, updated=0, same=0, error=0)

        if hasattr(self.fpf, 'status'):
            if self.options['verbose']:
                if self.fpf.bozo and self.fpf.status != 304:
                    logging.debug(u'   ---> [%-30s] ~FRBOZO exception: %s ~SB(%s entries)' % (self.feed.log_title[:30], self.fpf.bozo_exception, len(self.fpf.entries)))
            if self.fpf.status == 304:
                self.feed = self.feed.save()
                self.feed.save_feed_history(304, "Not modified")
                return FEED_SAME, ret_values

            # 302 and 307: Temporary redirect: ignore
            # 301 and 308: Permanent redirect: save it (after 10 tries)
            if self.fpf.status == 301 or self.fpf.status == 308:
                if self.fpf.href.endswith('feedburner.com/atom.xml'):
                    return FEED_ERRHTTP, ret_values
                redirects, non_redirects = self.feed.count_redirects_in_history('feed')
                self.feed.save_feed_history(self.fpf.status, "HTTP Redirect (%d to go)" % (10 - len(redirects)))
                if len(redirects) >= 10 or len(non_redirects) == 0:
                    address = self.fpf.href
                    if self.options['force'] and address:
                        address = qurl(address, remove=['_'])
                    self.feed.feed_address = address
                if not self.feed.known_good:
                    self.feed.fetched_once = True
                    logging.debug("   ---> [%-30s] ~SB~SK~FRFeed is %s'ing. Refetching..." % (self.feed.log_title[:30], self.fpf.status))
                    self.feed = self.feed.schedule_feed_fetch_immediately()
                if not self.fpf.entries:
                    self.feed = self.feed.save()
                    self.feed.save_feed_history(self.fpf.status, "HTTP Redirect")
                    return FEED_ERRHTTP, ret_values
            if self.fpf.status >= 400:
                logging.debug("   ---> [%-30s] ~SB~FRHTTP Status code: %s. Checking address..." % (self.feed.log_title[:30], self.fpf.status))
                fixed_feed = None
                if not self.feed.known_good:
                    fixed_feed, feed = self.feed.check_feed_link_for_feed_address()
                if not fixed_feed:
                    self.feed.save_feed_history(self.fpf.status, "HTTP Error")
                else:
                    self.feed = feed
                self.feed = self.feed.save()
                return FEED_ERRHTTP, ret_values

        if not self.fpf:
            logging.debug("   ---> [%-30s] ~SB~FRFeed is Non-XML. No feedparser feed either!" % (self.feed.log_title[:30]))
            self.feed.save_feed_history(551, "Broken feed")
            return FEED_ERRHTTP, ret_values

        if self.fpf and not self.fpf.entries:
            if self.fpf.bozo and isinstance(self.fpf.bozo_exception, feedparser.NonXMLContentType):
                logging.debug("   ---> [%-30s] ~SB~FRFeed is Non-XML. %s entries. Checking address..." % (self.feed.log_title[:30], len(self.fpf.entries)))
                fixed_feed = None
                if not self.feed.known_good:
                    fixed_feed, feed = self.feed.check_feed_link_for_feed_address()
                if not fixed_feed:
                    self.feed.save_feed_history(552, 'Non-xml feed', self.fpf.bozo_exception)
                else:
                    self.feed = feed
                self.feed = self.feed.save()
                return FEED_ERRPARSE, ret_values
            elif self.fpf.bozo and isinstance(self.fpf.bozo_exception, xml.sax._exceptions.SAXException):
                logging.debug("   ---> [%-30s] ~SB~FRFeed has SAX/XML parsing issues. %s entries. Checking address..." % (self.feed.log_title[:30], len(self.fpf.entries)))
                fixed_feed = None
                if not self.feed.known_good:
                    fixed_feed, feed = self.feed.check_feed_link_for_feed_address()
                if not fixed_feed:
                    self.feed.save_feed_history(553, 'Not an RSS feed', self.fpf.bozo_exception)
                else:
                    self.feed = feed
                self.feed = self.feed.save()
                return FEED_ERRPARSE, ret_values

        # the feed has changed (or it is the first time we parse it)
        # saving the etag and last_modified fields
        original_etag = self.feed.etag
        self.feed.etag = self.fpf.get('etag')
        if self.feed.etag:
            self.feed.etag = self.feed.etag[:255]
        # some times this is None (it never should) *sigh*
        if self.feed.etag is None:
            self.feed.etag = ''
        if self.feed.etag != original_etag:
            self.feed.save(update_fields=['etag'])

        original_last_modified = self.feed.last_modified
        if hasattr(self.fpf, 'modified') and self.fpf.modified:
            try:
                self.feed.last_modified = datetime.datetime.strptime(self.fpf.modified, '%a, %d %b %Y %H:%M:%S %Z')
            except Exception, e:
                self.feed.last_modified = None
                logging.debug("Broken mtime %s: %s" % (self.feed.last_modified, e))
        if self.feed.last_modified != original_last_modified:
            self.feed.save(update_fields=['last_modified'])

        self.fpf.entries = self.fpf.entries[:100]

        original_title = self.feed.feed_title
        if self.fpf.feed.get('title'):
            self.feed.feed_title = strip_tags(self.fpf.feed.get('title'))
        if self.feed.feed_title != original_title:
            self.feed.save(update_fields=['feed_title'])

        tagline = self.fpf.feed.get('tagline', self.feed.data.feed_tagline)
        if tagline:
            original_tagline = self.feed.data.feed_tagline
            self.feed.data.feed_tagline = smart_unicode(tagline)
            if self.feed.data.feed_tagline != original_tagline:
                self.feed.data.save(update_fields=['feed_tagline'])

        if not self.feed.feed_link_locked:
            new_feed_link = self.fpf.feed.get('link') or self.fpf.feed.get('id') or self.feed.feed_link
            if self.options['force'] and new_feed_link:
                new_feed_link = qurl(new_feed_link, remove=['_'])
            if new_feed_link != self.feed.feed_link:
                logging.debug("   ---> [%-30s] ~SB~FRFeed's page is different: %s to %s" % (self.feed.log_title[:30], self.feed.feed_link, new_feed_link))
                redirects, non_redirects = self.feed.count_redirects_in_history('page')
                self.feed.save_page_history(301, "HTTP Redirect (%s to go)" % (10 - len(redirects)))
                if len(redirects) >= 10 or len(non_redirects) == 0:
                    self.feed.feed_link = new_feed_link
                    self.feed.save(update_fields=['feed_link'])

        # Determine if stories aren't valid and replace broken guids
        guids_seen = set()
        permalinks_seen = set()
        for entry in self.fpf.entries:
            guids_seen.add(entry.get('guid'))
            permalinks_seen.add(Feed.get_permalink(entry))
        guid_difference = len(guids_seen) != len(self.fpf.entries)
        single_guid = len(guids_seen) == 1
        replace_guids = single_guid and guid_difference
        permalink_difference = len(permalinks_seen) != len(self.fpf.entries)
        single_permalink = len(permalinks_seen) == 1
        replace_permalinks = single_permalink and permalink_difference

        # Compare new stories to existing stories, adding and updating
        start_date = datetime.datetime.utcnow()
        story_hashes = []
        stories = []
        for entry in self.fpf.entries:
            story = pre_process_story(entry, self.fpf.encoding)
            if story.get('published') < start_date:
                start_date = story.get('published')
            if replace_guids:
                if replace_permalinks:
                    new_story_guid = unicode(story.get('published'))
                    if self.options['verbose']:
                        logging.debug(u'   ---> [%-30s] ~FBReplacing guid (%s) with timestamp: %s' % (self.feed.log_title[:30], story.get('guid'), new_story_guid))
                    story['guid'] = new_story_guid
                else:
                    new_story_guid = Feed.get_permalink(story)
                    if self.options['verbose']:
                        logging.debug(u'   ---> [%-30s] ~FBReplacing guid (%s) with permalink: %s' % (self.feed.log_title[:30], story.get('guid'), new_story_guid))
                    story['guid'] = new_story_guid
            story['story_hash'] = MStory.feed_guid_hash_unsaved(self.feed.pk, story.get('guid'))
            stories.append(story)
            story_hashes.append(story.get('story_hash'))

        original_story_hash_count = len(story_hashes)
        story_hashes_in_unread_cutoff = self.feed.story_hashes_in_unread_cutoff[:original_story_hash_count]
        story_hashes.extend(story_hashes_in_unread_cutoff)
        story_hashes = list(set(story_hashes))
        if self.options['verbose'] or settings.DEBUG:
            logging.debug(u'   ---> [%-30s] ~FBFound ~SB%s~SN guids, adding ~SB%s~SN/%s guids from db' % (self.feed.log_title[:30], original_story_hash_count, len(story_hashes) - original_story_hash_count, len(story_hashes_in_unread_cutoff)))

        existing_stories = dict((s.story_hash, s) for s in MStory.objects(
            story_hash__in=story_hashes,
            # story_date__gte=start_date,
            # story_feed_id=self.feed.pk
        ))
        # if len(existing_stories) == 0:
        #     existing_stories = dict((s.story_hash, s) for s in MStory.objects(
        #         story_date__gte=start_date,
        #         story_feed_id=self.feed.pk
        #     ))

        ret_values = self.feed.add_update_stories(stories, existing_stories,
                                                  verbose=self.options['verbose'],
                                                  updates_off=self.options['updates_off'])

        # PubSubHubbub
        if (hasattr(self.fpf, 'feed') and hasattr(self.fpf.feed, 'links') and self.fpf.feed.links):
            hub_url = None
            self_url = self.feed.feed_address
            for link in self.fpf.feed.links:
                if link['rel'] == 'hub' and not hub_url:
                    hub_url = link['href']
                elif link['rel'] == 'self':
                    self_url = link['href']
            push_expired = False
            if self.feed.is_push:
                try:
                    push_expired = self.feed.push.lease_expires < datetime.datetime.now()
                except PushSubscription.DoesNotExist:
                    self.feed.is_push = False
            if (hub_url and self_url and not settings.DEBUG and
                self.feed.active_subscribers > 0 and
                (push_expired or not self.feed.is_push or self.options.get('force'))):
                logging.debug(u'   ---> [%-30s] ~BB~FW%sSubscribing to PuSH hub: %s' % (self.feed.log_title[:30], "~SKRe-~SN" if push_expired else "", hub_url))
                try:
                    PushSubscription.objects.subscribe(self_url, feed=self.feed, hub=hub_url)
                except TimeoutError:
                    logging.debug(u'   ---> [%-30s] ~BB~FW~FRTimed out~FW subscribing to PuSH hub: %s' % (self.feed.log_title[:30], hub_url))
            elif (self.feed.is_push and (self.feed.active_subscribers <= 0 or not hub_url)):
                logging.debug(u'   ---> [%-30s] ~BB~FWTurning off PuSH, no hub found' % (self.feed.log_title[:30]))
                self.feed.is_push = False
                self.feed = self.feed.save()

        # Push notifications
        if ret_values['new'] > 0 and MUserFeedNotification.feed_has_users(self.feed.pk) > 0:
            QueueNotifications.delay(self.feed.pk, ret_values['new'])

        # All Done
        logging.debug(u'   ---> [%-30s] ~FYParsed Feed: %snew=%s~SN~FY %sup=%s~SN same=%s%s~SN %serr=%s~SN~FY total=~SB%s' % (
            self.feed.log_title[:30],
            '~FG~SB' if ret_values['new'] else '', ret_values['new'],
            '~FY~SB' if ret_values['updated'] else '', ret_values['updated'],
            '~SB' if ret_values['same'] else '', ret_values['same'],
            '~FR~SB' if ret_values['error'] else '', ret_values['error'],
            len(self.fpf.entries)))
        self.feed.update_all_statistics(has_new_stories=bool(ret_values['new']), force=self.options['force'])
        fetch_date = datetime.datetime.now()
        if ret_values['new']:
            if not getattr(settings, 'TEST_DEBUG', False):
                self.feed.trim_feed()
                self.feed.expire_redis()
            if MStatistics.get('raw_feed', None) == self.feed.pk:
                self.feed.save_raw_feed(self.raw_feed, fetch_date)
        self.feed.save_feed_history(200, "OK", date=fetch_date)

        if self.options['verbose']:
            logging.debug(u'   ---> [%-30s] ~FBTIME: feed parse in ~FM%.4ss' % (self.feed.log_title[:30], time.time() - start))

        return FEED_OK, ret_values
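The guid-replacement heuristic in this version fires only when every entry in a feed shares a single guid (so guids can't distinguish stories); permalinks then stand in as guids, and if even the permalinks collide, the published timestamp does. A compact sketch of that decision, with a hypothetical helper name:

def pick_guid_strategy(guids, permalinks, n_entries):
    # Decide which field should serve as the story guid for a misbehaving feed.
    if len(set(guids)) == 1 and n_entries > 1:
        if len(set(permalinks)) == 1 and n_entries > 1:
            return 'published-timestamp'   # even the permalinks collide
        return 'permalink'
    return 'guid'

assert pick_guid_strategy(['x', 'x', 'x'], ['/a', '/b', '/c'], 3) == 'permalink'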
def process(self):
    """ Downloads and parses a feed.
    """
    start = time.time()
    self.refresh_feed()

    ret_values = dict(new=0, updated=0, same=0, error=0)

    # logging.debug(u' ---> [%d] Processing %s' % (self.feed.id, self.feed.feed_title))

    if hasattr(self.fpf, 'status'):
        if self.options['verbose']:
            if self.fpf.bozo and self.fpf.status != 304:
                logging.debug(u'   ---> [%-30s] ~FRBOZO exception: %s ~SB(%s entries)' % (self.feed.title[:30], self.fpf.bozo_exception, len(self.fpf.entries)))
        if self.fpf.status == 304:
            self.feed = self.feed.save()
            self.feed.save_feed_history(304, "Not modified")
            return FEED_SAME, ret_values

        # 302: Temporary redirect: ignore
        # 301: Permanent redirect: save it
        if self.fpf.status == 301:
            if not self.fpf.href.endswith('feedburner.com/atom.xml'):
                self.feed.feed_address = self.fpf.href
            if not self.feed.known_good:
                self.feed.fetched_once = True
                logging.debug("   ---> [%-30s] ~SB~SK~FRFeed is %s'ing. Refetching..." % (self.feed.title[:30], self.fpf.status))
                self.feed = self.feed.schedule_feed_fetch_immediately()
            if not self.fpf.entries:
                self.feed = self.feed.save()
                self.feed.save_feed_history(self.fpf.status, "HTTP Redirect")
                return FEED_ERRHTTP, ret_values
        if self.fpf.status >= 400:
            logging.debug("   ---> [%-30s] ~SB~FRHTTP Status code: %s. Checking address..." % (self.feed.title[:30], self.fpf.status))
            fixed_feed = None
            if not self.feed.known_good:
                fixed_feed, feed = self.feed.check_feed_link_for_feed_address()
            if not fixed_feed:
                self.feed.save_feed_history(self.fpf.status, "HTTP Error")
            else:
                self.feed = feed
            self.feed = self.feed.save()
            return FEED_ERRHTTP, ret_values

    if not self.fpf.entries:
        if self.fpf.bozo and isinstance(self.fpf.bozo_exception, feedparser.NonXMLContentType):
            logging.debug("   ---> [%-30s] ~SB~FRFeed is Non-XML. %s entries. Checking address..." % (self.feed.title[:30], len(self.fpf.entries)))
            fixed_feed = None
            if not self.feed.known_good:
                fixed_feed, feed = self.feed.check_feed_link_for_feed_address()
            if not fixed_feed:
                self.feed.save_feed_history(552, 'Non-xml feed', self.fpf.bozo_exception)
            else:
                self.feed = feed
            self.feed = self.feed.save()
            return FEED_ERRPARSE, ret_values
        elif self.fpf.bozo and isinstance(self.fpf.bozo_exception, xml.sax._exceptions.SAXException):
            logging.debug("   ---> [%-30s] ~SB~FRFeed has SAX/XML parsing issues. %s entries. Checking address..." % (self.feed.title[:30], len(self.fpf.entries)))
            fixed_feed = None
            if not self.feed.known_good:
                fixed_feed, feed = self.feed.check_feed_link_for_feed_address()
            if not fixed_feed:
                self.feed.save_feed_history(553, 'SAX Exception', self.fpf.bozo_exception)
            else:
                self.feed = feed
            self.feed = self.feed.save()
            return FEED_ERRPARSE, ret_values

    # the feed has changed (or it is the first time we parse it)
    # saving the etag and last_modified fields
    self.feed.etag = self.fpf.get('etag')
    if self.feed.etag:
        self.feed.etag = self.feed.etag[:255]
    # some times this is None (it never should) *sigh*
    if self.feed.etag is None:
        self.feed.etag = ''

    try:
        self.feed.last_modified = mtime(self.fpf.modified)
    except:
        self.feed.last_modified = None

    self.fpf.entries = self.fpf.entries[:100]

    if self.fpf.feed.get('title'):
        self.feed.feed_title = strip_tags(self.fpf.feed.get('title'))
    tagline = self.fpf.feed.get('tagline', self.feed.data.feed_tagline)
    if tagline:
        self.feed.data.feed_tagline = utf8encode(tagline)
        self.feed.data.save()
    if not self.feed.feed_link_locked:
        self.feed.feed_link = self.fpf.feed.get('link') or self.fpf.feed.get('id') or self.feed.feed_link

    self.feed = self.feed.save()

    # Compare new stories to existing stories, adding and updating
    start_date = datetime.datetime.utcnow()
    story_guids = []
    stories = []
    for entry in self.fpf.entries:
        story = pre_process_story(entry)
        if story.get('published') < start_date:
            start_date = story.get('published')
        stories.append(story)
        story_guids.append(story.get('guid'))

    existing_stories = dict((s.story_guid, s) for s in MStory.objects(
        # story_guid__in=story_guids,
        story_date__gte=start_date,
        story_feed_id=self.feed.pk
    ).limit(max(int(len(story_guids) * 1.5), 10)))

    ret_values = self.feed.add_update_stories(stories, existing_stories, verbose=self.options['verbose'])

    if (hasattr(self.fpf, 'feed') and hasattr(self.fpf.feed, 'links') and self.fpf.feed.links):
        hub_url = None
        self_url = self.feed.feed_address
        for link in self.fpf.feed.links:
            if link['rel'] == 'hub' and not hub_url:
                hub_url = link['href']
            elif link['rel'] == 'self':
                self_url = link['href']
        push_expired = False
        if self.feed.is_push:
            try:
                push_expired = self.feed.push.lease_expires < datetime.datetime.now()
            except PushSubscription.DoesNotExist:
                self.feed.is_push = False
        if (hub_url and self_url and not settings.DEBUG and
            self.feed.active_subscribers > 0 and
            (push_expired or not self.feed.is_push or self.options.get('force'))):
            logging.debug(u'   ---> [%-30s] ~BB~FW%sSubscribing to PuSH hub: %s' % (self.feed.title[:30], "~SKRe-~SN" if push_expired else "", hub_url))
            try:
                PushSubscription.objects.subscribe(self_url, feed=self.feed, hub=hub_url)
            except TimeoutError:
                logging.debug(u'   ---> [%-30s] ~BB~FW~FRTimed out~FW subscribing to PuSH hub: %s' % (self.feed.title[:30], hub_url))
        elif (self.feed.is_push and (self.feed.active_subscribers <= 0 or not hub_url)):
            logging.debug(u'   ---> [%-30s] ~BB~FWTurning off PuSH, no hub found' % (self.feed.title[:30]))
            self.feed.is_push = False
            self.feed = self.feed.save()

    logging.debug(u'   ---> [%-30s] ~FYParsed Feed: %snew=%s~SN~FY %sup=%s~SN same=%s%s~SN %serr=%s~SN~FY total=~SB%s' % (
        self.feed.title[:30],
        '~FG~SB' if ret_values['new'] else '', ret_values['new'],
        '~FY~SB' if ret_values['updated'] else '', ret_values['updated'],
        '~SB' if ret_values['same'] else '', ret_values['same'],
        '~FR~SB' if ret_values['error'] else '', ret_values['error'],
        len(self.fpf.entries)))
    self.feed.update_all_statistics(full=bool(ret_values['new']), force=self.options['force'])
    if ret_values['new']:
        self.feed.trim_feed()
        self.feed.expire_redis()
    self.feed.save_feed_history(200, "OK")

    if self.options['verbose']:
        logging.debug(u'   ---> [%-30s] ~FBTIME: feed parse in ~FM%.4ss' % (self.feed.title[:30], time.time() - start))

    return FEED_OK, ret_values
def process(self):
    """ Downloads and parses a feed.
    """
    start = time.time()
    self.refresh_feed()

    ret_values = {
        ENTRY_NEW: 0,
        ENTRY_UPDATED: 0,
        ENTRY_SAME: 0,
        ENTRY_ERR: 0
    }

    # logging.debug(u' ---> [%d] Processing %s' % (self.feed.id, self.feed.feed_title))

    if hasattr(self.fpf, 'status'):
        if self.options['verbose']:
            if self.fpf.bozo and self.fpf.status != 304:
                logging.debug(u'   ---> [%-30s] ~FRBOZO exception: %s ~SB(%s entries)' % (self.feed.title[:30], self.fpf.bozo_exception, len(self.fpf.entries)))
        if self.fpf.status == 304:
            self.feed = self.feed.save()
            self.feed.save_feed_history(304, "Not modified")
            return FEED_SAME, ret_values

        if self.fpf.status in (302, 301):
            if not self.fpf.href.endswith('feedburner.com/atom.xml'):
                self.feed.feed_address = self.fpf.href
            if not self.feed.known_good:
                self.feed.fetched_once = True
                logging.debug("   ---> [%-30s] ~SB~SK~FRFeed is %s'ing. Refetching..." % (self.feed.title[:30], self.fpf.status))
                self.feed = self.feed.schedule_feed_fetch_immediately()
            if not self.fpf.entries:
                self.feed = self.feed.save()
                self.feed.save_feed_history(self.fpf.status, "HTTP Redirect")
                return FEED_ERRHTTP, ret_values
        if self.fpf.status >= 400:
            logging.debug("   ---> [%-30s] ~SB~FRHTTP Status code: %s. Checking address..." % (self.feed.title[:30], self.fpf.status))
            fixed_feed = None
            if not self.feed.known_good:
                fixed_feed = self.feed.check_feed_link_for_feed_address()
            if not fixed_feed:
                self.feed.save_feed_history(self.fpf.status, "HTTP Error")
            self.feed = self.feed.save()
            return FEED_ERRHTTP, ret_values

    if not self.fpf.entries:
        if self.fpf.bozo and isinstance(self.fpf.bozo_exception, feedparser.NonXMLContentType):
            logging.debug("   ---> [%-30s] ~SB~FRFeed is Non-XML. %s entries. Checking address..." % (self.feed.title[:30], len(self.fpf.entries)))
            fixed_feed = None
            if not self.feed.known_good:
                fixed_feed = self.feed.check_feed_link_for_feed_address()
            if not fixed_feed:
                self.feed.save_feed_history(552, 'Non-xml feed', self.fpf.bozo_exception)
            self.feed = self.feed.save()
            return FEED_ERRPARSE, ret_values
        elif self.fpf.bozo and isinstance(self.fpf.bozo_exception, xml.sax._exceptions.SAXException):
            logging.debug("   ---> [%-30s] ~SB~FRFeed has SAX/XML parsing issues. %s entries. Checking address..." % (self.feed.title[:30], len(self.fpf.entries)))
            fixed_feed = None
            if not self.feed.known_good:
                fixed_feed = self.feed.check_feed_link_for_feed_address()
            if not fixed_feed:
                self.feed.save_feed_history(553, 'SAX Exception', self.fpf.bozo_exception)
            self.feed = self.feed.save()
            return FEED_ERRPARSE, ret_values

    # the feed has changed (or it is the first time we parse it)
    # saving the etag and last_modified fields
    self.feed.etag = self.fpf.get('etag')
    if self.feed.etag:
        self.feed.etag = self.feed.etag[:255]
    # some times this is None (it never should) *sigh*
    if self.feed.etag is None:
        self.feed.etag = ''

    try:
        self.feed.last_modified = mtime(self.fpf.modified)
    except:
        pass

    self.fpf.entries = self.fpf.entries[:50]

    if self.fpf.feed.get('title'):
        self.feed.feed_title = self.fpf.feed.get('title')
    tagline = self.fpf.feed.get('tagline', self.feed.data.feed_tagline)
    if tagline:
        self.feed.data.feed_tagline = utf8encode(tagline)
        self.feed.data.save()
    if not self.feed.feed_link_locked:
        self.feed.feed_link = self.fpf.feed.get('link') or self.fpf.feed.get('id') or self.feed.feed_link

    guids = []
    for entry in self.fpf.entries:
        if entry.get('id', ''):
            guids.append(entry.get('id', ''))
        elif entry.get('link'):
            guids.append(entry.link)
        elif entry.get('title'):
            guids.append(entry.title)

    self.feed = self.feed.save()

    # Compare new stories to existing stories, adding and updating
    start_date = datetime.datetime.utcnow()
    # end_date = datetime.datetime.utcnow()
    story_guids = []
    stories = []
    for entry in self.fpf.entries:
        story = pre_process_story(entry)
        if story.get('published') < start_date:
            start_date = story.get('published')
        # if story.get('published') > end_date:
        #     end_date = story.get('published')
        stories.append(story)
        story_guids.append(story.get('guid') or story.get('link'))

    existing_stories = list(MStory.objects(
        # story_guid__in=story_guids,
        story_date__gte=start_date,
        story_feed_id=self.feed_id
    ).limit(min(int(len(story_guids) * 1.5), 10)))
    # MStory.objects(
    #     (Q(story_date__gte=start_date) & Q(story_date__lte=end_date))
    #     | (Q(story_guid__in=story_guids)),
    #     story_feed=self.feed
    # ).order_by('-story_date')

    ret_values = self.feed.add_update_stories(stories, existing_stories, verbose=self.options['verbose'])

    if ((not self.feed.is_push or self.options.get('force')) and
        hasattr(self.fpf, 'feed') and hasattr(self.fpf.feed, 'links') and self.fpf.feed.links):
        hub_url = None
        self_url = self.feed.feed_address
        for link in self.fpf.feed.links:
            if link['rel'] == 'hub':
                hub_url = link['href']
            elif link['rel'] == 'self':
                self_url = link['href']
        if hub_url and self_url and not settings.DEBUG:
            logging.debug(u'   ---> [%-30s] ~BB~FWSubscribing to PuSH hub: %s' % (self.feed.title[:30], hub_url))
            PushSubscription.objects.subscribe(self_url, feed=self.feed, hub=hub_url)

    logging.debug(u'   ---> [%-30s] ~FYParsed Feed: %snew=%s~SN~FY %sup=%s~SN same=%s%s~SN %serr=%s~SN~FY total=~SB%s' % (
        self.feed.title[:30],
        '~FG~SB' if ret_values[ENTRY_NEW] else '', ret_values[ENTRY_NEW],
        '~FY~SB' if ret_values[ENTRY_UPDATED] else '', ret_values[ENTRY_UPDATED],
        '~SB' if ret_values[ENTRY_SAME] else '', ret_values[ENTRY_SAME],
        '~FR~SB' if ret_values[ENTRY_ERR] else '', ret_values[ENTRY_ERR],
        len(self.fpf.entries)))
    self.feed.update_all_statistics(full=bool(ret_values[ENTRY_NEW]), force=self.options['force'])
    self.feed.trim_feed()
    self.feed.save_feed_history(200, "OK")

    if self.options['verbose']:
        logging.debug(u'   ---> [%-30s] ~FBTIME: feed parse in ~FM%.4ss' % (self.feed.title[:30], time.time() - start))

    return FEED_OK, ret_values
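All three revisions of `process` above branch on feedparser's HTTP status and bozo flag before touching entries. A minimal standalone sketch of that dispatch, using only documented feedparser result attributes (`status`, `href`, `bozo`, `bozo_exception`, `entries`):

import feedparser

def classify_fetch(fpf):
    # fpf is the result of feedparser.parse() on a fetched URL.
    status = getattr(fpf, 'status', None)
    if status == 304:
        return 'same'          # Not modified; nothing to parse
    if status in (301, 308):
        return 'redirected'    # Permanent move: consider saving fpf.href
    if status is not None and status >= 400:
        return 'http-error'
    if fpf.bozo and not fpf.entries:
        return 'parse-error'   # fpf.bozo_exception has the details
    return 'ok'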
def test_train(self):
    user = User.objects.all()
    feed = Feed.objects.all()

    management.call_command('loaddata', 'brownstoner.json', verbosity=0)
    management.call_command('refresh_feed', force=1, feed=1, single_threaded=True, daemonize=False)
    management.call_command('loaddata', 'brownstoner2.json', verbosity=0)
    management.call_command('refresh_feed', force=1, feed=1, single_threaded=True, daemonize=False)

    stories = MStory.objects(story_feed_id=1)[:53]

    phrasefilter = PhraseFilter()
    for story in stories:
        # print story.story_title, story.id
        phrasefilter.run(story.story_title, story.id)

    phrasefilter.pare_phrases()
    phrases = phrasefilter.get_phrases()
    print phrases

    tokenizer = Tokenizer(phrases)
    classifier = Bayes(tokenizer)  # FisherClassifier(user[0], feed[0], phrases)

    classifier.train('good', 'House of the Day: 393 Pacific St.')
    classifier.train('good', 'House of the Day: 393 Pacific St.')
    classifier.train('good', 'Condo of the Day: 393 Pacific St.')
    classifier.train('good', 'Co-op of the Day: 393 Pacific St. #3')
    classifier.train('good', 'Co-op of the Day: 393 Pacific St. #3')
    classifier.train('good', 'Development Watch: 393 Pacific St. #3')
    classifier.train('bad', 'Development Watch: 393 Pacific St. #3')
    classifier.train('bad', 'Development Watch: 393 Pacific St. #3')
    classifier.train('bad', 'Development Watch: 393 Pacific St. #3')
    classifier.train('bad', 'Streetlevel: 393 Pacific St. #3')

    guess = dict(classifier.guess('Co-op of the Day: 413 Atlantic'))
    self.assertTrue(guess['good'] > .99)
    self.assertTrue('bad' not in guess)

    guess = dict(classifier.guess('House of the Day: 413 Atlantic'))
    self.assertTrue(guess['good'] > .99)
    self.assertTrue('bad' not in guess)

    guess = dict(classifier.guess('Development Watch: Yatta'))
    self.assertTrue(guess['bad'] > .7)
    self.assertTrue(guess['good'] < .3)

    guess = dict(classifier.guess('Development Watch: 393 Pacific St.'))
    self.assertTrue(guess['bad'] > .7)
    self.assertTrue(guess['good'] < .3)

    guess = dict(classifier.guess('Streetlevel: 123 Carlton St.'))
    self.assertTrue(guess['bad'] > .99)
    self.assertTrue('good' not in guess)

    guess = classifier.guess('Extra, Extra')
    self.assertTrue('bad' not in guess)
    self.assertTrue('good' not in guess)

    guess = classifier.guess('Nothing doing: 393 Pacific St.')
    self.assertTrue('bad' not in guess)
    self.assertTrue('good' not in guess)
def test_load_feeds__slashdot(self):
    self.client.login(username='******', password='******')

    old_story_guid = "tag:google.com,2005:reader/item/4528442633bc7b2b"

    management.call_command('loaddata', 'slashdot1.json', verbosity=0, commit=False)

    feed = Feed.objects.get(feed_link__contains='slashdot')
    stories = MStory.objects(story_feed_id=feed.pk)
    self.assertEquals(stories.count(), 0)

    management.call_command('refresh_feed', force=1, feed=5, single_threaded=True, daemonize=False)

    stories = MStory.objects(story_feed_id=feed.pk)
    self.assertEquals(stories.count(), 38)

    response = self.client.get(reverse('load-feeds'))
    content = json.decode(response.content)
    self.assertEquals(content['feeds']['5']['nt'], 38)

    self.client.post(reverse('mark-story-as-read'), {'story_id': old_story_guid, 'feed_id': 5})

    response = self.client.get(reverse('refresh-feeds'))
    content = json.decode(response.content)
    self.assertEquals(content['feeds']['5']['nt'], 37)

    management.call_command('loaddata', 'slashdot2.json', verbosity=0, commit=False)
    management.call_command('refresh_feed', force=1, feed=5, single_threaded=True, daemonize=False)

    stories = MStory.objects(story_feed_id=feed.pk)
    self.assertEquals(stories.count(), 38)

    url = reverse('load-single-feed', kwargs=dict(feed_id=5))
    response = self.client.get(url)
    # pprint([c['story_title'] for c in json.decode(response.content)])
    feed = json.decode(response.content)

    # Test: 1 changed char in title
    self.assertEquals(len(feed['stories']), 6)

    response = self.client.get(reverse('refresh-feeds'))
    content = json.decode(response.content)
    self.assertEquals(content['feeds']['5']['nt'], 37)
def receive_newsletter(self, params):
    user = self.user_from_email(params['recipient'])
    if not user:
        return

    sender_name, sender_username, sender_domain = self.split_sender(params['from'])
    feed_address = self.feed_address(user, "%s@%s" % (sender_username, sender_domain))

    usf = UserSubscriptionFolders.objects.get(user=user)
    usf.add_folder('', 'Newsletters')

    try:
        feed = Feed.objects.get(feed_address=feed_address)
    except Feed.DoesNotExist:
        feed = Feed.objects.create(feed_address=feed_address,
                                   feed_link='http://' + sender_domain,
                                   feed_title=sender_name,
                                   fetched_once=True,
                                   known_good=True)
        feed.update()
        logging.user(user, "~FCCreating newsletter feed: ~SB%s" % (feed))
        r = redis.Redis(connection_pool=settings.REDIS_PUBSUB_POOL)
        r.publish(user.username, 'reload:%s' % feed.pk)

    if feed.feed_title != sender_name:
        feed.feed_title = sender_name
        feed.save()

    try:
        usersub = UserSubscription.objects.get(user=user, feed=feed)
    except UserSubscription.DoesNotExist:
        _, _, usersub = UserSubscription.add_subscription(user=user, feed_address=feed_address, folder='Newsletters')

    story_hash = MStory.ensure_story_hash(params['signature'], feed.pk)
    story_content = self.get_content(params)
    story_content = self.clean_content(story_content)

    story_params = {
        "story_feed_id": feed.pk,
        "story_date": datetime.datetime.fromtimestamp(int(params['timestamp'])),
        "story_title": params['subject'],
        "story_content": story_content,
        "story_author_name": params['from'],
        "story_permalink": "https://%s%s" % (Site.objects.get_current().domain,
                                             reverse('newsletter-story', kwargs={'story_hash': story_hash})),
        "story_guid": params['signature'],
    }

    try:
        story = MStory.objects.get(story_hash=story_hash)
    except MStory.DoesNotExist:
        story = MStory(**story_params)
        story.save()

    usersub.needs_unread_recalc = True
    usersub.save()

    self.publish_to_subscribers(feed)

    MFetchHistory.add(feed_id=feed.pk, fetch_type='push')
    logging.user(user, "~FCNewsletter feed story: ~SB%s~SN / ~SB%s" % (story.story_title, feed))

    return story
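Both revisions of receive_newsletter derive the feed identity from the email's From header via a split_sender helper that isn't shown here. Under the assumption that it performs standard address parsing, the standard library can do the equivalent:

from email.utils import parseaddr

def split_sender(from_header):
    # "Daring Fireball <newsletter@daringfireball.net>" ->
    # ("Daring Fireball", "newsletter", "daringfireball.net")
    name, address = parseaddr(from_header)
    username, _, domain = address.partition('@')
    return name or address, username, domain

assert split_sender("Daring Fireball <newsletter@daringfireball.net>") == \
    ("Daring Fireball", "newsletter", "daringfireball.net")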