def get_follower_groups(self, user, update_cache=False):
    '''
    Gets the active and inactive follower groups together with their
    feed max length
    '''
    from feedly.feeds.love_feed import INACTIVE_USER_MAX_LENGTH, ACTIVE_USER_MAX_LENGTH
    active_follower_ids = self.get_active_follower_ids(
        user, update_cache=update_cache)
    inactive_follower_ids = self.get_inactive_follower_ids(
        user, update_cache=update_cache)

    follower_ids = active_follower_ids + inactive_follower_ids

    # chunk active and inactive followers separately so each group
    # carries its own feed max length
    active_follower_groups = list(
        chunks(active_follower_ids, self.FANOUT_CHUNK_SIZE))
    active_follower_groups = [(follower_group, ACTIVE_USER_MAX_LENGTH)
                              for follower_group in active_follower_groups]

    inactive_follower_groups = list(
        chunks(inactive_follower_ids, self.FANOUT_CHUNK_SIZE))
    inactive_follower_groups = [(follower_group, INACTIVE_USER_MAX_LENGTH)
                                for follower_group in inactive_follower_groups]

    follower_groups = active_follower_groups + inactive_follower_groups

    logger.info('divided %s fanouts into %s tasks',
                len(follower_ids), len(follower_groups))
    return follower_groups
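A hedged sketch of how the (follower_group, max_length) pairs returned above might be consumed. The feedly, user and activity names, and the max_length keyword on LoveFeed, are assumptions for illustration, not API confirmed by the snippet:

# Illustration only: iterate the (follower_group, max_length) pairs and
# apply the per-group max length when writing to each follower's feed.
# feedly, user, activity and the max_length keyword are assumed names.
for follower_group, max_length in feedly.get_follower_groups(user):
    for follower_id in follower_group:
        feed = LoveFeed(follower_id, max_length=max_length)
        feed.add(activity)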
def _start_fanout(self, feed_classes, user_id, operation, follower_ids=None, *args, **kwargs):
    '''
    Start fanout applies the given operation to the feeds of the user's followers

    It takes the follower ids and distributes them per fanout_chunk_size
    into smaller tasks

    :param feed_classes: the feed classes to run the operation on
    :param user_id: the user id to run the operation for
    :param operation: the operation function applied to all follower feeds
    :param follower_ids: (optionally) specify the list of followers
    :param args: args passed to the operation
    :param kwargs: kwargs passed to the operation
    '''
    user_ids = follower_ids or self.get_user_follower_ids(user_id=user_id)
    user_ids_chunks = list(chunks(user_ids, self.fanout_chunk_size))
    msg_format = 'spawning %s subtasks for %s user ids in chunks of %s users'
    logger.info(msg_format, len(user_ids_chunks),
                len(user_ids), self.fanout_chunk_size)

    # now actually create the tasks
    subs = []
    for ids_chunk in user_ids_chunks:
        for name, feed_class in feed_classes.items():
            # one task per (chunk, feed class) pair
            feed_class_dict = {name: feed_class}
            task_args = [
                self, feed_class_dict, ids_chunk, operation] + list(args)
            sub = fanout_operation.apply_async(
                args=task_args,
                kwargs=kwargs
            )
            subs.append(sub)
    return subs
def create_fanout_tasks(self, follower_ids, feed_class, operation, operation_kwargs=None, fanout_priority=None):
    '''
    Creates the fanout tasks for the given follower ids and feed class

    It takes the follower ids and distributes them per fanout_chunk_size
    into smaller tasks

    :param follower_ids: specify the list of followers
    :param feed_class: the feed class to run the operation on
    :param operation: the operation function applied to all follower feeds
    :param operation_kwargs: kwargs passed to the operation
    :param fanout_priority: the priority set to this fanout
    '''
    fanout_task = self.get_fanout_task(
        fanout_priority, feed_class=feed_class)
    if not fanout_task:
        return []
    chunk_size = self.fanout_chunk_size
    user_ids_chunks = list(chunks(follower_ids, chunk_size))
    msg_format = 'spawning %s subtasks for %s user ids in chunks of %s users'
    logger.info(
        msg_format, len(user_ids_chunks), len(follower_ids), chunk_size)

    # now actually create the tasks
    tasks = []
    for ids_chunk in user_ids_chunks:
        task = fanout_task.delay(
            feed_manager=self,
            feed_class=feed_class,
            user_ids=ids_chunk,
            operation=operation,
            operation_kwargs=operation_kwargs
        )
        tasks.append(task)
    return tasks
def batch_import(self, user_id, activities, fanout=True, chunk_size=500):
    '''
    Batch import all of the user's activities and distribute
    them to the user's followers

    **Example**::

        activities = [long list of activities]
        feedly.batch_import(13, activities, 500)

    :param user_id: the user who created the activities
    :param activities: a list of activities from this user
    :param fanout: if we should run the fanout or not
    :param chunk_size: per how many activities to run the batch operations
    '''
    activities = list(activities)
    # skip empty lists
    if not activities:
        return
    logger.info('running batch import for user %s', user_id)

    # lookup the follower ids if we need them later
    follower_ids = []
    if fanout:
        follower_ids = self.get_user_follower_ids(user_id=user_id)
        logger.info('retrieved %s follower ids', len(follower_ids))

    user_feed = self.get_user_feed(user_id)
    if activities[0].actor_id != user_id:
        raise ValueError('Send activities for only one user please')

    activity_chunks = list(chunks(activities, chunk_size))
    logger.info('processing %s items in %s chunks of %s',
                len(activities), len(activity_chunks), chunk_size)

    for index, activity_chunk in enumerate(activity_chunks):
        # first insert into the global activity storage
        self.user_feed_class.insert_activities(activity_chunk)
        logger.info(
            'inserted chunk %s (length %s) into the global activity store',
            index, len(activity_chunk))
        # next add the activities to the user's personal timeline
        user_feed.add_many(activity_chunk, trim=False)
        logger.info(
            'inserted chunk %s (length %s) into the user feed',
            index, len(activity_chunk))
        # now start a big fanout task
        if fanout:
            logger.info('starting task fanout for chunk %s', index)
            # create the fanout tasks
            operation_kwargs = dict(activities=activity_chunk, trim=False)
            for feed_class in self.feed_classes.values():
                self.create_fanout_tasks(
                    follower_ids,
                    feed_class,
                    add_operation,
                    operation_kwargs=operation_kwargs
                )
def _add_many(redis, score_value_pairs):
    # in the original source this is a nested helper inside add_many,
    # where key is closed over from the enclosing scope; results is
    # initialised here so the snippet stands alone
    results = []
    # flatten [(score, value), ...] into [score, value, score, value, ...]
    score_value_list = sum(map(list, score_value_pairs), [])
    score_value_chunks = chunks(score_value_list, 200)

    for score_value_chunk in score_value_chunks:
        result = redis.zadd(key, *score_value_chunk)
        logger.debug('adding to %s with score_value_chunk %s',
                     key, score_value_chunk)
        results.append(result)
    return results
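The sum(map(list, ...), []) idiom above flattens score/value pairs into the positional style that the legacy StrictRedis zadd signature (key, score1, name1, score2, name2, ...) expects. A tiny self-contained illustration with made-up data:

# Made-up data, purely to illustrate the flattening step above
pairs = [(1, 'a'), (2, 'b')]
flat = sum(map(list, pairs), [])
assert flat == [1, 'a', 2, 'b']  # i.e. zadd(key, 1, 'a', 2, 'b')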
def add_to_storage(self, key, activities, batch_interface=None, *args, **kwargs):
    '''
    Insert multiple columns using client.insert or batch_interface.insert
    '''
    batch = batch_interface or BatchQuery()
    activity_chunks = chunks(activities.itervalues(), 50)
    for activity_chunk in activity_chunks:
        for model_instance in activity_chunk:
            model_instance.feed_id = str(key)
            model_instance.batch(batch).save()
    # only execute the batch we created ourselves; a caller-supplied
    # batch_interface is the caller's responsibility to execute
    if batch_interface is None:
        batch.execute()
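A hedged usage sketch: passing your own BatchQuery (the cqlengine batch context) lets several storage calls share one batch. The storage instance, feed keys and activities here are assumptions for illustration:

# Hedged sketch: storage, the feed keys and activities are assumed.
# With an explicit batch_interface, both inserts ride in one batch and
# execute together when the context manager exits.
from cqlengine import BatchQuery

with BatchQuery() as batch:
    storage.add_to_storage('feed:13', activities, batch_interface=batch)
    storage.add_to_storage('feed:14', activities, batch_interface=batch)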
def batch_import(self, user_id, activities, chunk_size=500):
    '''
    Batch import all of the user's activities and distribute
    them to the user's followers

    **Example**::

        activities = [long list of activities]
        feedly.batch_import(13, activities, 500)

    :param user_id: the user who created the activities
    :param activities: a list of activities from this user
    :param chunk_size: per how many activities to run the batch operations
    '''
    activities = list(activities)
    # skip empty lists
    if not activities:
        return
    logger.info('running batch import for user %s', user_id)
    follower_ids = self.get_user_follower_ids(user_id=user_id)
    logger.info('retrieved %s follower ids', len(follower_ids))

    user_feed = self.get_user_feed(user_id)
    if activities[0].actor_id != user_id:
        raise ValueError('Send activities for only one user please')

    activity_chunks = list(chunks(activities, chunk_size))
    logger.info('processing %s items in %s chunks of %s',
                len(activities), len(activity_chunks), chunk_size)

    for index, activity_chunk in enumerate(activity_chunks):
        # first insert into the global activity storage
        self.user_feed_class.insert_activities(activity_chunk)
        logger.info(
            'inserted chunk %s (length %s) into the global activity store',
            index, len(activity_chunk))
        # next add the activities to the user's personal timeline
        user_feed.add_many(activity_chunk)
        logger.info(
            'inserted chunk %s (length %s) into the user feed',
            index, len(activity_chunk))
        # now start a big fanout task
        logger.info('starting task fanout for chunk %s', index)
        self._start_fanout(
            self.feed_classes,
            user_id,
            add_operation,
            follower_ids=follower_ids,
            activities=activity_chunk,
            # disable trimming during the import as it's really, really slow
            trim=False
        )
def test_remove_love(self):
    from entity.models import Love
    thessa = User.objects.get(pk=13)
    profile = thessa.get_profile()
    follower_ids = profile.cached_follower_ids()[:100]
    love = Love.objects.all()[:1][0]
    connection = get_redis_connection()
    # divide the followers in groups of 10000
    follower_groups = chunks(follower_ids, 10000)
    for follower_group in follower_groups:
        # now, for these 10000 items pipeline/thread away
        with connection.map() as redis:
            activity = love.create_activity()
            for follower_id in follower_group:
                feed = LoveFeed(follower_id, redis=redis)
                feed.remove(activity)
def test_add_love(self):
    from entity.models import Love
    thessa = User.objects.get(pk=13)
    profile = thessa.get_profile()
    follower_ids = profile.cached_follower_ids()[:100]
    love = Love.objects.all()[:1][0]
    connection = get_redis_connection()
    # divide the followers in groups of 10000
    follower_groups = chunks(follower_ids, 10000)
    for follower_group in follower_groups:
        # now, for these 10000 items pipeline/thread away
        with connection.map() as redis:
            activity = Activity(
                love.user, LoveVerb, love, love.user,
                time=love.created_at, extra_context=dict(hello='world'))
            for follower_id in follower_group:
                feed = LoveFeed(follower_id, redis=redis)
                feed.add(activity)
def _fanout(self, user, operation, *args, **kwargs):
    '''
    Generic functionality for running an operation on all of your
    followers' feeds

    It takes the follower ids and distributes them per FANOUT_CHUNK_SIZE
    '''
    following_ids = self.get_follower_ids(user)
    following_groups = chunks(following_ids, self.FANOUT_CHUNK_SIZE)
    feeds = []
    for following_group in following_groups:
        # now, for these items pipeline/thread away via an async task
        from feedly.tasks import fanout_love_feedly
        fanout_love_feedly.delay(
            self, user, following_group, operation, *args, **kwargs)

    # reset the feeds to get out of the distributed mode
    # (note: feeds is never populated above, so this loop is a no-op)
    connection = get_redis_connection()
    for feed in feeds:
        feed.redis = connection

    return feeds
def test_one_chunk(self):
    chunked = chunks(range(2), 5)
    chunked = list(chunked)
    self.assertEqual(chunked, [(0, 1)])
def test_chunks(self):
    chunked = chunks(range(6), 2)
    chunked = list(chunked)
    self.assertEqual(chunked, [(0, 1), (2, 3), (4, 5)])
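For reference, a minimal sketch of the chunks() helper assumed throughout these snippets (feedly ships its own version in feedly.utils; this sketch simply matches the tuple-yielding behaviour the two tests above expect):

from itertools import islice

# Minimal sketch, not the library's code: lazily yield successive
# n-sized tuples, with a shorter final tuple if the input does not
# divide evenly.
def chunks(iterable, n=10000):
    iterator = iter(iterable)
    while True:
        chunk = tuple(islice(iterator, n))
        if not chunk:
            break
        yield chunk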