def test_merge_work_clusters(self):
    works = Work.objects.filter(id__in=self.work_ids)

    # Build five independent clusters over the same works, without
    # merging the underlying Work rows yet.
    clusters = []
    for _ in range(5):
        clusters.append(create_work_cluster(works, perform_union=False))
    self.assertEqual(WorkCluster.objects.count(), 5)

    # Merging collapses the five clusters into a single WorkCluster.
    merge_work_clusters(*clusters)
    self.assertEqual(WorkCluster.objects.count(), 1)
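
# For context, a minimal sketch of the fixture the test above assumes: a
# setUp that creates a few Work rows and records their ids in self.work_ids.
# The Category lookup, slug, and titles are hypothetical illustrations, not
# taken from the original code.
def setUp(self):
    category = Category.objects.get(slug='anime')  # hypothetical category
    self.work_ids = [
        Work.objects.create(title=title, category=category).id
        for title in ('Example Work', 'Example Work 2', 'Example Work 3')
    ]
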
def look_for_workclusters(steal_workcluster: bool = False):
    """
    A maintenance Celery task which clusters works in the database,
    creating WorkCluster objects.

    Args:
        steal_workcluster (bool): Allow this task to merge human-made
            WorkClusters with automatic ones. If False (the default),
            a WorkCluster owned by a human user is left alone, even when
            it covers the same works as a new automatic cluster.

    Returns:
        None.
    """
    logger.info('Looking for easy WorkClusters to create...')
    with redis_lock.Lock(redis.StrictRedis(connection_pool=redis_pool),
                         'lock-wc-lookout',
                         expire=DEFAULT_LOCK_EXPIRATION_TIME):
        logger.info('Acquired Redis lock.')

        # MAL-created duplicates: works sharing both title and category.
        duplicates = (Work.objects.values('title', 'category_id')
                      .annotate(Count('id'))
                      .filter(id__count__gte=2))
        for dupe in duplicates.iterator():
            # Filter on category_id as well, since duplicates were grouped
            # by (title, category_id) above; matching on title alone would
            # cluster same-titled works across categories.
            works = Work.objects.filter(
                title=dupe['title'],
                category_id=dupe['category_id'],
            ).prefetch_related('workcluster_set')
            cluster = create_work_cluster(works)
            logger.info('Clustered {} works. ({})'.format(
                len(works), cluster.id))
        logger.info('Clustering done.')

        logger.info('Compressing redundant work clusters.')
        for work in Work.objects.prefetch_related(
                'workcluster_set').iterator():
            # Only merge automatic unprocessed work clusters.
            cluster_filter = Q(status='unprocessed')
            if not steal_workcluster:
                # Don't be evil. Don't steal human WorkClusters: restrict
                # the merge to clusters with no owner.
                cluster_filter &= Q(user=None)
            clusters = (work.workcluster_set.filter(cluster_filter)
                        .order_by('id').all())
            if len(clusters) > 1:
                merge_work_clusters(*clusters)
                logger.info('{} clusters merged.'.format(len(clusters)))
        logger.info('Compression done.')
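
# How this maintenance task might be scheduled, as a minimal sketch: the
# excerpt above does not show the Celery app or any beat schedule, so the
# app instance, registered task name, and timing below are assumptions for
# illustration only.
from celery import Celery
from celery.schedules import crontab

app = Celery('tasks')  # stand-in for the project's real Celery app
app.conf.beat_schedule = {
    'workcluster-lookout': {
        'task': 'look_for_workclusters',        # assumed task name
        'schedule': crontab(hour=3, minute=0),  # run nightly at 03:00
        'kwargs': {'steal_workcluster': False},  # never merge human clusters
    },
}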