def test_look_for_workcluster_deduplication_task(self, Lock, _):
    """The maintenance task collapses duplicate clusters while holding the Redis lock once."""
    lock = Lock.return_value
    lock.__enter__ = Mock(return_value=None)
    lock.__exit__ = Mock(return_value=None)

    duplicated_works = Work.objects.filter(id__in=self.work_ids)
    # Deliberately build the same cluster several times to create duplicates.
    for _round in range(5):
        create_work_cluster(duplicated_works, perform_union=False)
    self.assertEqual(WorkCluster.objects.count(), 5)

    tasks.look_for_workclusters()

    # The task must have compressed all duplicates into a single cluster,
    # acquiring the lock exactly once along the way.
    self.assertEqual(WorkCluster.objects.count(), 1)
    self.assertEqual(lock.__enter__.call_count, 1)
def test_merge_work_clusters(self):
    """Merging five clusters built over the same works leaves exactly one cluster."""
    works = Work.objects.filter(id__in=self.work_ids)
    clusters = [
        create_work_cluster(works, perform_union=False)
        for _ in range(5)
    ]
    self.assertEqual(WorkCluster.objects.count(), 5)

    merge_work_clusters(*clusters)

    self.assertEqual(WorkCluster.objects.count(), 1)
def look_for_workclusters(steal_workcluster: bool = False):
    """
    A maintenance Celery Task which clusters works in the database,
    creating WorkCluster objects.

    Args:
        steal_workcluster (bool): Allow for this task to merge non-automatic
            WorkClusters with automatic ones (i.e. if a WorkCluster is deemed
            to be the same but its user is human, whether we steal its
            WorkCluster to merge it with a new one). When False, only
            automatic clusters (user=None) are eligible for merging.

    Returns:
        None.
    """
    logger.info('Looking for easy WorkCluster to create...')
    with redis_lock.Lock(redis.StrictRedis(connection_pool=redis_pool),
                         'lock-wc-lookout',
                         expire=DEFAULT_LOCK_EXPIRATION_TIME):
        logger.info('Acquired Redis lock.')
        # MAL-created duplicates: same title *within the same category*.
        duplicates = Work.objects.values('title', 'category_id').annotate(
            Count('id')).filter(id__count__gte=2)
        for dupe in duplicates.iterator():
            # FIX: also filter on category_id — duplicates are grouped by
            # (title, category_id), so matching on title alone could lump
            # same-titled works from different categories into one cluster.
            works = Work.objects.filter(
                title=dupe['title'],
                category_id=dupe['category_id']).prefetch_related(
                    'workcluster_set')
            cluster = create_work_cluster(works)
            logger.info('Clustered {} works. ({})'.format(
                len(works), cluster.id))
        logger.info('Clustering done.')

        logger.info('Compressing redundant work clusters.')
        for work in Work.objects.prefetch_related(
                'workcluster_set').iterator():
            # Only merge automatic unprocessed work clusters.
            cluster_filter = Q(status='unprocessed')
            if not steal_workcluster:
                # Don't be evil. Don't steal human WorkClusters.
                # FIX: restrict (AND) rather than widen (OR) — with `|=`
                # the filter also matched human-owned unprocessed clusters,
                # which is exactly the stealing this flag should prevent.
                cluster_filter &= Q(user=None)
            clusters = work.workcluster_set.filter(cluster_filter).order_by(
                'id').all()
            if len(clusters) > 1:
                merge_work_clusters(*clusters)
                logger.info('{} clusters merged.'.format(len(clusters)))
        logger.info('Compression done.')
def test_create_work_clusters_with_union(self):
    """With perform_union=True, repeated cluster creation keeps a single cluster."""
    target_works = Work.objects.filter(id__in=self.work_ids)
    for _attempt in range(5):
        create_work_cluster(target_works, perform_union=True)
    # Each creation was unioned into the existing cluster, so only one remains.
    self.assertEqual(WorkCluster.objects.count(), 1)