def reconfigure_duplicate_cluster(original, cluster_outside):
    # when a finding is deleted and is the original of a duplicate cluster, we have to choose a new original for the cluster
    # only look for a new original if there is one outside this test
    if original is None or cluster_outside is None or len(cluster_outside) == 0:
        return

    if settings.DUPLICATE_CLUSTER_CASCADE_DELETE:
        cluster_outside.order_by('-id').delete()
    else:
        logger.debug('reconfigure_duplicate_cluster: cluster_outside: %s', cluster_outside)
        # set new original to first finding in cluster (ordered by id)
        new_original = cluster_outside.order_by('id').first()
        if new_original:
            logger.debug('changing original of duplicate cluster %d to: %s:%s', original.id, new_original.id, new_original.title)
            new_original.duplicate = False
            new_original.duplicate_finding = None
            new_original.active = True
            new_original.save_no_options()
            new_original.found_by.set(original.found_by.all())

        # if the cluster is size 1, there's only the new original left
        if new_original and len(cluster_outside) > 1:
            # for find in cluster_outside:
            #     if find != new_original:
            #         find.duplicate_finding = new_original
            #         find.save_no_options()
            mass_model_updater(Finding, cluster_outside, lambda f: set_new_original(f, new_original),
                               fields=['duplicate_finding'])
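
# set_new_original is referenced above but not shown here. Given that
# mass_model_updater is called with fields=['duplicate_finding'], the callable is
# assumed to mirror the commented-out loop: repoint every remaining cluster member
# (except the new original itself) at the new original. A minimal sketch under that
# assumption; the project's actual helper may differ.
def set_new_original(finding, new_original):
    # skip the new original itself; everything else in the cluster points at it
    if finding != new_original:
        finding.duplicate_finding = new_original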
def npm_censor_hashes(apps, schema_editor):
    # We can't import the models directly as it may be a newer
    # version than this migration expects. We use the historical version.
    logger.info('Removing random hashes from npm audit file_paths')
    now = timezone.now()
    Finding = apps.get_model('dojo', 'Finding')
    Test_Type = apps.get_model('dojo', 'Test_Type')
    npm_audit, _ = Test_Type.objects.get_or_create(name='NPM Audit Scan')
    findings = Finding.objects.filter(test__test_type=npm_audit)
    mass_model_updater(Finding, findings, lambda f: censor_hashes(f), fields=['file_path', 'hash_code'])
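
# censor_hashes is defined elsewhere; the sketch below only illustrates what a
# callable used with fields=['file_path', 'hash_code'] is assumed to do here:
# strip the random hash npm audit embeds in file_path (which breaks deduplication)
# and rebuild hash_code from the cleaned value. The regex and the hashing scheme
# are illustrative assumptions, not the project's exact code.
import hashlib
import re

def censor_hashes(finding):
    if finding.file_path:
        # assumed format: drop long hex blobs so identical findings share a file_path
        finding.file_path = re.sub(r'[a-f0-9]{32,}', '', finding.file_path)
    # assumed recompute: historical models from apps.get_model() carry no custom
    # methods, so the hash is rebuilt inline from title and the cleaned file_path
    finding.hash_code = hashlib.sha256(
        f'{finding.title}|{finding.file_path}'.encode()).hexdigest()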
def handle(self, *args, **options):
    restrict_to_parsers = options['parser']
    hash_code_only = options['hash_code_only']
    dedupe_only = options['dedupe_only']
    dedupe_sync = options['dedupe_sync']

    if restrict_to_parsers is not None:
        findings = Finding.objects.filter(test__test_type__name__in=restrict_to_parsers)
        logger.info("######## Will process only parsers %s and %d findings ########",
                    *restrict_to_parsers, findings.count())
    else:
        # add filter on id to make counts not slow on mysql
        findings = Finding.objects.all().filter(id__gt=0)
        logger.info("######## Will process the full database with %d findings ########", findings.count())

    # Phase 1: update hash_codes without deduplicating
    if not dedupe_only:
        logger.info("######## Start Updating Hashcodes (foreground) ########")

        # only prefetch here for hash_code calculation
        finds = findings.prefetch_related('endpoints', 'test__test_type')
        mass_model_updater(Finding, finds, lambda f: generate_hash_code(f), fields=['hash_code'],
                           order='asc', log_prefix='hash_code computation ')
        logger.info("######## Done Updating Hashcodes ########")

    # Phase 2: deduplicate synchronously
    if not hash_code_only:
        if get_system_setting('enable_deduplication'):
            logger.info("######## Start deduplicating (%s) ########",
                        ('foreground' if dedupe_sync else 'background'))
            if dedupe_sync:
                mass_model_updater(Finding, findings, lambda f: do_dedupe_finding(f), fields=None,
                                   order='desc', page_size=100, log_prefix='deduplicating ')
            else:
                # async tasks only need the id
                mass_model_updater(Finding, findings.only('id'), lambda f: do_dedupe_finding_task(f.id),
                                   fields=None, order='desc', log_prefix='deduplicating ')

            # update the grading (if enabled)
            logger.debug('Updating grades for products...')
            for product in Product.objects.all():
                calculate_grade(product)

            logger.info("######## Done deduplicating (%s) ########",
                        ('foreground' if dedupe_sync else 'tasks submitted to celery'))
        else:
            logger.debug("skipping dedupe because it's disabled in system settings")
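
# mass_model_updater is the workhorse every function in this file relies on. Its
# real implementation lives elsewhere; the sketch below only mirrors the call sites
# above (model class, queryset, per-object callable, fields to bulk-save, ordering,
# page size, log prefix) and is an assumption about how such a helper might look,
# not the project's actual code.
def mass_model_updater(model, qs, func, fields, page_size=1000, order='asc', log_prefix=''):
    qs = qs.order_by('id' if order == 'asc' else '-id')
    batch = []
    for obj in qs.iterator(chunk_size=page_size):
        # mutate the instance in place, or dispatch an async task, per object
        func(obj)
        if fields:
            batch.append(obj)
            if len(batch) >= page_size:
                model.objects.bulk_update(batch, fields)
                logger.debug('%supdated %d objects', log_prefix, len(batch))
                batch = []
    if fields and batch:
        model.objects.bulk_update(batch, fields)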
def reset_duplicates_before_delete(qs):
    mass_model_updater(Finding, qs, lambda f: reset_duplicate_before_delete(f),
                       fields=['duplicate', 'duplicate_finding'])
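
# reset_duplicate_before_delete is referenced but not shown. Given
# fields=['duplicate', 'duplicate_finding'], it is assumed to simply detach a
# finding from its cluster before deletion; a minimal sketch under that assumption.
def reset_duplicate_before_delete(dupe):
    dupe.duplicate_finding = None
    dupe.duplicate = False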