def _create_problems(self, db, problemplugin, report_min_count=0, speedup=False):
    """
    Cluster the reports of ``problemplugin`` and attach them to problems.

    Reports are converted with ``problemplugin._db_report_to_satyr``, grouped
    by ``self._create_clusters`` and split further by cutting a satyr
    dendrogram. Every resulting group of reports is treated as one problem.

    With ``speedup`` enabled, only the reports returned by
    ``get_reports_for_problems`` and ``get_unassigned_reports`` are loaded.
    Reports that already have a ``problem_id`` keep it; unassigned reports
    are attached to the problem of an assigned report from the same group,
    or to a newly created ``Problem`` when the group has none.
    Without ``speedup``, the reports from ``get_reports_by_type`` are used
    and the problem for each group is supplied by ``self._iter_problems``.

    Reports that could not be converted to a satyr report but still carry a
    ``problem_id`` are detached from their problem at the end.

    :param db: database handle; ``db.session`` is used to add, expire and
        flush objects
    :param problemplugin: plugin providing ``name`` and
        ``_db_report_to_satyr()``
    :param report_min_count: forwarded as ``min_count`` to the report
        queries; when greater than zero,
        ``remove_problem_from_low_count_reports_by_type`` is called as well
    :param speedup: select the incremental code path described above
    """
    if speedup:
        db_reports = get_reports_for_problems(db, problemplugin.name)
        db_reports += get_unassigned_reports(db, problemplugin.name,
                                             min_count=report_min_count)
    else:
        db_reports = get_reports_by_type(db, problemplugin.name,
                                         min_count=report_min_count)

    db_problems = get_problems(db)

    # NOTE(review): problems_dict and reuse_problems are only consumed by
    # the non-speedup branch (via self._iter_problems) further below.

    # dict to get db_problem by problem_id
    self.log_debug("Creating problem reuse dict")
    problems_dict = {db_problem.id: db_problem for db_problem in db_problems}

    # dict to get report_ids by problem_id
    problem_report = defaultdict(list)
    for db_report in db_reports:
        if db_report.problem_id is not None:
            problem_report[db_report.problem_id].append(db_report.id)

    # create lookup dict for problems, keyed by the sorted report ids
    reuse_problems = {tuple(sorted(report_ids)): problem_id
                      for problem_id, report_ids in problem_report.items()}

    invalid_report_ids_to_clean = []
    problems = []
    if not db_reports:
        self.log_info("No reports found")
    elif len(db_reports) == 1:
        # Nothing to cluster against; only an unassigned report needs work
        db_report = db_reports[0]
        if db_report.problem is None:
            problems.append([db_report])
    else:
        report_map = {}
        _satyr_reports = []
        report_count = len(db_reports)
        for i, db_report in enumerate(db_reports, 1):
            self.log_debug("[{0} / {1}] Loading report #{2}"
                           .format(i, report_count, db_report.id))

            _satyr_report = problemplugin._db_report_to_satyr(db_report)
            if _satyr_report is None:
                self.log_debug("Unable to create satyr report")
                # Remember assigned reports that can no longer be clustered
                if db_report.problem_id is not None:
                    invalid_report_ids_to_clean.append(db_report.id)
            else:
                _satyr_reports.append(_satyr_report)
                report_map[_satyr_report] = db_report

            db.session.expire(db_report)

        self.log_debug("Clustering")
        # NOTE(review): 2000 is presumably a cluster size limit - confirm
        # against self._create_clusters
        clusters = self._create_clusters(_satyr_reports, 2000)
        # Threads that share no function with another thread
        unique_func_threads = set(_satyr_reports) - set().union(*clusters)

        dendrograms = []
        cluster_count = len(clusters)
        for i, cluster in enumerate(clusters, 1):
            self.log_debug("[{0} / {1}] Computing distances"
                           .format(i, cluster_count))
            distances = satyr.Distances(cluster, len(cluster))

            self.log_debug("Getting dendrogram")
            dendrograms.append(satyr.Dendrogram(distances))

        # NOTE(review): cut(0.3, 1) presumably means (distance level,
        # minimal group size) - confirm against the satyr bindings
        for dendrogram, cluster in zip(dendrograms, clusters):
            for dups in dendrogram.cut(0.3, 1):
                problems.append({report_map[cluster[dup]] for dup in dups})

        # Unique threads form their own unique problems
        for thread in unique_func_threads:
            problems.append({report_map[thread]})

    self.log_info("Creating problems from clusters")
    if speedup:
        for problem in problems:
            if not problem:
                continue

            first_report = next(iter(problem))

            if len(problem) == 1:
                # The report is assigned
                if first_report.problem_id:
                    continue

                # One report that wasn't matched with anything else
                new = Problem()
                new.first_occurrence = first_report.first_occurrence
                new.last_occurrence = first_report.last_occurrence
                db.session.add(new)
                # Flush so that new.id is available below
                db.session.flush()

                self.update_comps(db, {first_report.component: 1}, new)
                first_report.problem_id = new.id
                continue

            # Find assigned report (the last one found wins)
            origin_report = None
            for candidate in problem:
                if candidate.problem_id:
                    origin_report = candidate

            if origin_report is None:
                # Problem created only from new reports
                db_problem = Problem()
                db.session.add(db_problem)
                # Flush so that db_problem.id is available below
                db.session.flush()
                problem_id = db_problem.id
                seed_report = first_report
            else:
                # Looked up only after the reports have been updated
                db_problem = None
                problem_id = origin_report.problem_id
                seed_report = origin_report

            first_occurrence = seed_report.first_occurrence
            last_occurrence = seed_report.last_occurrence
            comps = {}
            for rep in problem:
                # Already assigned reports keep their problem
                if not rep.problem_id:
                    rep.problem_id = problem_id

                first_occurrence = min(first_occurrence, rep.first_occurrence)
                last_occurrence = max(last_occurrence, rep.last_occurrence)
                comps[rep.component] = comps.get(rep.component, 0) + 1

            if db_problem is None:
                db_problem = get_problem_by_id(db, problem_id)

            self.update_comps(db, comps, db_problem)
            db_problem.last_occurrence = last_occurrence
            db_problem.first_occurrence = first_occurrence

        db.session.flush()
    else:
        for problem, db_problem, reports_changed in self._iter_problems(
                db, problems, db_problems, problems_dict, reuse_problems):
            comps = {}

            problem_last_occurrence = None
            problem_first_occurrence = None
            for db_report in problem:
                db_report.problem = db_problem

                if (problem_last_occurrence is None or
                        problem_last_occurrence < db_report.last_occurrence):
                    problem_last_occurrence = db_report.last_occurrence

                if (problem_first_occurrence is None or
                        problem_first_occurrence > db_report.first_occurrence):
                    problem_first_occurrence = db_report.first_occurrence

                comps[db_report.component] = comps.get(db_report.component, 0) + 1

            # In case nothing changed, we don't want to mark db_problem
            # dirty which would cause another UPDATE
            if db_problem.first_occurrence != problem_first_occurrence:
                db_problem.first_occurrence = problem_first_occurrence
            if db_problem.last_occurrence != problem_last_occurrence:
                db_problem.last_occurrence = problem_last_occurrence

            if reports_changed:
                self.update_comps(db, comps, db_problem)

    self.log_debug("Removing {0} invalid reports from problems"
                   .format(len(invalid_report_ids_to_clean)))
    for report_id in invalid_report_ids_to_clean:
        db_report = get_report_by_id(db, report_id)
        if db_report is not None:
            db_report.problem_id = None
            db.session.add(db_report)

    if report_min_count > 0:
        self.log_debug("Removing problems from low count reports")
        remove_problem_from_low_count_reports_by_type(db,
                                                      problemplugin.name,
                                                      min_count=report_min_count)

    self.log_debug("Flushing session")
    db.session.flush()
def _create_problems(self, db, problemplugin):
    """
    Cluster the reports of ``problemplugin`` and attach them to problems.

    All reports from ``get_reports_by_type`` are converted with
    ``problemplugin._db_report_to_satyr``, grouped by
    ``self._create_clusters`` and split further by cutting a satyr
    dendrogram. Every resulting group of reports is treated as one problem;
    the problem object for each group is supplied by
    ``self._iter_problems``.

    For every problem the first/last occurrence is recomputed from its
    reports. When ``self._iter_problems`` signals that the set of reports
    changed, missing ``ProblemComponent`` rows are created, ordered by how
    many reports belong to each component.

    Reports that could not be converted to a satyr report but still carry a
    ``problem_id`` are detached from their problem at the end.

    :param db: database handle; ``db.session`` is used to add, expire and
        flush objects
    :param problemplugin: plugin providing ``name`` and
        ``_db_report_to_satyr()``
    """
    db_reports = get_reports_by_type(db, problemplugin.name)
    db_problems = get_problems(db)

    # dict to get db_problem by problem_id
    self.log_debug("Creating problem reuse dict")
    problems_dict = {db_problem.id: db_problem for db_problem in db_problems}

    # dict to get report_ids by problem_id
    problem_report = defaultdict(list)
    for db_report in db_reports:
        if db_report.problem_id is not None:
            problem_report[db_report.problem_id].append(db_report.id)

    # create lookup dict for problems, keyed by the sorted report ids
    reuse_problems = {tuple(sorted(report_ids)): problem_id
                      for problem_id, report_ids in problem_report.items()}

    invalid_report_ids_to_clean = []
    problems = []
    if not db_reports:
        self.log_info("No reports found")
    elif len(db_reports) == 1:
        # Nothing to cluster against; only an unassigned report needs work
        db_report = db_reports[0]
        if db_report.problem is None:
            problems.append([db_report])
    else:
        report_map = {}
        _satyr_reports = []
        report_count = len(db_reports)
        for i, db_report in enumerate(db_reports, 1):
            self.log_debug("[{0} / {1}] Loading report #{2}"
                           .format(i, report_count, db_report.id))

            _satyr_report = problemplugin._db_report_to_satyr(db_report)
            if _satyr_report is None:
                self.log_debug("Unable to create satyr report")
                # Remember assigned reports that can no longer be clustered
                if db_report.problem_id is not None:
                    invalid_report_ids_to_clean.append(db_report.id)
            else:
                _satyr_reports.append(_satyr_report)
                report_map[_satyr_report] = db_report

            db.session.expire(db_report)

        self.log_debug("Clustering")
        # NOTE(review): 2000 is presumably a cluster size limit - confirm
        # against self._create_clusters
        clusters = self._create_clusters(_satyr_reports, 2000)
        # Satyr reports that did not end up in any cluster
        unique_func_threads = set(_satyr_reports) - set().union(*clusters)

        dendrograms = []
        cluster_count = len(clusters)
        for i, cluster in enumerate(clusters, 1):
            self.log_debug("[{0} / {1}] Computing distances"
                           .format(i, cluster_count))
            distances = satyr.Distances(cluster, len(cluster))

            self.log_debug("Getting dendrogram")
            dendrograms.append(satyr.Dendrogram(distances))

        # NOTE(review): cut(0.3, 1) presumably means (distance level,
        # minimal group size) - confirm against the satyr bindings
        for dendrogram, cluster in zip(dendrograms, clusters):
            for dups in dendrogram.cut(0.3, 1):
                problems.append({report_map[cluster[dup]] for dup in dups})

        # Unclustered reports each form a problem of their own
        for thread in unique_func_threads:
            problems.append({report_map[thread]})

    self.log_info("Creating problems from clusters")
    for problem, db_problem, reports_changed in self._iter_problems(
            db, problems, db_problems, problems_dict, reuse_problems):
        comps = {}

        problem_last_occurrence = None
        problem_first_occurrence = None
        for db_report in problem:
            db_report.problem = db_problem

            if (problem_last_occurrence is None or
                    problem_last_occurrence < db_report.last_occurrence):
                problem_last_occurrence = db_report.last_occurrence

            if (problem_first_occurrence is None or
                    problem_first_occurrence > db_report.first_occurrence):
                problem_first_occurrence = db_report.first_occurrence

            comps[db_report.component] = comps.get(db_report.component, 0) + 1

        # In case nothing changed, we don't want to mark db_problem dirty
        # which would cause another UPDATE
        if db_problem.first_occurrence != problem_first_occurrence:
            db_problem.first_occurrence = problem_first_occurrence
        if db_problem.last_occurrence != problem_last_occurrence:
            db_problem.last_occurrence = problem_last_occurrence

        if reports_changed:
            # Components with the most reports come first
            db_comps = sorted(comps, key=comps.get, reverse=True)

            for order, db_component in enumerate(db_comps, 1):
                db_pcomp = get_problem_component(db, db_problem, db_component)
                # NOTE(review): an existing ProblemComponent is left
                # untouched, so its stored order is not refreshed - confirm
                # this is intended
                if db_pcomp is None:
                    db_pcomp = ProblemComponent()
                    db_pcomp.problem = db_problem
                    db_pcomp.component = db_component
                    db_pcomp.order = order
                    db.session.add(db_pcomp)

    self.log_debug("Removing {0} invalid reports from problems"
                   .format(len(invalid_report_ids_to_clean)))
    for report_id in invalid_report_ids_to_clean:
        db_report = get_report_by_id(db, report_id)
        if db_report is not None:
            db_report.problem_id = None
            db.session.add(db_report)

    self.log_debug("Flushing session")
    db.session.flush()