Пример #1
0
    def test_create_problems_removes_empty_problems(self):
        """
        The create-problems action must delete problems that have no reports.
        """
        session = self.db.session

        # A problem that no report points to.
        orphan = Problem(first_occurrence=datetime.date.today(),
                         last_occurrence=datetime.date.today())
        session.add(orphan)
        session.flush()

        self.call_action("create-problems")

        # The orphan was the only problem stored, so none may remain.
        remaining = self.db.session.query(Problem).count()
        self.assertEqual(remaining, 0)
Пример #2
0
    def _create_problems(self, db, problemplugin,
                         report_min_count=0, speedup=False):
        """
        Group the reports of one problem type into problems and store them.

        Reports are loaded for ``problemplugin.name``, converted to satyr
        reports, pre-grouped by ``self._create_clusters`` and split further by
        cutting a satyr dendrogram. Every resulting group of reports is then
        attached to a ``Problem`` and the session is flushed.

        :param db: database handle; ``db.session`` is used for add, flush
            and expire.
        :param problemplugin: plugin object providing ``name`` and
            ``_db_report_to_satyr``.
        :param report_min_count: forwarded as ``min_count`` to the report
            queries. When greater than zero in non-speedup mode,
            ``remove_problem_from_low_count_reports_by_type`` is called too.
        :param speedup: when true, reports come from
            ``get_reports_for_problems`` and ``get_unassigned_reports`` and
            only reports without a ``problem_id`` are (re)assigned. Otherwise
            reports come from ``get_reports_by_type`` and the assignment is
            driven by ``self._iter_problems``.
        """
        if speedup:
            # Speedup mode combines two queries; presumably this yields fewer
            # reports than get_reports_by_type -- TODO confirm.
            db_reports = get_reports_for_problems(db, problemplugin.name)
            db_reports += get_unassigned_reports(db, problemplugin.name,
                                                 min_count=report_min_count)
        else:
            db_reports = get_reports_by_type(db, problemplugin.name,
                                             min_count=report_min_count)
        db_problems = get_problems(db)

        # The three dicts below are only consumed by self._iter_problems in
        # the non-speedup branch, but they are built in both modes.
        # dict to get db_problem by problem_id
        self.log_debug("Creating problem reuse dict")
        problems_dict = {}
        for db_problem in db_problems:
            problems_dict[db_problem.id] = db_problem
        # dict to get report_ids by problem_id
        problem_report = defaultdict(list)
        for db_report in db_reports:
            if db_report.problem_id is not None:
                problem_report[db_report.problem_id].append(db_report.id)
        # create lookup dict for problems
        # (key: sorted tuple of report ids, value: id of the problem that
        # currently owns exactly those reports)
        reuse_problems = {}
        for (problem_id, report_ids) in problem_report.items():
            reuse_problems[tuple(sorted(report_ids))] = problem_id

        # Ids of already-assigned reports that could not be converted to satyr
        invalid_report_ids_to_clean = []
        # Each element is one group of db reports that should share a problem
        problems = []
        if not db_reports:
            self.log_info("No reports found")
        elif len(db_reports) == 1:
            # A single report cannot be clustered; it forms a problem on its
            # own unless it already has one.
            db_report = db_reports[0]
            if db_report.problem is None:
                problems.append([db_report])
        else:
            # Maps each satyr report back to the db report it was built from
            report_map = {}
            _satyr_reports = []
            i = 0
            for db_report in db_reports:
                i += 1
                self.log_debug("[{0} / {1}] Loading report #{2}"
                               .format(i, len(db_reports), db_report.id))

                _satyr_report = problemplugin._db_report_to_satyr(db_report)
                if _satyr_report is None:
                    self.log_debug("Unable to create satyr report")
                    # Remember the report only if it is attached to a problem
                    if db_report.problem_id is not None:
                        invalid_report_ids_to_clean.append(db_report.id)
                else:
                    _satyr_reports.append(_satyr_report)
                    report_map[_satyr_report] = db_report

                # presumably expires the loaded attributes so they are
                # re-read on next access -- TODO confirm against the ORM docs
                db.session.expire(db_report)

            self.log_debug("Clustering")
            # NOTE(review): the meaning of 2000 is defined by _create_clusters
            # (presumably a size limit) -- confirm.
            clusters = self._create_clusters(_satyr_reports, 2000)
            # Threads that share no function with another thread
            # (computed as the satyr reports that are in none of the clusters)
            unique_func_threads = set(_satyr_reports) - set().union(*clusters)

            # One dendrogram per cluster, in the same order as `clusters`
            dendrograms = []
            i = 0
            for cluster in clusters:
                i += 1
                self.log_debug("[{0} / {1}] Computing distances"
                               .format(i, len(clusters)))
                distances = satyr.Distances(cluster, len(cluster))

                self.log_debug("Getting dendrogram")
                dendrograms.append(satyr.Dendrogram(distances))

            for dendrogram, cluster in zip(dendrograms, clusters):
                problem = []
                # NOTE(review): 0.3 and 1 are presumably the cut level and the
                # minimum group size -- confirm against the satyr bindings.
                # `dups` holds indexes into `cluster`.
                for dups in dendrogram.cut(0.3, 1):
                    reports = set(report_map[cluster[dup]] for dup in dups)
                    problem.append(reports)

                problems.extend(problem)

            # Unique threads form their own unique problems
            for thread in unique_func_threads:
                problems.append({report_map[thread]})

        self.log_info("Creating problems from clusters")
        if speedup:
            for problem in problems:
                if not problem:
                    continue
                # `problem` is a set or a list, so take its first element
                # through an iterator
                first_report = next(iter(problem))
                if len(problem) > 1:
                    # Find assigned report
                    # (no break: the last assigned report in iteration order
                    # is the one that is kept)
                    origin_report = None
                    for db_report in problem:
                        if db_report.problem_id:
                            origin_report = db_report

                    # Problem created only from new reports
                    # comps counts reports per component for update_comps
                    comps = {}
                    if not origin_report:
                        new = Problem()
                        db.session.add(new)
                        # presumably flushed so that new.id is populated
                        # before it is copied to the reports -- TODO confirm
                        db.session.flush()
                        first_occurrence = first_report.first_occurrence
                        last_occurrence = first_report.last_occurrence
                        for rep in problem:
                            rep.problem_id = new.id

                            # Widen the occurrence range to cover this report
                            if first_occurrence > rep.first_occurrence:
                                first_occurrence = rep.first_occurrence
                            if last_occurrence < rep.last_occurrence:
                                last_occurrence = rep.last_occurrence

                            if rep.component not in comps:
                                comps[rep.component] = 0

                            comps[rep.component] += 1
                        self.update_comps(db, comps, new)
                        new.last_occurrence = last_occurrence
                        new.first_occurrence = first_occurrence

                    else:
                        # Attach the unassigned reports to the problem of the
                        # assigned one. Reports in this group that already
                        # have a problem_id are left untouched.
                        first_occurrence = origin_report.first_occurrence
                        last_occurrence = origin_report.last_occurrence
                        for rep in problem:
                            if not rep.problem_id:
                                rep.problem_id = origin_report.problem_id

                                if first_occurrence > rep.first_occurrence:
                                    first_occurrence = rep.first_occurrence
                                if last_occurrence < rep.last_occurrence:
                                    last_occurrence = rep.last_occurrence

                                if rep.component not in comps:
                                    comps[rep.component] = 0

                                comps[rep.component] += 1
                        orig_p = get_problem_by_id(db, origin_report.problem_id)
                        self.update_comps(db, comps, orig_p)
                        # NOTE(review): the range is seeded from origin_report,
                        # not from orig_p, so the stored range of orig_p is
                        # overwritten and could shrink if the problem has
                        # other reports outside it -- confirm this is intended.
                        orig_p.last_occurrence = last_occurrence
                        orig_p.first_occurrence = first_occurrence
                else:
                    # The report is assigned
                    if first_report.problem_id:
                        continue
                    else:
                        # One report that wasn't matched with anything else
                        new = Problem()
                        new.first_occurrence = first_report.first_occurrence
                        new.last_occurrence = first_report.last_occurrence
                        db.session.add(new)
                        db.session.flush()

                        self.update_comps(db, {first_report.component: 1}, new)
                        first_report.problem_id = new.id
            # NOTE(review): invalid_report_ids_to_clean is collected above but
            # never processed in speedup mode -- confirm this is intended.
            db.session.flush()

        else:
            # _iter_problems pairs every group of reports with the Problem
            # row to use and says whether the report set changed
            for problem, db_problem, reports_changed in self._iter_problems(
                    db, problems, db_problems, problems_dict, reuse_problems):

                # Number of reports per component, passed to update_comps
                comps = {}

                problem_last_occurrence = None
                problem_first_occurrence = None
                for db_report in problem:
                    db_report.problem = db_problem

                    # Track the latest last_occurrence among the reports
                    if (problem_last_occurrence is None or
                            problem_last_occurrence < db_report.last_occurrence):
                        problem_last_occurrence = db_report.last_occurrence

                    # Track the earliest first_occurrence among the reports
                    if (problem_first_occurrence is None or
                            problem_first_occurrence > db_report.first_occurrence):
                        problem_first_occurrence = db_report.first_occurrence

                    if db_report.component not in comps:
                        comps[db_report.component] = 0

                    comps[db_report.component] += 1

                # In case nothing changed, we don't want to mark db_problem
                # dirty which would cause another UPDATE
                if db_problem.first_occurrence != problem_first_occurrence:
                    db_problem.first_occurrence = problem_first_occurrence
                if db_problem.last_occurrence != problem_last_occurrence:
                    db_problem.last_occurrence = problem_last_occurrence

                # Components are only updated when the report set changed
                if reports_changed:
                    self.update_comps(db, comps, db_problem)

            # Detach reports that could not be converted to satyr reports
            self.log_debug("Removing {0} invalid reports from problems"
                           .format(len(invalid_report_ids_to_clean)))
            for report_id in invalid_report_ids_to_clean:
                # The report is fetched again by id instead of reusing the
                # object that was expired in the loading loop
                db_report = get_report_by_id(db, report_id)
                if db_report is not None:
                    db_report.problem_id = None
                    db.session.add(db_report)

            if report_min_count > 0:
                self.log_debug("Removing problems from low count reports")
                remove_problem_from_low_count_reports_by_type(db,
                                                              problemplugin.name,
                                                              min_count=report_min_count)

            self.log_debug("Flushing session")
            db.session.flush()
Пример #3
0
    def _iter_problems(self, db, problems, db_problems, problems_dict,
                       reuse_problems):
        """
        Yields (problem, db_problem, reports_changed) tuples.
        """
        # Three phases, see below

        # Counts for statistics
        i = 0
        lookedup_count = 0
        found_count = 0
        created_count = 0
        # List of problems left for the second phase
        second_pass = list()
        # List of possible matches for the second phase
        match_list = list()
        # Set of db_problems that were used in on of the phases. A db_problem
        # must be yielded at most once.
        db_problems_used = set()
        # Phase one: try to look up precise matches
        for problem in problems:
            i += 1

            self.log_debug("[{0} / {1}] Processing cluster"
                           .format(i, len(problems)))

            reports_changed = True
            problem_id = reuse_problems.get(
                tuple(sorted([db_report.id for db_report in problem])), None)
            if problem_id is not None:
                db_problem = problems_dict.get(problem_id, None)
                reports_changed = False
                lookedup_count += 1
                self.log_debug("Looked up existing problem #{0}"
                               .format(db_problem.id))
            else:
                matches = self._find_problem_matches(db_problems, problem)
                if not matches:
                    # No possible match found, must be a new problem
                    db_problem = Problem()
                    db.session.add(db_problem)
                    created_count += 1
                else:
                    # Leave the problems for the second phase
                    match_list += matches
                    second_pass.append(problem)
                    continue

            db_problems_used.add(db_problem)
            yield (problem, db_problem, reports_changed)

        # Phase two: yield problems in order of best match
        self.log_debug("Matching existing problems")
        self.log_debug("{0} possible matches".format(len(match_list)))
        for match_metric, problem, db_problem in sorted(match_list,
                                                        key=itemgetter(0),
                                                        reverse=True):
            if problem not in second_pass:
                self.log_debug("Already matched")
                continue
            if db_problem in db_problems_used:
                self.log_debug("Problem already used")
                continue
            found_count += 1
            second_pass.remove(problem)
            db_problems_used.add(db_problem)
            self.log_debug("Found existing problem #{0} ({1:.2f})"
                           .format(db_problem.id, match_metric))
            yield (problem, db_problem, True)

        # Phase three: create new problems if no match was found above
        self.log_debug("Processing {0} leftover problems"
                       .format(len(second_pass)))
        for problem in second_pass:
            self.log_debug("Creating problem")
            db_problem = Problem()
            db.session.add(db_problem)
            created_count += 1
            yield (problem, db_problem, True)

        self.log_debug("Total: {0}  Looked up: {1}  Found: {2}  Created: {3}"
                       .format(i, lookedup_count, found_count, created_count))
Пример #4
0
    def _create_problems(self, db, problemplugin,
                         report_min_count=0, speedup=False):
        """
        Group the reports of one problem type into problems and store them.

        Reports are loaded for ``problemplugin.name``, converted to satyr
        reports, pre-grouped by ``self._create_clusters`` and split further by
        cutting a satyr dendrogram. Every resulting group of reports is then
        attached to a ``Problem`` and the session is flushed.

        :param db: database handle; ``db.session`` is used for add, flush
            and expire.
        :param problemplugin: plugin object providing ``name`` and
            ``_db_report_to_satyr``.
        :param report_min_count: forwarded as ``min_count`` to the report
            queries. When greater than zero in non-speedup mode,
            ``remove_problem_from_low_count_reports_by_type`` is called too.
        :param speedup: when true, reports come from
            ``get_reports_for_problems`` and ``get_unassigned_reports`` and
            only reports without a ``problem_id`` are (re)assigned. Otherwise
            reports come from ``get_reports_by_type`` and the assignment is
            driven by ``self._iter_problems``.
        """
        if speedup:
            # Speedup mode combines two queries; presumably this yields fewer
            # reports than get_reports_by_type -- TODO confirm.
            db_reports = get_reports_for_problems(db, problemplugin.name)
            db_reports += get_unassigned_reports(db, problemplugin.name,
                                                 min_count=report_min_count)
        else:
            db_reports = get_reports_by_type(db, problemplugin.name,
                                             min_count=report_min_count)
        db_problems = get_problems(db)

        # The three dicts below are only consumed by self._iter_problems in
        # the non-speedup branch, but they are built in both modes.
        # dict to get db_problem by problem_id
        self.log_debug("Creating problem reuse dict")
        problems_dict = {}
        for db_problem in db_problems:
            problems_dict[db_problem.id] = db_problem
        # dict to get report_ids by problem_id
        problem_report = defaultdict(list)
        for db_report in db_reports:
            if db_report.problem_id is not None:
                problem_report[db_report.problem_id].append(db_report.id)
        # create lookup dict for problems
        # (key: sorted tuple of report ids, value: id of the problem that
        # currently owns exactly those reports)
        reuse_problems = {}
        for (problem_id, report_ids) in problem_report.items():
            reuse_problems[tuple(sorted(report_ids))] = problem_id

        # Ids of already-assigned reports that could not be converted to satyr
        invalid_report_ids_to_clean = []
        # Each element is one group of db reports that should share a problem
        problems = []
        if not db_reports:
            self.log_info("No reports found")
        elif len(db_reports) == 1:
            # A single report cannot be clustered; it forms a problem on its
            # own unless it already has one.
            db_report = db_reports[0]
            if db_report.problem is None:
                problems.append([db_report])
        else:
            # Maps each satyr report back to the db report it was built from
            report_map = {}
            _satyr_reports = []
            i = 0
            for db_report in db_reports:
                i += 1
                self.log_debug("[{0} / {1}] Loading report #{2}"
                               .format(i, len(db_reports), db_report.id))

                _satyr_report = problemplugin._db_report_to_satyr(db_report)
                if _satyr_report is None:
                    self.log_debug("Unable to create satyr report")
                    # Remember the report only if it is attached to a problem
                    if db_report.problem_id is not None:
                        invalid_report_ids_to_clean.append(db_report.id)
                else:
                    _satyr_reports.append(_satyr_report)
                    report_map[_satyr_report] = db_report

                # presumably expires the loaded attributes so they are
                # re-read on next access -- TODO confirm against the ORM docs
                db.session.expire(db_report)

            self.log_debug("Clustering")
            # NOTE(review): the meaning of 2000 is defined by _create_clusters
            # (presumably a size limit) -- confirm.
            clusters = self._create_clusters(_satyr_reports, 2000)
            # Threads that share no function with another thread
            # (computed as the satyr reports that are in none of the clusters)
            unique_func_threads = set(_satyr_reports) - set().union(*clusters)

            # One dendrogram per cluster, in the same order as `clusters`
            dendrograms = []
            i = 0
            for cluster in clusters:
                i += 1
                self.log_debug("[{0} / {1}] Computing distances"
                               .format(i, len(clusters)))
                distances = satyr.Distances(cluster, len(cluster))

                self.log_debug("Getting dendrogram")
                dendrograms.append(satyr.Dendrogram(distances))

            for dendrogram, cluster in zip(dendrograms, clusters):
                problem = []
                # NOTE(review): 0.3 and 1 are presumably the cut level and the
                # minimum group size -- confirm against the satyr bindings.
                # `dups` holds indexes into `cluster`.
                for dups in dendrogram.cut(0.3, 1):
                    reports = set(report_map[cluster[dup]] for dup in dups)
                    problem.append(reports)

                problems.extend(problem)

            # Unique threads form their own unique problems
            for thread in unique_func_threads:
                problems.append({report_map[thread]})

        self.log_info("Creating problems from clusters")
        if speedup:
            for problem in problems:
                if not problem:
                    continue
                # `problem` is a set or a list, so take its first element
                # through an iterator
                first_report = next(iter(problem))
                if len(problem) > 1:
                    # Find assigned report
                    # (no break: the last assigned report in iteration order
                    # is the one that is kept)
                    origin_report = None
                    for db_report in problem:
                        if db_report.problem_id:
                            origin_report = db_report

                    # Problem created only from new reports
                    # comps counts reports per component for update_comps
                    comps = {}
                    if not origin_report:
                        new = Problem()
                        db.session.add(new)
                        # presumably flushed so that new.id is populated
                        # before it is copied to the reports -- TODO confirm
                        db.session.flush()
                        first_occurrence = first_report.first_occurrence
                        last_occurrence = first_report.last_occurrence
                        for rep in problem:
                            rep.problem_id = new.id

                            # Widen the occurrence range to cover this report
                            if first_occurrence > rep.first_occurrence:
                                first_occurrence = rep.first_occurrence
                            if last_occurrence < rep.last_occurrence:
                                last_occurrence = rep.last_occurrence

                            if rep.component not in comps:
                                comps[rep.component] = 0

                            comps[rep.component] += 1
                        self.update_comps(db, comps, new)
                        new.last_occurrence = last_occurrence
                        new.first_occurrence = first_occurrence

                    else:
                        # Attach the unassigned reports to the problem of the
                        # assigned one. Reports in this group that already
                        # have a problem_id are left untouched.
                        first_occurrence = origin_report.first_occurrence
                        last_occurrence = origin_report.last_occurrence
                        for rep in problem:
                            if not rep.problem_id:
                                rep.problem_id = origin_report.problem_id

                                if first_occurrence > rep.first_occurrence:
                                    first_occurrence = rep.first_occurrence
                                if last_occurrence < rep.last_occurrence:
                                    last_occurrence = rep.last_occurrence

                                if rep.component not in comps:
                                    comps[rep.component] = 0

                                comps[rep.component] += 1
                        orig_p = get_problem_by_id(db, origin_report.problem_id)
                        self.update_comps(db, comps, orig_p)
                        # NOTE(review): the range is seeded from origin_report,
                        # not from orig_p, so the stored range of orig_p is
                        # overwritten and could shrink if the problem has
                        # other reports outside it -- confirm this is intended.
                        orig_p.last_occurrence = last_occurrence
                        orig_p.first_occurrence = first_occurrence
                else:
                    # The report is assigned
                    if first_report.problem_id:
                        continue
                    else:
                        # One report that wasn't matched with anything else
                        new = Problem()
                        new.first_occurrence = first_report.first_occurrence
                        new.last_occurrence = first_report.last_occurrence
                        db.session.add(new)
                        db.session.flush()

                        self.update_comps(db, {first_report.component: 1}, new)
                        first_report.problem_id = new.id
            # NOTE(review): invalid_report_ids_to_clean is collected above but
            # never processed in speedup mode -- confirm this is intended.
            db.session.flush()

        else:
            # _iter_problems pairs every group of reports with the Problem
            # row to use and says whether the report set changed
            for problem, db_problem, reports_changed in self._iter_problems(
                    db, problems, db_problems, problems_dict, reuse_problems):

                # Number of reports per component, passed to update_comps
                comps = {}

                problem_last_occurrence = None
                problem_first_occurrence = None
                for db_report in problem:
                    db_report.problem = db_problem

                    # Track the latest last_occurrence among the reports
                    if (problem_last_occurrence is None or
                            problem_last_occurrence < db_report.last_occurrence):
                        problem_last_occurrence = db_report.last_occurrence

                    # Track the earliest first_occurrence among the reports
                    if (problem_first_occurrence is None or
                            problem_first_occurrence > db_report.first_occurrence):
                        problem_first_occurrence = db_report.first_occurrence

                    if db_report.component not in comps:
                        comps[db_report.component] = 0

                    comps[db_report.component] += 1

                # In case nothing changed, we don't want to mark db_problem
                # dirty which would cause another UPDATE
                if db_problem.first_occurrence != problem_first_occurrence:
                    db_problem.first_occurrence = problem_first_occurrence
                if db_problem.last_occurrence != problem_last_occurrence:
                    db_problem.last_occurrence = problem_last_occurrence

                # Components are only updated when the report set changed
                if reports_changed:
                    self.update_comps(db, comps, db_problem)

            # Detach reports that could not be converted to satyr reports
            self.log_debug("Removing {0} invalid reports from problems"
                           .format(len(invalid_report_ids_to_clean)))
            for report_id in invalid_report_ids_to_clean:
                # The report is fetched again by id instead of reusing the
                # object that was expired in the loading loop
                db_report = get_report_by_id(db, report_id)
                if db_report is not None:
                    db_report.problem_id = None
                    db.session.add(db_report)

            if report_min_count > 0:
                self.log_debug("Removing problems from low count reports")
                remove_problem_from_low_count_reports_by_type(db,
                                                              problemplugin.name,
                                                              min_count=report_min_count)

            self.log_debug("Flushing session")
            db.session.flush()
Пример #5
0
    def _create_problems(
            self,
            db,
            problemplugin,  #pylint: disable=too-many-statements
            report_min_count=0,
            speedup=False):
        """
        Group the reports of one problem type into problems and store them.

        Reports are loaded for ``problemplugin.name``, converted to satyr
        reports on a thread pool, pre-grouped by ``self._create_clusters`` and
        split further by cutting a satyr dendrogram. Every resulting group of
        reports is then attached to a ``Problem`` and the session is flushed.

        :param db: database handle; ``db.session`` is used for add, flush
            and expire_all.
        :param problemplugin: plugin object providing ``name`` and
            ``db_report_to_satyr``.
        :param report_min_count: forwarded as ``min_count`` to the report
            queries. When greater than zero in non-speedup mode,
            ``remove_problem_from_low_count_reports_by_type`` is called too.
        :param speedup: when true, reports come from
            ``get_reports_for_problems`` and ``get_unassigned_reports``, the
            dendrogram cut value is raised by 10 % and only reports without a
            ``problem_id`` are (re)assigned. Otherwise reports come from
            ``get_reports_by_type`` and the assignment is driven by
            ``self._iter_problems``.
        """
        if speedup:
            # Speedup mode combines two queries; presumably this yields fewer
            # reports than get_reports_by_type -- TODO confirm.
            self.log_debug("[%s] Getting reports for problems",
                           problemplugin.name)
            db_reports = get_reports_for_problems(db, problemplugin.name)

            self.log_debug("[%s] Getting unassigned reports",
                           problemplugin.name)
            db_reports += get_unassigned_reports(db,
                                                 problemplugin.name,
                                                 min_count=report_min_count)
        else:
            db_reports = get_reports_by_type(db,
                                             problemplugin.name,
                                             min_count=report_min_count)
        db_problems = get_problems(db)

        # The three dicts below are only consumed by self._iter_problems in
        # the non-speedup branch, but they are built in both modes.
        # dict to get db_problem by problem_id
        self.log_debug("Creating problem reuse dict")
        problems_dict = {}
        for db_problem in db_problems:
            problems_dict[db_problem.id] = db_problem
        # dict to get report_ids by problem_id
        problem_report = defaultdict(list)
        for db_report in db_reports:
            if db_report.problem_id is not None:
                problem_report[db_report.problem_id].append(db_report.id)
        # create lookup dict for problems
        # (key: sorted tuple of report ids, value: id of the problem that
        # currently owns exactly those reports)
        reuse_problems = {}
        for (problem_id, report_ids) in problem_report.items():
            reuse_problems[tuple(sorted(report_ids))] = problem_id

        # Ids of already-assigned reports that could not be converted to satyr
        invalid_report_ids_to_clean = []
        # Each element is one group of db reports that should share a problem
        problems = []
        if not db_reports:
            self.log_info("No reports found")
        elif len(db_reports) == 1:
            # A single report cannot be clustered; it forms a problem on its
            # own unless it already has one.
            db_report = db_reports[0]
            if db_report.problem is None:
                problems.append([db_report])
        else:
            # Maps each satyr report back to the db report it was built from
            report_map = {}
            _satyr_reports = []
            db_reports_len = len(db_reports)
            # Starts at 1 because it is logged before being incremented
            n_processed = 1

            # split the work to multiple workers
            # NOTE(review): db_report_to_satyr runs on worker threads with the
            # loaded report objects; whether that is safe depends on the
            # session and plugin implementation -- confirm.
            with ThreadPoolExecutor(self._max_workers) as executor:
                # schedule db_reports for processing
                # (futures maps each Future to the report it was created from)
                futures = {
                    executor.submit(problemplugin.db_report_to_satyr, report):
                    report
                    for report in db_reports
                }

                # Results are handled in completion order, not in the order
                # of db_reports
                for future in as_completed(futures):
                    db_report = futures.pop(future)
                    self.log_debug("[%d / %d] Loading report #%d", n_processed,
                                   db_reports_len, db_report.id)

                    # result() re-raises any exception raised in the worker
                    _satyr_report = future.result()
                    if _satyr_report is None:
                        self.log_debug("Unable to create satyr report")
                        # Remember the report only if it is attached to a
                        # problem
                        if db_report.problem_id is not None:
                            invalid_report_ids_to_clean.append(db_report.id)
                    else:
                        _satyr_reports.append(_satyr_report)
                        report_map[_satyr_report] = db_report

                    n_processed += 1

                # presumably expires every object in the session so attributes
                # are re-read on next access -- TODO confirm against ORM docs
                db.session.expire_all()

            self.log_debug("Clustering")
            # NOTE(review): the meaning of 2000 is defined by _create_clusters
            # (presumably a size limit) -- confirm.
            clusters = self._create_clusters(_satyr_reports, 2000)
            # Threads that share no function with another thread
            # (computed as the satyr reports that are in none of the clusters)
            unique_func_threads = set(_satyr_reports) - set().union(*clusters)

            # One dendrogram per cluster, in the same order as `clusters`
            dendrograms = []
            clusters_len = len(clusters)
            for i, cluster in enumerate(clusters, start=1):
                self.log_debug("[%d / %d] Computing distances", i,
                               clusters_len)
                distances = satyr.Distances(cluster, len(cluster))

                self.log_debug("Getting dendrogram")
                dendrograms.append(satyr.Dendrogram(distances))

            # Value passed to dendrogram.cut below; speedup mode uses a value
            # that is 10 % larger.
            # NOTE(review): presumably a distance threshold -- confirm
            # against the satyr bindings.
            dendogram_cut = 0.3
            if speedup:
                dendogram_cut = dendogram_cut * 1.1

            for dendrogram, cluster in zip(dendrograms, clusters):
                problem = []
                # `dups` holds indexes into `cluster`
                for dups in dendrogram.cut(dendogram_cut, 1):
                    reports = set(report_map[cluster[dup]] for dup in dups)
                    problem.append(reports)

                problems.extend(problem)

            # Unique threads form their own unique problems
            for thread in unique_func_threads:
                problems.append({report_map[thread]})

        self.log_info("Creating problems from clusters")
        if speedup:
            for problem in problems:
                if not problem:
                    continue
                # `problem` is a set or a list, so take its first element
                # through an iterator
                first_report = next(iter(problem))
                if len(problem) > 1:
                    # Find assigned report
                    # (no break: the last assigned report in iteration order
                    # is the one that is kept)
                    origin_report = None
                    for db_report in problem:
                        if db_report.problem_id:
                            origin_report = db_report

                    # Problem created only from new reports
                    # comps counts reports per component for update_comps
                    comps = {}
                    if not origin_report:
                        new = Problem()
                        db.session.add(new)
                        # presumably flushed so that new.id is populated
                        # before it is copied to the reports -- TODO confirm
                        db.session.flush()
                        first_occurrence = first_report.first_occurrence
                        last_occurrence = first_report.last_occurrence
                        for rep in problem:
                            rep.problem_id = new.id

                            # Widen the occurrence range to cover this report
                            if first_occurrence > rep.first_occurrence:
                                first_occurrence = rep.first_occurrence
                            if last_occurrence < rep.last_occurrence:
                                last_occurrence = rep.last_occurrence

                            if rep.component not in comps:
                                comps[rep.component] = 0

                            comps[rep.component] += 1
                        self.update_comps(db, comps, new)
                        new.last_occurrence = last_occurrence
                        new.first_occurrence = first_occurrence

                    else:
                        # Attach the unassigned reports to the problem of the
                        # assigned one. Reports in this group that already
                        # have a problem_id are left untouched.
                        first_occurrence = origin_report.first_occurrence
                        last_occurrence = origin_report.last_occurrence
                        for rep in problem:
                            if not rep.problem_id:
                                rep.problem_id = origin_report.problem_id

                                if first_occurrence > rep.first_occurrence:
                                    first_occurrence = rep.first_occurrence
                                if last_occurrence < rep.last_occurrence:
                                    last_occurrence = rep.last_occurrence

                                if rep.component not in comps:
                                    comps[rep.component] = 0

                                comps[rep.component] += 1
                        orig_p = get_problem_by_id(db,
                                                   origin_report.problem_id)
                        self.update_comps(db, comps, orig_p)
                        # NOTE(review): the range is seeded from origin_report,
                        # not from orig_p, so the stored range of orig_p is
                        # overwritten and could shrink if the problem has
                        # other reports outside it -- confirm this is intended.
                        orig_p.last_occurrence = last_occurrence
                        orig_p.first_occurrence = first_occurrence
                else:
                    # The report is assigned
                    if first_report.problem_id:
                        continue
                    # One report that wasn't matched with anything else
                    new = Problem()
                    new.first_occurrence = first_report.first_occurrence
                    new.last_occurrence = first_report.last_occurrence
                    db.session.add(new)
                    db.session.flush()

                    self.update_comps(db, {first_report.component: 1}, new)
                    first_report.problem_id = new.id
            # NOTE(review): invalid_report_ids_to_clean is collected above but
            # never processed in speedup mode -- confirm this is intended.
            db.session.flush()

        else:
            # _iter_problems pairs every group of reports with the Problem
            # row to use and says whether the report set changed
            for problem, db_problem, reports_changed in self._iter_problems(
                    db, problems, db_problems, problems_dict, reuse_problems):

                # Number of reports per component, passed to update_comps
                comps = {}

                problem_last_occurrence = None
                problem_first_occurrence = None
                for db_report in problem:
                    db_report.problem = db_problem

                    # Track the latest last_occurrence among the reports
                    if (problem_last_occurrence is None
                            or problem_last_occurrence <
                            db_report.last_occurrence):
                        problem_last_occurrence = db_report.last_occurrence

                    # Track the earliest first_occurrence among the reports
                    if (problem_first_occurrence is None
                            or problem_first_occurrence >
                            db_report.first_occurrence):
                        problem_first_occurrence = db_report.first_occurrence

                    if db_report.component not in comps:
                        comps[db_report.component] = 0

                    comps[db_report.component] += 1

                # In case nothing changed, we don't want to mark db_problem
                # dirty which would cause another UPDATE
                if db_problem.first_occurrence != problem_first_occurrence:
                    db_problem.first_occurrence = problem_first_occurrence
                if db_problem.last_occurrence != problem_last_occurrence:
                    db_problem.last_occurrence = problem_last_occurrence

                # Components are only updated when the report set changed
                if reports_changed:
                    self.update_comps(db, comps, db_problem)

            # Detach reports that could not be converted to satyr reports
            self.log_debug("Removing %d invalid reports from problems",
                           len(invalid_report_ids_to_clean))
            unassign_reports(db, invalid_report_ids_to_clean)

            if report_min_count > 0:
                self.log_debug("Removing problems from low count reports")
                remove_problem_from_low_count_reports_by_type(
                    db, problemplugin.name, min_count=report_min_count)

            self.log_debug("Flushing session")
            db.session.flush()
Пример #6
0
    def _create_problems(self, db, problemplugin):
        """
        Group reports of the given problem plugin into problems.

        Reports are loaded with ``get_reports_by_type`` for
        ``problemplugin.name``, converted to satyr reports, clustered and
        cut into groups of duplicates. Every group is then attached to a
        ``Problem``: an existing one when the very same set of report ids
        already forms a problem or when ``self._find_problem`` returns one,
        otherwise a newly created one. The problem's first/last occurrence
        are widened to cover all of its reports and, when the set of reports
        changed, missing ``ProblemComponent`` rows are added.

        :param db: database handle; ``db.session`` is used to add objects,
            expire reports and flush at the end.
        :param problemplugin: plugin providing ``name`` and
            ``_db_report_to_satyr``.
        """
        db_reports = get_reports_by_type(db, problemplugin.name)
        db_problems = get_problems(db)

        # dict to get db_problem by problem_id
        self.log_debug("Creating problem reuse dict")
        problems_dict = {db_problem.id: db_problem
                         for db_problem in db_problems}

        # dict to get report_ids by problem_id
        problem_report = defaultdict(list)
        for db_report in db_reports:
            if db_report.problem_id is not None:
                problem_report[db_report.problem_id].append(db_report.id)

        # create lookup dict for problems: a sorted tuple of report ids maps
        # to the problem that currently consists of exactly those reports
        reuse_problems = {tuple(sorted(report_ids)): problem_id
                          for problem_id, report_ids in problem_report.items()}

        problems = []
        if not db_reports:
            self.log_info("No reports found")
        elif len(db_reports) == 1:
            # A single report cannot be clustered; it only needs a problem
            # if it does not have one yet.
            db_report = db_reports[0]
            if db_report.problem is None:
                problems.append([db_report])
        else:
            report_map = {}
            _satyr_reports = []
            for i, db_report in enumerate(db_reports, start=1):
                self.log_debug("[{0} / {1}] Loading report #{2}"
                               .format(i, len(db_reports), db_report.id))

                _satyr_report = problemplugin._db_report_to_satyr(db_report)
                if _satyr_report is None:
                    self.log_debug("Unable to create satyr report")
                else:
                    _satyr_reports.append(_satyr_report)
                    report_map[_satyr_report] = db_report

                # Drop the loaded state of the report from the session
                # (presumably to keep memory usage down - TODO confirm).
                db.session.expire(db_report)

            self.log_debug("Clustering")
            clusters = self._create_clusters(_satyr_reports, 2000)
            # Reports that did not end up in any cluster become
            # single-report problems below.
            unique_func_threads = set(_satyr_reports) - set().union(*clusters)

            dendrograms = []
            for i, cluster in enumerate(clusters, start=1):
                self.log_debug("[{0} / {1}] Computing distances"
                               .format(i, len(clusters)))
                distances = satyr.Distances(cluster, len(cluster))

                self.log_debug("Getting dendrogram")
                dendrograms.append(satyr.Dendrogram(distances))

            for dendrogram, cluster in zip(dendrograms, clusters):
                # Each group returned by cut() holds indices into `cluster`.
                problems.extend(
                    {report_map[cluster[dup]] for dup in dups}
                    for dups in dendrogram.cut(0.3, 1))

            for thread in unique_func_threads:
                problems.append({report_map[thread]})

        self.log_info("Creating problems")
        lookedup_count = 0
        found_count = 0
        created_count = 0
        for i, problem in enumerate(problems, start=1):
            self.log_debug("[{0} / {1}] Creating problem"
                           .format(i, len(problems)))

            reports_changed = True
            problem_id = reuse_problems.get(
                tuple(sorted(db_report.id for db_report in problem)), None)
            if problem_id is not None:
                # Exactly the same set of reports already forms a problem,
                # so its components do not need to be recomputed.
                db_problem = problems_dict.get(problem_id, None)
                reports_changed = False
                lookedup_count += 1
                self.log_debug("Looked up existing problem #{0}"
                               .format(db_problem.id))
            else:
                db_problem = self._find_problem(db_problems, problem)
                found_count += 1

            if db_problem is None:
                db_problem = Problem()
                db.session.add(db_problem)

                db_problems.append(db_problem)
                created_count += 1

            # number of reports per component within this problem
            comps = defaultdict(int)
            for db_report in problem:
                db_report.problem = db_problem

                # last_occurrence is the latest of all reports
                if (db_problem.last_occurrence is None or
                        db_problem.last_occurrence <
                        db_report.last_occurrence):
                    db_problem.last_occurrence = db_report.last_occurrence

                # first_occurrence is the earliest of all reports, hence
                # it is replaced only by an older date
                if (db_problem.first_occurrence is None or
                        db_problem.first_occurrence >
                        db_report.first_occurrence):
                    db_problem.first_occurrence = db_report.first_occurrence

                comps[db_report.component] += 1

            if reports_changed:
                # components with the most reports come first
                db_comps = sorted(comps, key=comps.get, reverse=True)

                for order, db_component in enumerate(db_comps, start=1):
                    db_pcomp = get_problem_component(db, db_problem,
                                                     db_component)
                    if db_pcomp is None:
                        db_pcomp = ProblemComponent()
                        db_pcomp.problem = db_problem
                        db_pcomp.component = db_component
                        db_pcomp.order = order
                        db.session.add(db_pcomp)

        self.log_debug("Total: {0}  Looked up: {1}  Found: {2}  Created: {3}"
                       .format(len(problems), lookedup_count, found_count,
                               created_count))
        self.log_debug("Flushing session")
        db.session.flush()