Example No. 1
    def run(self, cmdline, db):
        """
        Mark a problem probably fixed if there is a new build of the problem's
        affected package, for which no crash reports have come in.
        """

        try:
            tasks = self._get_tasks(cmdline, db)
        except FafError as ex:
            self.log_error(
                "Unable to process command line arguments: {0}".format(
                    str(ex)))
            return 1

        problems = get_problems(db)

        task_i = 0
        for osplugin, db_release in tasks:
            task_i += 1

            self.log_info("[{0} / {1}] Processing '{2} {3}'".format(
                task_i, len(tasks), osplugin.nice_name, db_release.version))

            self.log_debug("Getting builds...")
            opsys_builds = osplugin.get_released_builds(db_release.version)

            newest_builds = {}
            all_builds = {}
            now = datetime.now()
            for build in opsys_builds:
                age = now - build["completion_time"]
                # If a hot new build comes out, we need to wait a certain
                # period of time for people to use it before we can make
                # conclusions about it being a probable fix.
                if age.days >= osplugin.build_aging_days:
                    if build["name"] not in newest_builds:
                        newest_builds[build["name"]] = build

                    if build["name"] not in all_builds:
                        all_builds[build["name"]] = [build]
                    else:
                        all_builds[build["name"]].append(build)

            probably_fixed_total = 0
            problems_in_release = 0
            problem_counter = 0
            for problem in problems:
                problem_counter += 1
                self.log_debug("Processing problem ID:{0} {1}/{2}:".format(
                    problem.id, problem_counter, len(problems)))
                affected_newest = {}
                affected_not_found = False

                reports_for_release = \
                    get_reports_for_opsysrelease(db, problem.id, db_release.id)

                # For all the reports, we need the affected packages and their
                # newest versions.
                if reports_for_release:
                    problems_in_release += 1
                else:
                    self.log_debug(
                        " This problem doesn't appear in this release.")
                    self._save_probable_fix(db, problem, db_release, None)
                    # Next problem
                    continue

                for report in reports_for_release:
                    # First we try to find the affected package among the known
                    # packages.
                    affected_known = [
                        (affected.build.base_package_name,
                         affected.build.epoch, affected.build.version,
                         affected.build.release)
                        for affected in get_crashed_package_for_report(
                            db, report.id)
                    ]

                    # Then among the unknown packages.
                    affected_unknown = \
                        get_crashed_unknown_package_nevr_for_report(db, report.id)
                    # We get the base package name directly from the report
                    affected_unknown = [(report.component.name, affected[1],
                                         affected[2], affected[3])
                                        for affected in affected_unknown]

                    affected_all = affected_known + affected_unknown
                    if not affected_all:
                        affected_not_found = True
                        break

                    for affected in affected_all:
                        if affected[0] in affected_newest:
                            # If a problem contains multiple reports with the same
                            # affected package, we only want the newest version of
                            # it.
                            affected_newest[affected[0]]['reports'].append(report)
                            if cmp_evr(affected[1:],
                                       affected_newest[affected[0]]['nevr'][1:]) > 0:
                                affected_newest[affected[0]]['nevr'] = affected
                        else:
                            affected_newest[affected[0]] = {
                                'reports': [report],
                                'nevr': affected
                            }

                if affected_not_found or not affected_newest:
                    # Affected package of one of the reports was not found.
                    # We can't make any conclusions.
                    self.log_debug(" Affected package not found.")
                    self._save_probable_fix(db, problem, db_release, None)
                    # Next problem
                    continue

                if len(affected_newest) > 1:
                    # Multiple different affected packages => cannot be fixed
                    # by a single package update
                    self.log_debug(
                        " Multiple affected packages. No simple fix.")
                    self._save_probable_fix(db, problem, db_release, None)
                    # Next problem
                    continue

                probably_fixed_since = datetime.fromtimestamp(0)

                pkg = list(affected_newest.values())[0]

                name = pkg['nevr'][0]
                newest_build = newest_builds.get(name, False)
                if newest_build:
                    newest_evr = (newest_build["epoch"] or 0,
                                  newest_build["version"],
                                  newest_build["release"])
                if newest_build and cmp_evr(newest_evr, pkg['nevr'][1:]) > 0:
                    # Newest available build is newer than the newest version
                    # of the affected package. Now find the oldest such
                    # probable fix.
                    i = 0
                    while i < len(all_builds[name]) and cmp_evr(
                            (all_builds[name][i]["epoch"] or 0,
                             all_builds[name][i]["version"],
                             all_builds[name][i]["release"]), pkg['nevr'][1:]) > 0:
                        i += 1
                    completion_time = all_builds[name][i - 1]["completion_time"]
                    probably_fixed_since = max(completion_time,
                                               probably_fixed_since)
                    pkg["probable_fix"] = (name,
                                           all_builds[name][i - 1]["epoch"] or 0,
                                           all_builds[name][i - 1]["version"],
                                           all_builds[name][i - 1]["release"])

                    self._save_probable_fix(db, problem, db_release,
                                            pkg["probable_fix"],
                                            probably_fixed_since)
                    self.log_debug("  Probably fixed for {0} days.".format(
                        (datetime.now() - probably_fixed_since).days))
                    probably_fixed_total += 1
                else:
                    self._save_probable_fix(db, problem, db_release, None)
                    self.log_debug("  Not fixed.")

            db.session.flush()
            if problems_in_release > 0:
                self.log_info(
                    "{0}% of problems in this release probably fixed.".format(
                        (probably_fixed_total * 100) // problems_in_release))
            else:
                self.log_info("No problems found in this release.")
        return 0
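
The run() method above leans on cmp_evr to decide whether one (epoch, version, release) tuple is newer than another; the helper itself is not part of this listing. Below is a minimal, self-contained sketch of such a comparison, assuming a simplified RPM-style ordering (numeric segments compared as integers, other segments lexically). It is an illustration only, not faf's actual cmp_evr.

import re

def _split_segments(text):
    # Break "1.2.3a" into ["1", "2", "3", "a"] so numeric parts can be
    # compared as integers and the rest lexically.
    return re.findall(r"\d+|[a-zA-Z]+", text or "")

def _cmp_segments(left, right):
    left_parts = _split_segments(left)
    right_parts = _split_segments(right)
    for a, b in zip(left_parts, right_parts):
        if a.isdigit() and b.isdigit():
            a, b = int(a), int(b)
        elif a.isdigit() != b.isdigit():
            # Simplification: a numeric segment counts as newer than an
            # alphabetic one.
            return 1 if a.isdigit() else -1
        if a != b:
            return 1 if a > b else -1
    # All shared segments equal: the longer version wins.
    return (len(left_parts) > len(right_parts)) - (len(left_parts) < len(right_parts))

def cmp_evr(evr1, evr2):
    # evr1 and evr2 are (epoch, version, release) tuples, as used above.
    epoch1, version1, release1 = evr1
    epoch2, version2, release2 = evr2
    if (epoch1 or 0) != (epoch2 or 0):
        return 1 if (epoch1 or 0) > (epoch2 or 0) else -1
    result = _cmp_segments(version1, version2)
    if result != 0:
        return result
    return _cmp_segments(release1, release2)

For instance, cmp_evr((0, "2.1", "5.fc30"), (0, "2.1", "4.fc30")) returns 1, which is exactly the "newer than the affected package" check performed before recording a probable fix.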
Example No. 2
    def _create_problems(self, db, problemplugin,
                         report_min_count=0, speedup=False):
        if speedup:
            db_reports = get_reports_for_problems(db, problemplugin.name)
            db_reports += get_unassigned_reports(db, problemplugin.name,
                                                 min_count=report_min_count)
        else:
            db_reports = get_reports_by_type(db, problemplugin.name,
                                             min_count=report_min_count)
        db_problems = get_problems(db)

        # dict to get db_problem by problem_id
        self.log_debug("Creating problem reuse dict")
        problems_dict = {}
        for db_problem in db_problems:
            problems_dict[db_problem.id] = db_problem
        # dict to get report_ids by problem_id
        problem_report = defaultdict(list)
        for db_report in db_reports:
            if db_report.problem_id is not None:
                problem_report[db_report.problem_id].append(db_report.id)
        # create lookup dict for problems
        reuse_problems = {}
        for (problem_id, report_ids) in problem_report.items():
            reuse_problems[tuple(sorted(report_ids))] = problem_id

        invalid_report_ids_to_clean = []
        problems = []
        if not db_reports:
            self.log_info("No reports found")
        elif len(db_reports) == 1:
            db_report = db_reports[0]
            if db_report.problem is None:
                problems.append([db_report])
        else:
            report_map = {}
            _satyr_reports = []
            i = 0
            for db_report in db_reports:
                i += 1
                self.log_debug("[{0} / {1}] Loading report #{2}"
                               .format(i, len(db_reports), db_report.id))

                _satyr_report = problemplugin._db_report_to_satyr(db_report)
                if _satyr_report is None:
                    self.log_debug("Unable to create satyr report")
                    if db_report.problem_id is not None:
                        invalid_report_ids_to_clean.append(db_report.id)
                else:
                    _satyr_reports.append(_satyr_report)
                    report_map[_satyr_report] = db_report

                db.session.expire(db_report)

            self.log_debug("Clustering")
            clusters = self._create_clusters(_satyr_reports, 2000)
            # Threads that share no function with another thread
            unique_func_threads = set(_satyr_reports) - set().union(*clusters)

            dendrograms = []
            i = 0
            for cluster in clusters:
                i += 1
                self.log_debug("[{0} / {1}] Computing distances"
                               .format(i, len(clusters)))
                distances = satyr.Distances(cluster, len(cluster))

                self.log_debug("Getting dendrogram")
                dendrograms.append(satyr.Dendrogram(distances))

            for dendrogram, cluster in zip(dendrograms, clusters):
                problem = []
                for dups in dendrogram.cut(0.3, 1):
                    reports = set(report_map[cluster[dup]] for dup in dups)
                    problem.append(reports)

                problems.extend(problem)

            # Unique threads form their own unique problems
            for thread in unique_func_threads:
                problems.append({report_map[thread]})

        self.log_info("Creating problems from clusters")
        if speedup:
            for problem in problems:
                if not problem:
                    continue
                first_report = next(iter(problem))
                if len(problem) > 1:
                    # Find assigned report
                    origin_report = None
                    for db_report in problem:
                        if db_report.problem_id:
                            origin_report = db_report

                    # Problem created only from new reports
                    comps = {}
                    if not origin_report:
                        new = Problem()
                        db.session.add(new)
                        db.session.flush()
                        first_occurrence = first_report.first_occurrence
                        last_occurrence = first_report.last_occurrence
                        for rep in problem:
                            rep.problem_id = new.id

                            if first_occurrence > rep.first_occurrence:
                                first_occurrence = rep.first_occurrence
                            if last_occurrence < rep.last_occurrence:
                                last_occurrence = rep.last_occurrence

                            if rep.component not in comps:
                                comps[rep.component] = 0

                            comps[rep.component] += 1
                        self.update_comps(db, comps, new)
                        new.last_occurrence = last_occurrence
                        new.first_occurrence = first_occurrence

                    else:
                        first_occurrence = origin_report.first_occurrence
                        last_occurrence = origin_report.last_occurrence
                        for rep in problem:
                            if not rep.problem_id:
                                rep.problem_id = origin_report.problem_id

                                if first_occurrence > rep.first_occurrence:
                                    first_occurrence = rep.first_occurrence
                                if last_occurrence < rep.last_occurrence:
                                    last_occurrence = rep.last_occurrence

                                if rep.component not in comps:
                                    comps[rep.component] = 0

                                comps[rep.component] += 1
                        orig_p = get_problem_by_id(db, origin_report.problem_id)
                        self.update_comps(db, comps, orig_p)
                        orig_p.last_occurrence = last_occurrence
                        orig_p.first_occurrence = first_occurrence
                else:
                    # The report is assigned
                    if first_report.problem_id:
                        continue
                    else:
                        # One report that wasn't matched with anything else
                        new = Problem()
                        new.first_occurrence = first_report.first_occurrence
                        new.last_occurrence = first_report.last_occurrence
                        db.session.add(new)
                        db.session.flush()

                        self.update_comps(db, {first_report.component: 1}, new)
                        first_report.problem_id = new.id
            db.session.flush()

        else:
            for problem, db_problem, reports_changed in self._iter_problems(
                    db, problems, db_problems, problems_dict, reuse_problems):

                comps = {}

                problem_last_occurrence = None
                problem_first_occurrence = None
                for db_report in problem:
                    db_report.problem = db_problem

                    if (problem_last_occurrence is None or
                            problem_last_occurrence < db_report.last_occurrence):
                        problem_last_occurrence = db_report.last_occurrence

                    if (problem_first_occurrence is None or
                            problem_first_occurrence > db_report.first_occurrence):
                        problem_first_occurrence = db_report.first_occurrence

                    if db_report.component not in comps:
                        comps[db_report.component] = 0

                    comps[db_report.component] += 1

                # In case nothing changed, we don't want to mark db_problem
                # dirty which would cause another UPDATE
                if db_problem.first_occurrence != problem_first_occurrence:
                    db_problem.first_occurrence = problem_first_occurrence
                if db_problem.last_occurrence != problem_last_occurrence:
                    db_problem.last_occurrence = problem_last_occurrence

                if reports_changed:
                    self.update_comps(db, comps, db_problem)

            self.log_debug("Removing {0} invalid reports from problems"
                           .format(len(invalid_report_ids_to_clean)))
            for report_id in invalid_report_ids_to_clean:
                db_report = get_report_by_id(db, report_id)
                if db_report is not None:
                    db_report.problem_id = None
                    db.session.add(db_report)

            if report_min_count > 0:
                self.log_debug("Removing problems from low count reports")
                remove_problem_from_low_count_reports_by_type(db,
                                                              problemplugin.name,
                                                              min_count=report_min_count)

            self.log_debug("Flushing session")
            db.session.flush()
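
The speedup branch above hands a component-to-count mapping to self.update_comps, whose body is not included in this listing. Judging from the inline component handling in the later examples (sorting components by report count and storing an ordered ProblemComponent row for each), a helper along these lines would do the job; the exact implementation in faf may differ.

    def update_comps(self, db, comps, db_problem):
        # Most frequently reported components come first.
        ordered = sorted(comps, key=lambda comp: comps[comp], reverse=True)

        for order, db_component in enumerate(ordered, start=1):
            db_pcomp = get_problem_component(db, db_problem, db_component)
            if db_pcomp is None:
                # Record the component's rank within the problem.
                db_pcomp = ProblemComponent()
                db_pcomp.problem = db_problem
                db_pcomp.component = db_component
                db_pcomp.order = order
                db.session.add(db_pcomp)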
Example No. 3
    def run(self, cmdline, db):
        """
        Mark a problem probably fixed if there is a new build of the problem's
        affected package, for which no crash reports have come in.
        """

        try:
            tasks = self._get_tasks(cmdline, db)
        except FafError as ex:
            self.log_error("Unable to process command line arguments: {0}"
                           .format(str(ex)))
            return 1

        problems = get_problems(db)

        task_i = 0
        for osplugin, db_release in tasks:
            task_i += 1

            self.log_info("[{0} / {1}] Processing '{2} {3}'"
                          .format(task_i, len(tasks), osplugin.nice_name,
                                  db_release.version))

            self.log_debug("Getting builds...")
            opsys_builds = osplugin.get_released_builds(db_release.version)

            newest_builds = {}
            all_builds = {}
            now = datetime.now()
            for build in opsys_builds:
                age = now - build["completion_time"]
                # If a hot new build comes out, we need to wait a certain
                # period of time for people to use it before we can make
                # conclusions about it being a probable fix.
                if age.days >= osplugin.build_aging_days:
                    if build["name"] not in newest_builds:
                        newest_builds[build["name"]] = build

                    if build["name"] not in all_builds:
                        all_builds[build["name"]] = [build, ]
                    else:
                        all_builds[build["name"]].append(build)

            probably_fixed_total = 0
            problems_in_release = 0
            problem_counter = 0
            for problem in problems:
                problem_counter += 1
                self.log_debug("Processing problem ID:{0} {1}/{2}:"
                               .format(problem.id, problem_counter, len(problems)))
                affected_newest = {}
                affected_not_found = False

                reports_for_release = \
                    get_reports_for_opsysrelease(db, problem.id, db_release.id)

                # For all the reports, we need the affected packages and their
                # newest versions.
                if reports_for_release:
                    problems_in_release += 1
                else:
                    self.log_debug(" This problem doesn't appear in this release.")
                    self._save_probable_fix(db, problem, db_release, None)
                    # Next problem
                    continue

                for report in reports_for_release:
                    # First we try to find the affected package among the known
                    # packages.
                    affected_known = [
                        (affected.build.base_package_name,
                         affected.build.epoch,
                         affected.build.version,
                         affected.build.release) for affected in
                        get_crashed_package_for_report(db, report.id)]

                    # Then among the unknown packages.
                    affected_unknown = \
                        get_crashed_unknown_package_nevr_for_report(db, report.id)
                    # We get the base package name directly from the report
                    affected_unknown = [(report.component.name,
                                         affected[1],
                                         affected[2],
                                         affected[3]) for affected in affected_unknown]

                    affected_all = affected_known + affected_unknown
                    if not affected_all:
                        affected_not_found = True
                        break

                    for affected in affected_all:
                        if affected[0] in affected_newest:
                            # If a problem contains multiple reports with the same
                            # affected package, we only want the newest version of
                            # it.
                            affected_newest[affected[0]]['reports'].append(report)
                            if cmp_evr(affected[1:],
                                       affected_newest[affected[0]]['nevr'][1:]) > 0:
                                affected_newest[affected[0]]['nevr'] = affected
                        else:
                            affected_newest[affected[0]] = {
                                'reports': [report, ],
                                'nevr': affected
                            }

                if affected_not_found or not affected_newest:
                    # Affected package of one of the reports was not found.
                    # We can't make any conclusions.
                    self.log_debug(" Affected package not found.")
                    self._save_probable_fix(db, problem, db_release, None)
                    # Next problem
                    continue

                if len(affected_newest) > 1:
                    # Multiple different affected packages => cannot be fixed
                    # by a single package update
                    self.log_debug(" Multiple affected packages. No simple fix.")
                    self._save_probable_fix(db, problem, db_release, None)
                    # Next problem
                    continue

                probably_fixed_since = datetime.fromtimestamp(0)

                pkg = list(affected_newest.values())[0]

                name = pkg['nevr'][0]
                newest_build = newest_builds.get(name, False)
                if newest_build:
                    newest_evr = (newest_build["epoch"] or 0,
                                  newest_build["version"],
                                  newest_build["release"])
                if newest_build and cmp_evr(newest_evr, pkg['nevr'][1:]) > 0:
                    # Newest available build is newer than the newest version
                    # of the affected package. Now find the oldest such
                    # probable fix.
                    i = 0
                    while i < len(all_builds[name]) and cmp_evr(
                            (all_builds[name][i]["epoch"] or 0,
                             all_builds[name][i]["version"],
                             all_builds[name][i]["release"]), pkg['nevr'][1:]) > 0:
                        i += 1
                    completion_time = all_builds[name][i-1]["completion_time"]
                    probably_fixed_since = max(completion_time,
                                               probably_fixed_since)
                    pkg["probable_fix"] = (name,
                                           all_builds[name][i-1]["epoch"] or 0,
                                           all_builds[name][i-1]["version"],
                                           all_builds[name][i-1]["release"])

                    self._save_probable_fix(db, problem, db_release,
                                            pkg["probable_fix"],
                                            probably_fixed_since)
                    self.log_debug("  Probably fixed for {0} days.".format(
                        (datetime.now() - probably_fixed_since).days))
                    probably_fixed_total += 1
                else:
                    self._save_probable_fix(db, problem, db_release, None)
                    self.log_debug("  Not fixed.")

            db.session.flush()
            if problems_in_release > 0:
                self.log_info("{0}% of problems in this release probably fixed.".format(
                    (probably_fixed_total * 100) // problems_in_release))
            else:
                self.log_info("No problems found in this release.")
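
The while loop above walks all_builds[name] from the newest build towards the oldest, stopping at the first build that is no longer newer than the crashing package, and then takes the previous entry as the oldest probable fix. The toy walk-through below illustrates the indexing with plain integers in place of (epoch, version, release) tuples; it assumes the build list is ordered newest first, which the loop relies on.

# Toy illustration of the "oldest probable fix" walk, newest build first.
builds = [7, 6, 5, 3, 2]      # available build versions, newest first
affected = 3                  # newest version seen in the crash reports

i = 0
while i < len(builds) and builds[i] > affected:
    i += 1

# builds[i - 1] is the oldest build that is still newer than the affected
# version -- the earliest point from which the problem is probably fixed.
oldest_probable_fix = builds[i - 1]
assert oldest_probable_fix == 5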
Example No. 4
    def _create_problems(
            self,
            db,
            problemplugin,  #pylint: disable=too-many-statements
            report_min_count=0,
            speedup=False):
        if speedup:
            self.log_debug("[%s] Getting reports for problems",
                           problemplugin.name)
            db_reports = get_reports_for_problems(db, problemplugin.name)

            self.log_debug("[%s] Getting unassigned reports",
                           problemplugin.name)
            db_reports += get_unassigned_reports(db,
                                                 problemplugin.name,
                                                 min_count=report_min_count)
        else:
            db_reports = get_reports_by_type(db,
                                             problemplugin.name,
                                             min_count=report_min_count)
        db_problems = get_problems(db)

        # dict to get db_problem by problem_id
        self.log_debug("Creating problem reuse dict")
        problems_dict = {}
        for db_problem in db_problems:
            problems_dict[db_problem.id] = db_problem
        # dict to get report_ids by problem_id
        problem_report = defaultdict(list)
        for db_report in db_reports:
            if db_report.problem_id is not None:
                problem_report[db_report.problem_id].append(db_report.id)
        # create lookup dict for problems
        reuse_problems = {}
        for (problem_id, report_ids) in problem_report.items():
            reuse_problems[tuple(sorted(report_ids))] = problem_id

        invalid_report_ids_to_clean = []
        problems = []
        if not db_reports:
            self.log_info("No reports found")
        elif len(db_reports) == 1:
            db_report = db_reports[0]
            if db_report.problem is None:
                problems.append([db_report])
        else:
            report_map = {}
            _satyr_reports = []
            db_reports_len = len(db_reports)
            n_processed = 1

            # split the work to multiple workers
            with ThreadPoolExecutor(self._max_workers) as executor:
                # schedule db_reports for processing
                futures = {
                    executor.submit(problemplugin.db_report_to_satyr, report):
                    report
                    for report in db_reports
                }

                for future in as_completed(futures):
                    db_report = futures.pop(future)
                    self.log_debug("[%d / %d] Loading report #%d", n_processed,
                                   db_reports_len, db_report.id)

                    _satyr_report = future.result()
                    if _satyr_report is None:
                        self.log_debug("Unable to create satyr report")
                        if db_report.problem_id is not None:
                            invalid_report_ids_to_clean.append(db_report.id)
                    else:
                        _satyr_reports.append(_satyr_report)
                        report_map[_satyr_report] = db_report

                    n_processed += 1

                db.session.expire_all()

            self.log_debug("Clustering")
            clusters = self._create_clusters(_satyr_reports, 2000)
            # Threads that share no function with another thread
            unique_func_threads = set(_satyr_reports) - set().union(*clusters)

            dendrograms = []
            clusters_len = len(clusters)
            for i, cluster in enumerate(clusters, start=1):
                self.log_debug("[%d / %d] Computing distances", i,
                               clusters_len)
                distances = satyr.Distances(cluster, len(cluster))

                self.log_debug("Getting dendrogram")
                dendrograms.append(satyr.Dendrogram(distances))

            dendrogram_cut = 0.3
            if speedup:
                dendrogram_cut = dendrogram_cut * 1.1

            for dendrogram, cluster in zip(dendrograms, clusters):
                problem = []
                for dups in dendrogram.cut(dendrogram_cut, 1):
                    reports = set(report_map[cluster[dup]] for dup in dups)
                    problem.append(reports)

                problems.extend(problem)

            # Unique threads form their own unique problems
            for thread in unique_func_threads:
                problems.append({report_map[thread]})

        self.log_info("Creating problems from clusters")
        if speedup:
            for problem in problems:
                if not problem:
                    continue
                first_report = next(iter(problem))
                if len(problem) > 1:
                    # Find assigned report
                    origin_report = None
                    for db_report in problem:
                        if db_report.problem_id:
                            origin_report = db_report

                    # Problem created only from new reports
                    comps = {}
                    if not origin_report:
                        new = Problem()
                        db.session.add(new)
                        db.session.flush()
                        first_occurrence = first_report.first_occurrence
                        last_occurrence = first_report.last_occurrence
                        for rep in problem:
                            rep.problem_id = new.id

                            if first_occurrence > rep.first_occurrence:
                                first_occurrence = rep.first_occurrence
                            if last_occurrence < rep.last_occurrence:
                                last_occurrence = rep.last_occurrence

                            if rep.component not in comps:
                                comps[rep.component] = 0

                            comps[rep.component] += 1
                        self.update_comps(db, comps, new)
                        new.last_occurrence = last_occurrence
                        new.first_occurrence = first_occurrence

                    else:
                        first_occurrence = origin_report.first_occurrence
                        last_occurrence = origin_report.last_occurrence
                        for rep in problem:
                            if not rep.problem_id:
                                rep.problem_id = origin_report.problem_id

                                if first_occurrence > rep.first_occurrence:
                                    first_occurrence = rep.first_occurrence
                                if last_occurrence < rep.last_occurrence:
                                    last_occurrence = rep.last_occurrence

                                if rep.component not in comps:
                                    comps[rep.component] = 0

                                comps[rep.component] += 1
                        orig_p = get_problem_by_id(db,
                                                   origin_report.problem_id)
                        self.update_comps(db, comps, orig_p)
                        orig_p.last_occurrence = last_occurrence
                        orig_p.first_occurrence = first_occurrence
                else:
                    # The report is assigned
                    if first_report.problem_id:
                        continue
                    # One report that wasn't matched with anything else
                    new = Problem()
                    new.first_occurrence = first_report.first_occurrence
                    new.last_occurrence = first_report.last_occurrence
                    db.session.add(new)
                    db.session.flush()

                    self.update_comps(db, {first_report.component: 1}, new)
                    first_report.problem_id = new.id
            db.session.flush()

        else:
            for problem, db_problem, reports_changed in self._iter_problems(
                    db, problems, db_problems, problems_dict, reuse_problems):

                comps = {}

                problem_last_occurrence = None
                problem_first_occurrence = None
                for db_report in problem:
                    db_report.problem = db_problem

                    if (problem_last_occurrence is None
                            or problem_last_occurrence <
                            db_report.last_occurrence):
                        problem_last_occurrence = db_report.last_occurrence

                    if (problem_first_occurrence is None
                            or problem_first_occurrence >
                            db_report.first_occurrence):
                        problem_first_occurrence = db_report.first_occurrence

                    if db_report.component not in comps:
                        comps[db_report.component] = 0

                    comps[db_report.component] += 1

                # In case nothing changed, we don't want to mark db_problem
                # dirty which would cause another UPDATE
                if db_problem.first_occurrence != problem_first_occurrence:
                    db_problem.first_occurrence = problem_first_occurrence
                if db_problem.last_occurrence != problem_last_occurrence:
                    db_problem.last_occurrence = problem_last_occurrence

                if reports_changed:
                    self.update_comps(db, comps, db_problem)

            self.log_debug("Removing %d invalid reports from problems",
                           len(invalid_report_ids_to_clean))
            unassign_reports(db, invalid_report_ids_to_clean)

            if report_min_count > 0:
                self.log_debug("Removing problems from low count reports")
                remove_problem_from_low_count_reports_by_type(
                    db, problemplugin.name, min_count=report_min_count)

            self.log_debug("Flushing session")
            db.session.flush()
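
Unlike the earlier sequential variants, this version converts reports to satyr objects concurrently: one future per report, a dictionary mapping each future back to its report, and as_completed to handle results as they arrive. The pattern in isolation looks roughly like the sketch below; convert() and items are placeholders standing in for problemplugin.db_report_to_satyr and the db_reports list.

from concurrent.futures import ThreadPoolExecutor, as_completed

def convert(item):
    # Placeholder for the per-report conversion; may return None on failure.
    return item * 2 if item % 3 else None

items = list(range(10))
converted = []
failed = []

with ThreadPoolExecutor(max_workers=4) as executor:
    # One future per item, with a map back to the originating item.
    futures = {executor.submit(convert, item): item for item in items}

    for future in as_completed(futures):
        item = futures.pop(future)      # drop the reference, as above
        result = future.result()
        if result is None:
            failed.append(item)
        else:
            converted.append(result)

print(sorted(converted), sorted(failed))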
Example No. 5
    def _create_problems(self, db, problemplugin):
        db_reports = get_reports_by_type(db, problemplugin.name)
        db_problems = get_problems(db)

        # dict to get db_problem by problem_id
        self.log_debug("Creating problem reuse dict")
        problems_dict = {}
        for db_problem in db_problems:
            problems_dict[db_problem.id] = db_problem
        # dict to get report_ids by problem_id
        problem_report = defaultdict(list)
        for db_report in db_reports:
            if db_report.problem_id is not None:
                problem_report[db_report.problem_id].append(db_report.id)
        # create lookup dict for problems
        reuse_problems = {}
        for (problem_id, report_ids) in problem_report.items():
            reuse_problems[tuple(sorted(report_ids))] = problem_id

        problems = []
        if len(db_reports) < 1:
            self.log_info("No reports found")
        elif len(db_reports) == 1:
            db_report = db_reports[0]
            if db_report.problem is None:
                problems.append([db_report])
        else:
            report_map = {}
            _satyr_reports = []
            i = 0
            for db_report in db_reports:
                i += 1
                self.log_debug("[{0} / {1}] Loading report #{2}"
                               .format(i, len(db_reports), db_report.id))

                _satyr_report = problemplugin._db_report_to_satyr(db_report)
                if _satyr_report is None:
                    self.log_debug("Unable to create satyr report")
                else:
                    _satyr_reports.append(_satyr_report)
                    report_map[_satyr_report] = db_report

                db.session.expire(db_report)

            self.log_debug("Clustering")
            clusters = self._create_clusters(_satyr_reports, 2000)
            unique_func_threads = set(_satyr_reports) - set().union(*clusters)

            dendrograms = []
            i = 0
            for cluster in clusters:
                i += 1
                self.log_debug("[{0} / {1}] Computing distances"
                               .format(i, len(clusters)))
                distances = satyr.Distances(cluster, len(cluster))

                self.log_debug("Getting dendrogram")
                dendrograms.append(satyr.Dendrogram(distances))

            for dendrogram, cluster in zip(dendrograms, clusters):
                problem = []
                for dups in dendrogram.cut(0.3, 1):
                    reports = set(report_map[cluster[dup]] for dup in dups)
                    problem.append(reports)

                problems.extend(problem)

            for thread in unique_func_threads:
                problems.append(set([report_map[thread]]))

        self.log_info("Creating problems")
        i = 0
        lookedup_count = 0
        found_count = 0
        created_count = 0
        for problem in problems:
            i += 1

            self.log_debug("[{0} / {1}] Creating problem"
                           .format(i, len(problems)))
            comps = {}

            reports_changed = True
            problem_id = reuse_problems.get(
                tuple(sorted([db_report.id for db_report in problem])), None)
            if problem_id is not None:
                db_problem = problems_dict.get(problem_id, None)
                reports_changed = False
                lookedup_count += 1
                self.log_debug("Looked up existing problem #{0}"
                               .format(db_problem.id))
            else:
                db_problem = self._find_problem(db_problems, problem)
                found_count += 1

            if db_problem is None:
                db_problem = Problem()
                db.session.add(db_problem)

                db_problems.append(db_problem)
                created_count += 1

            for db_report in problem:
                db_report.problem = db_problem

                if (db_problem.last_occurrence is None or
                        db_problem.last_occurrence < db_report.last_occurrence):
                    db_problem.last_occurrence = db_report.last_occurrence

                if (db_problem.first_occurrence is None or
                        db_problem.first_occurrence > db_report.first_occurrence):
                    db_problem.first_occurrence = db_report.first_occurrence

                if db_report.component not in comps:
                    comps[db_report.component] = 0

                comps[db_report.component] += 1

            if reports_changed:
                db_comps = sorted(comps, key=lambda x: comps[x], reverse=True)

                order = 0
                for db_component in db_comps:
                    order += 1

                    db_pcomp = get_problem_component(db, db_problem, db_component)
                    if db_pcomp is None:
                        db_pcomp = ProblemComponent()
                        db_pcomp.problem = db_problem
                        db_pcomp.component = db_component
                        db_pcomp.order = order
                        db.session.add(db_pcomp)

        self.log_debug("Total: {0}  Looked up: {1}  Found: {2}  Created: {3}"
                       .format(i, lookedup_count, found_count, created_count))
        self.log_debug("Flushing session")
        db.session.flush()
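
The reuse lookup built at the top of this example keys each existing problem by the sorted tuple of its report IDs, so a freshly clustered group of reports maps back to the same problem regardless of the order in which the cluster yields them. A tiny demonstration of why the key must be sorted:

# Clusters are sets, so report IDs come back in no particular order, but the
# same set of IDs must always hit the same existing problem.
reuse_problems = {(11, 42, 97): 7}          # sorted report IDs -> problem ID

cluster_ids = [97, 11, 42]                  # IDs as produced by clustering
assert reuse_problems.get(tuple(sorted(cluster_ids))) == 7

# An unsorted key would silently miss the existing problem:
assert reuse_problems.get(tuple(cluster_ids)) is None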
Example No. 6
    def _create_problems(self, db, problemplugin):
        db_reports = get_reports_by_type(db, problemplugin.name)
        db_problems = get_problems(db)

        # dict to get db_problem by problem_id
        self.log_debug("Creating problem reuse dict")
        problems_dict = {}
        for db_problem in db_problems:
            problems_dict[db_problem.id] = db_problem
        # dict to get report_ids by problem_id
        problem_report = defaultdict(list)
        for db_report in db_reports:
            if db_report.problem_id is not None:
                problem_report[db_report.problem_id].append(db_report.id)
        # create lookup dict for problems
        reuse_problems = {}
        for (problem_id, report_ids) in problem_report.items():
            reuse_problems[tuple(sorted(report_ids))] = problem_id

        invalid_report_ids_to_clean = []
        problems = []
        if len(db_reports) < 1:
            self.log_info("No reports found")
        elif len(db_reports) == 1:
            db_report = db_reports[0]
            if db_report.problem is None:
                problems.append([db_report])
        else:
            report_map = {}
            _satyr_reports = []
            i = 0
            for db_report in db_reports:
                i += 1
                self.log_debug("[{0} / {1}] Loading report #{2}"
                               .format(i, len(db_reports), db_report.id))

                _satyr_report = problemplugin._db_report_to_satyr(db_report)
                if _satyr_report is None:
                    self.log_debug("Unable to create satyr report")
                    if db_report.problem_id is not None:
                        invalid_report_ids_to_clean.append(db_report.id)
                else:
                    _satyr_reports.append(_satyr_report)
                    report_map[_satyr_report] = db_report

                db.session.expire(db_report)

            self.log_debug("Clustering")
            clusters = self._create_clusters(_satyr_reports, 2000)
            unique_func_threads = set(_satyr_reports) - set().union(*clusters)

            dendrograms = []
            i = 0
            for cluster in clusters:
                i += 1
                self.log_debug("[{0} / {1}] Computing distances"
                               .format(i, len(clusters)))
                distances = satyr.Distances(cluster, len(cluster))

                self.log_debug("Getting dendrogram")
                dendrograms.append(satyr.Dendrogram(distances))

            for dendrogram, cluster in zip(dendrograms, clusters):
                problem = []
                for dups in dendrogram.cut(0.3, 1):
                    reports = set(report_map[cluster[dup]] for dup in dups)
                    problem.append(reports)

                problems.extend(problem)

            for thread in unique_func_threads:
                problems.append(set([report_map[thread]]))

        self.log_info("Creating problems from clusters")
        for problem, db_problem, reports_changed in self._iter_problems(
                db, problems, db_problems, problems_dict, reuse_problems):

            comps = {}

            problem_last_occurrence = None
            problem_first_occurrence = None
            for db_report in problem:
                db_report.problem = db_problem

                if (problem_last_occurrence is None or
                        problem_last_occurrence < db_report.last_occurrence):
                    problem_last_occurrence = db_report.last_occurrence

                if (problem_first_occurrence is None or
                        problem_first_occurrence > db_report.first_occurrence):
                    problem_first_occurrence = db_report.first_occurrence

                if db_report.component not in comps:
                    comps[db_report.component] = 0

                comps[db_report.component] += 1

            # In case nothing changed, we don't want to mark db_problem dirty
            # which would cause another UPDATE
            if db_problem.first_occurrence != problem_first_occurrence:
                db_problem.first_occurrence = problem_first_occurrence
            if db_problem.last_occurrence != problem_last_occurrence:
                db_problem.last_occurrence = problem_last_occurrence

            if reports_changed:
                db_comps = sorted(comps, key=lambda x: comps[x], reverse=True)

                order = 0
                for db_component in db_comps:
                    order += 1

                    db_pcomp = get_problem_component(db, db_problem, db_component)
                    if db_pcomp is None:
                        db_pcomp = ProblemComponent()
                        db_pcomp.problem = db_problem
                        db_pcomp.component = db_component
                        db_pcomp.order = order
                        db.session.add(db_pcomp)

        self.log_debug("Removing {0} invalid reports from problems"
                       .format(len(invalid_report_ids_to_clean)))
        for report_id in invalid_report_ids_to_clean:
            db_report = get_report_by_id(db, report_id)
            if db_report is not None:
                db_report.problem_id = None
                db.session.add(db_report)

        self.log_debug("Flushing session")
        db.session.flush()
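
This final variant delegates the lookup-or-create logic to self._iter_problems, which is not shown in this listing. A hedged reconstruction based on the inline logic of the previous example (reuse lookup first, then _find_problem, then a brand new Problem row) might look like this; the real generator in faf may differ in its details.

    def _iter_problems(self, db, problems, db_problems, problems_dict,
                       reuse_problems):
        lookedup_count = found_count = created_count = 0

        for problem in problems:
            reports_changed = True
            key = tuple(sorted(db_report.id for db_report in problem))
            problem_id = reuse_problems.get(key)

            if problem_id is not None:
                # Exactly this set of reports already forms a problem.
                db_problem = problems_dict.get(problem_id)
                reports_changed = False
                lookedup_count += 1
            else:
                # Fall back to similarity matching against known problems.
                db_problem = self._find_problem(db_problems, problem)
                found_count += 1

            if db_problem is None:
                db_problem = Problem()
                db.session.add(db_problem)
                db_problems.append(db_problem)
                created_count += 1

            yield problem, db_problem, reports_changed

        self.log_debug("Total: {0}  Looked up: {1}  Found: {2}  Created: {3}"
                       .format(len(problems), lookedup_count, found_count,
                               created_count))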