Exemplo n.º 1
0
    def __init__(self, model_name, repo_dir, git_repo_dir, method_defect_predictor_dir):
        """Load the requested model and prepare the data and repositories it needs.

        Args:
            model_name: Name of the model to download and load. The values
                "regressor" and "testlabelselect" trigger additional setup.
            repo_dir: Repository directory, stored on the instance as-is.
            git_repo_dir: Target directory for the gecko-dev clone; nothing is
                cloned when this is falsy.
            method_defect_predictor_dir: Target directory for the
                MethodDefectPredictor clone; nothing is cloned when this is
                falsy.

        Raises:
            AssertionError: If a model fails to load, an expected data file is
                missing after the download step, or the support file download
                reports failure.
        """
        self.model_name = model_name
        self.repo_dir = repo_dir

        self.model = download_and_load_model(model_name)
        assert self.model is not None

        self.git_repo_dir = git_repo_dir
        if git_repo_dir:
            self.clone_git_repo("https://github.com/mozilla/gecko-dev", git_repo_dir)

        self.method_defect_predictor_dir = method_defect_predictor_dir
        if method_defect_predictor_dir:
            # NOTE(review): the third argument looks like a pinned revision —
            # confirm against clone_git_repo.
            self.clone_git_repo(
                "https://github.com/lucapascarella/MethodDefectPredictor",
                method_defect_predictor_dir,
                "8cc47f47ffb686a29324435a0151b5fabd37f865",
            )

        if model_name == "regressor":
            self.use_test_history = False

            # Only decompress when download_check_etag reports a change; the
            # previously decompressed file is reused otherwise.
            model_data_X_path = f"{model_name}model_data_X"
            updated = download_check_etag(
                URL.format(model_name=model_name, file_name=f"{model_data_X_path}.zst")
            )
            if updated:
                zstd_decompress(model_data_X_path)
            assert os.path.exists(model_data_X_path), "Decompressed X dataset exists"

            model_data_y_path = f"{model_name}model_data_y"
            updated = download_check_etag(
                URL.format(model_name=model_name, file_name=f"{model_data_y_path}.zst")
            )
            if updated:
                zstd_decompress(model_data_y_path)
            assert os.path.exists(model_data_y_path), "Decompressed y dataset exists"

            self.X = to_array(joblib.load(model_data_X_path))
            self.y = to_array(joblib.load(model_data_y_path))

            past_bugs_by_function_path = "data/past_bugs_by_function.pickle"
            download_check_etag(
                PAST_BUGS_BY_FUNCTION_URL, path=f"{past_bugs_by_function_path}.zst"
            )
            zstd_decompress(past_bugs_by_function_path)
            assert os.path.exists(past_bugs_by_function_path)
            # NOTE(security): unpickling runs arbitrary code from the file, so
            # PAST_BUGS_BY_FUNCTION_URL must point to a trusted source.
            with open(past_bugs_by_function_path, "rb") as f:
                self.past_bugs_by_function = pickle.load(f)

        if model_name == "testlabelselect":
            self.use_test_history = True
            # Keep the download outside the assert statement: asserts are
            # stripped under `python -O`, which would silently skip the download.
            downloaded = db.download_support_file(
                test_scheduling.TEST_LABEL_SCHEDULING_DB,
                test_scheduling.PAST_FAILURES_LABEL_DB,
            )
            assert downloaded, "Support file download failed"
            self.past_failures_data = test_scheduling.get_past_failures("label")

            self.testfailure_model = download_and_load_model("testfailure")
            assert self.testfailure_model is not None
Exemplo n.º 2
0
    def __init__(self, model_name, cache_root, git_repo_dir,
                 method_defect_predictor_dir):
        """Load the requested model and prepare the data and repositories it needs.

        Args:
            model_name: Name of the model to load. The values "regressor" and
                "testselect" trigger additional setup.
            cache_root: Existing directory; ``self.repo_dir`` is set to its
                "mozilla-central" subdirectory.
            git_repo_dir: Target directory for the gecko-dev clone; nothing is
                cloned when this is falsy.
            method_defect_predictor_dir: Target directory for the
                MethodDefectPredictor clone; nothing is cloned when this is
                falsy.

        Raises:
            AssertionError: If ``cache_root`` is not a directory, a model fails
                to load, a dataset is missing after the download step, or the
                support file download reports failure.
        """
        self.model_name = model_name
        self.cache_root = cache_root

        assert os.path.isdir(
            cache_root), f"Cache root {cache_root} is not a dir."
        self.repo_dir = os.path.join(cache_root, "mozilla-central")

        self.model = self.load_model(model_name)
        assert self.model is not None

        self.git_repo_dir = git_repo_dir
        if git_repo_dir:
            self.clone_git_repo("https://github.com/mozilla/gecko-dev",
                                git_repo_dir)

        self.method_defect_predictor_dir = method_defect_predictor_dir
        if method_defect_predictor_dir:
            # NOTE(review): the third argument looks like a pinned revision —
            # confirm against clone_git_repo.
            self.clone_git_repo(
                "https://github.com/lucapascarella/MethodDefectPredictor",
                method_defect_predictor_dir,
                "fa5269b959d8ddf7e97d1e92523bb64c17f9bbcd",
            )

        if model_name == "regressor":
            self.use_test_history = False

            # The datasets are only fetched when no local copy exists yet.
            model_data_X_path = f"{model_name}model_data_X"
            if not os.path.exists(model_data_X_path):
                download_check_etag(
                    URL.format(model_name=model_name,
                               file_name=f"{model_data_X_path}.zst"))
                zstd_decompress(model_data_X_path)
                assert os.path.exists(
                    model_data_X_path), "Decompressed X dataset exists"

            model_data_y_path = f"{model_name}model_data_y"
            if not os.path.exists(model_data_y_path):
                download_check_etag(
                    URL.format(model_name=model_name,
                               file_name=f"{model_data_y_path}.zst"))
                zstd_decompress(model_data_y_path)
                assert os.path.exists(
                    model_data_y_path), "Decompressed y dataset exists"

            self.X = to_array(joblib.load(model_data_X_path))
            self.y = to_array(joblib.load(model_data_y_path))

        if model_name == "testselect":
            self.use_test_history = True
            # Keep the download outside the assert statement: asserts are
            # stripped under `python -O`, which would silently skip the download.
            downloaded = db.download_support_file(
                test_scheduling.TEST_SCHEDULING_DB,
                test_scheduling.PAST_FAILURES_DB)
            assert downloaded, "Support file download failed"
            self.past_failures_data = test_scheduling.get_past_failures()

            self.backout_model = self.load_model("backout")
            assert self.backout_model is not None
Exemplo n.º 3
0
    def __init__(self, cache_root, git_repo_dir, method_defect_predictor_dir):
        """Fetch the regressor model and datasets, load them and clone the repos.

        Args:
            cache_root: Existing directory; ``self.repo_dir`` is set to its
                "mozilla-central" subdirectory.
            git_repo_dir: Target directory for the gecko-dev clone.
            method_defect_predictor_dir: Target directory for the
                MethodDefectPredictor clone.

        Raises:
            AssertionError: If ``cache_root`` is not a directory or an artifact
                is still missing after the download and decompress step.
        """
        self.cache_root = cache_root

        assert os.path.isdir(
            cache_root), f"Cache root {cache_root} is not a dir."
        self.repo_dir = os.path.join(cache_root, "mozilla-central")

        model_path = "regressormodel"
        data_X_path = "regressormodel_data_X"
        data_y_path = "regressormodel_data_y"

        # Each artifact is paired with the assertion message used when it is
        # still absent after the download and decompress step.
        artifacts = (
            (model_path, "Decompressed model exists"),
            (data_X_path, "Decompressed X dataset exists"),
            (data_y_path, "Decompressed y dataset exists"),
        )
        for artifact_path, missing_message in artifacts:
            # A local copy is reused as-is; only missing artifacts are fetched.
            if os.path.exists(artifact_path):
                continue

            compressed_path = f"{artifact_path}.zst"
            download_check_etag(URL.format(compressed_path), compressed_path)
            zstd_decompress(artifact_path)
            assert os.path.exists(artifact_path), missing_message

        self.model = RegressorModel.load(model_path)
        self.X = to_array(joblib.load(data_X_path))
        self.y = to_array(joblib.load(data_y_path))

        # Both repositories are cloned unconditionally.
        self.method_defect_predictor_dir = method_defect_predictor_dir
        self.clone_git_repo(
            "https://github.com/lucapascarella/MethodDefectPredictor",
            method_defect_predictor_dir,
            "fa5269b959d8ddf7e97d1e92523bb64c17f9bbcd",
        )

        self.git_repo_dir = git_repo_dir
        self.clone_git_repo("https://github.com/mozilla/gecko-dev",
                            git_repo_dir)
Exemplo n.º 4
0
    def print_feature_importances(self,
                                  important_features,
                                  class_probabilities=None):
        """Print the top features and their importance values as grid tables.

        Args:
            important_features: Mapping read through its "classes" key, plus
                "values" when classifying or "average" when training.
            class_probabilities: Class probabilities of a single prediction.
                When given, only the predicted class is reported; when None,
                one row per class is printed.
        """
        feature_names = self.get_human_readable_feature_names()
        # extract importance values from the top features for the predicted class
        # when classifying
        if class_probabilities is not None:
            # shap_values are stored in class 1 for binary classification
            if len(class_probabilities[0]) != 2:
                predicted_class_index = class_probabilities.argmax(axis=-1)[0]
            else:
                predicted_class_index = 0

            predicted_class = self.class_names[predicted_class_index]
            imp_values = important_features["classes"][predicted_class][0]
            shap_val = []
            top_feature_names = []
            for importance, index, is_positive in imp_values:
                # Prefix the importance with its sign.
                sign = "+" if is_positive else "-"
                shap_val.append(sign + str(importance))

                feature_value = np.squeeze(
                    to_array(important_features["values"])[:, int(index)])
                top_feature_names.append(
                    f"{feature_names[int(index)]} = {feature_value.round(decimals=5)}"
                )
            shap_val = [[predicted_class] + shap_val]

        # extract importance values from the top features for all the classes
        # when training
        else:
            top_feature_names = [
                feature_names[int(index)]
                for _, index, _ in important_features["average"]
            ]
            shap_val = [[class_name] + imp_values[1] for class_name, imp_values
                        in important_features["classes"].items()]

        print("Top {} features:".format(len(top_feature_names)))
        if not top_feature_names:
            # Nothing to tabulate.
            return

        # allow maximum of 5 columns in a row to fit the page better
        # Every row starts with the class name, so there is one more column
        # than there are features. Chunking over the full header list keeps the
        # last feature from being dropped when the feature count is a multiple
        # of 5.
        headers = ["classes"] + top_feature_names
        for i in range(0, len(headers), 5):
            table = [item[i:i + 5] for item in shap_val]
            print(
                tabulate(
                    table,
                    headers=headers[i:i + 5],
                    tablefmt="grid",
                ),
                end="\n\n",
            )
Exemplo n.º 5
0
    def classify(
        self,
        items,
        probabilities=False,
        importances=False,
        importance_cutoff=0.15,
        background_dataset=None,
    ):
        """Classify one or more items, optionally explaining the prediction.

        Args:
            items: A single item or a list of items; the first one must be a
                dict or a tuple.
            probabilities: When true, use ``predict_proba`` instead of
                ``predict``.
            importances: When true, also compute SHAP based feature importances.
            importance_cutoff: Forwarded to ``get_important_features``.
            background_dataset: Optional callable receiving the predicted class;
                its result is handed to the SHAP explainer as background data.

        Returns:
            The predictions, or a ``(predictions, details)`` tuple when
            ``importances`` is true, where ``details`` holds the "importances"
            and "feature_legend" entries.
        """
        assert items is not None
        assert (
            self.extraction_pipeline is not None and self.clf is not None
        ), "The module needs to be initialized first"

        if not isinstance(items, list):
            items = [items]

        assert isinstance(items[0], (dict, tuple))

        X = self.extraction_pipeline.transform(lambda: items)
        predict = self.clf.predict_proba if probabilities else self.clf.predict
        classes = self.overwrite_classes(items, predict(X), probabilities)

        if not importances:
            return classes

        winner_index = classes.argmax(axis=-1)[0]
        winner = self.le.inverse_transform([winner_index])[0]

        if background_dataset is not None:
            explainer = shap.TreeExplainer(
                self.clf,
                to_array(background_dataset(winner)),
                feature_perturbation="interventional",
            )
        else:
            explainer = shap.TreeExplainer(self.clf)

        shap_values = explainer.shap_values(to_array(X))

        # In the binary case, sometimes shap returns a single shap values matrix.
        single_matrix = not isinstance(shap_values, list)
        if len(classes[0]) == 2 and single_matrix:
            shap_values = [-shap_values, shap_values]

        important_features = self.get_important_features(
            importance_cutoff, shap_values)
        important_features["values"] = X

        feature_names = self.get_human_readable_feature_names()

        # Legend keys are 1-based positions among the predicted class' top
        # features.
        feature_legend = {}
        winner_features = important_features["classes"][winner][0]
        for position, (_, index, _) in enumerate(winner_features, start=1):
            feature_legend[str(position)] = feature_names[int(index)]

        details = {
            "importances": important_features,
            "feature_legend": feature_legend,
        }
        return classes, details
Exemplo n.º 6
0
    def train(self, importance_cutoff=0.15, limit=None):
        """Train the classifier, print evaluation reports and persist the model.

        The method extracts features, optionally cross-validates, fits the
        classifier, optionally computes SHAP feature importances (saved to
        "feature_importance.png"), prints training and test reports with and
        without confidence thresholds, and finally pickles the model (and the
        dataset when ``self.store_dataset`` is set) to the working directory.

        Args:
            importance_cutoff: Forwarded to ``get_important_features`` when
                feature importances are calculated.
            limit: When truthy, only the first ``limit`` samples are used.

        Returns:
            dict: Tracking metrics. Contains "confusion_matrix" and, depending
            on the configuration, ``test_<scoring>`` cross-validation entries,
            "feature_report" and "report".
        """
        classes, self.class_names = self.get_labels()
        self.class_names = sort_class_names(self.class_names)

        # Get items and labels, filtering out those for which we have no labels.
        X_gen, y = split_tuple_generator(lambda: self.items_gen(classes))

        # Extract features from the items.
        X = self.extraction_pipeline.fit_transform(X_gen)

        # Calculate labels.
        y = np.array(y)

        if limit:
            X = X[:limit]
            y = y[:limit]

        print(f"X: {X.shape}, y: {y.shape}")

        is_multilabel = isinstance(y[0], np.ndarray)
        is_binary = len(self.class_names) == 2

        # Split dataset in training and test.
        X_train, X_test, y_train, y_test = self.train_test_split(X, y)
        if self.sampler is not None:
            pipeline = make_pipeline(self.sampler, self.clf)
        else:
            pipeline = self.clf

        tracking_metrics = {}

        # Use k-fold cross validation to evaluate results.
        if self.cross_validation_enabled:
            scorings = ["accuracy"]
            if is_binary:
                scorings += ["precision", "recall"]

            scores = cross_validate(pipeline,
                                    X_train,
                                    y_train,
                                    scoring=scorings,
                                    cv=5)

            print("Cross Validation scores:")
            for scoring in scorings:
                score = scores[f"test_{scoring}"]
                tracking_metrics[f"test_{scoring}"] = {
                    "mean": score.mean(),
                    "std": score.std() * 2,
                }
                print(
                    f"{scoring.capitalize()}: {score.mean()} (+/- {score.std() * 2})"
                )

        print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")

        # Training on the resampled dataset if sampler is provided.
        if self.sampler is not None:
            X_train, y_train = self.sampler.fit_resample(X_train, y_train)

            print(
                f"resampled X_train: {X_train.shape}, y_train: {y_train.shape}"
            )

        print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")

        self.clf.fit(X_train, y_train)

        print("Model trained")

        feature_names = self.get_human_readable_feature_names()
        if self.calculate_importance and len(feature_names):
            explainer = shap.TreeExplainer(self.clf)
            shap_values = explainer.shap_values(X_train)

            # In the binary case, sometimes shap returns a single shap values matrix.
            if is_binary and not isinstance(shap_values, list):
                shap_values = [-shap_values, shap_values]
                summary_plot_value = shap_values[1]
                summary_plot_type = "layered_violin"
            else:
                summary_plot_value = shap_values
                summary_plot_type = None

            shap.summary_plot(
                summary_plot_value,
                to_array(X_train),
                feature_names=feature_names,
                class_names=self.class_names,
                plot_type=summary_plot_type,
                show=False,
            )

            # Label the axis before saving, otherwise the label never makes it
            # into the written image.
            matplotlib.pyplot.xlabel("Impact on model output")
            matplotlib.pyplot.savefig("feature_importance.png",
                                      bbox_inches="tight")
            matplotlib.pyplot.clf()

            important_features = self.get_important_features(
                importance_cutoff, shap_values)

            self.print_feature_importances(important_features)

            # Save the important features in the metric report too
            feature_report = self.save_feature_importances(
                important_features, feature_names)

            tracking_metrics["feature_report"] = feature_report

        print("Training Set scores:")
        y_pred = self.clf.predict(X_train)
        if not is_multilabel:
            print(
                classification_report_imbalanced(y_train,
                                                 y_pred,
                                                 labels=self.class_names))

        print("Test Set scores:")
        # Evaluate results on the test set.
        y_pred = self.clf.predict(X_test)

        if is_multilabel:
            assert isinstance(
                y_pred[0], np.ndarray), "The predictions should be multilabel"

        print(f"No confidence threshold - {len(y_test)} classified")
        if is_multilabel:
            confusion_matrix = metrics.multilabel_confusion_matrix(
                y_test, y_pred)
        else:
            confusion_matrix = metrics.confusion_matrix(
                y_test, y_pred, labels=self.class_names)

            print(
                classification_report_imbalanced(y_test,
                                                 y_pred,
                                                 labels=self.class_names))
            report = classification_report_imbalanced_values(
                y_test, y_pred, labels=self.class_names)

            tracking_metrics["report"] = report

        print_labeled_confusion_matrix(confusion_matrix,
                                       self.class_names,
                                       is_multilabel=is_multilabel)

        tracking_metrics["confusion_matrix"] = confusion_matrix.tolist()

        confidence_thresholds = [0.6, 0.7, 0.8, 0.9]

        if is_binary:
            confidence_thresholds = [0.1, 0.2, 0.3, 0.4
                                     ] + confidence_thresholds

        # Neither the probabilities nor the label list depend on the threshold,
        # so compute them once instead of once per threshold.
        y_pred_probas = self.clf.predict_proba(X_test)
        confidence_class_names = self.class_names + ["__NOT_CLASSIFIED__"]

        # Evaluate results on the test set for some confidence thresholds.
        for confidence_threshold in confidence_thresholds:
            y_pred_filter = []
            classified_indices = []
            for i in range(0, len(y_test)):
                if not is_binary:
                    argmax = np.argmax(y_pred_probas[i])
                else:
                    argmax = 1 if y_pred_probas[i][
                        1] > confidence_threshold else 0

                if y_pred_probas[i][argmax] < confidence_threshold:
                    if not is_multilabel:
                        y_pred_filter.append("__NOT_CLASSIFIED__")
                    continue

                classified_indices.append(i)
                if is_multilabel:
                    y_pred_filter.append(y_pred[i])
                else:
                    y_pred_filter.append(argmax)

            if not is_multilabel:
                # Turn the class indexes of the classified samples back into
                # their labels.
                y_pred_filter = np.array(y_pred_filter)
                y_pred_filter[classified_indices] = self.le.inverse_transform(
                    np.array(y_pred_filter[classified_indices], dtype=int))

            classified_num = sum(1 for v in y_pred_filter
                                 if v != "__NOT_CLASSIFIED__")

            print(
                f"\nConfidence threshold > {confidence_threshold} - {classified_num} classified"
            )
            if is_multilabel:
                confusion_matrix = metrics.multilabel_confusion_matrix(
                    y_test[classified_indices], np.asarray(y_pred_filter))
            else:
                confusion_matrix = metrics.confusion_matrix(
                    y_test.astype(str),
                    y_pred_filter.astype(str),
                    labels=confidence_class_names,
                )
                print(
                    classification_report_imbalanced(
                        y_test.astype(str),
                        y_pred_filter.astype(str),
                        labels=confidence_class_names,
                    ))
            print_labeled_confusion_matrix(confusion_matrix,
                                           confidence_class_names,
                                           is_multilabel=is_multilabel)

        self.evaluation()

        if self.entire_dataset_training:
            print("Retraining on the entire dataset...")

            if self.sampler is not None:
                X_train, y_train = self.sampler.fit_resample(X, y)
            else:
                X_train = X
                y_train = y

            print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")

            self.clf.fit(X_train, y_train)

        # The model is written to a file named after the lowercased class name.
        with open(self.__class__.__name__.lower(), "wb") as f:
            pickle.dump(self, f, protocol=pickle.HIGHEST_PROTOCOL)

        if self.store_dataset:
            with open(f"{self.__class__.__name__.lower()}_data_X", "wb") as f:
                pickle.dump(X, f, protocol=pickle.HIGHEST_PROTOCOL)

            with open(f"{self.__class__.__name__.lower()}_data_y", "wb") as f:
                pickle.dump(y, f, protocol=pickle.HIGHEST_PROTOCOL)

        return tracking_metrics
Exemplo n.º 7
0
    def __init__(
        self,
        model_name: str,
        repo_dir: str,
        git_repo_dir: str,
        method_defect_predictor_dir: str,
        use_single_process: bool,
        skip_feature_importance: bool,
    ):
        """Load the requested model and prepare the data and repositories it needs.

        Args:
            model_name: Name of the model to download and load. The values
                "regressor" and "testlabelselect" trigger additional setup.
            repo_dir: Repository directory, stored on the instance as-is.
            git_repo_dir: Target directory for the mozilla-central clone;
                nothing is cloned when this is falsy.
            method_defect_predictor_dir: Target directory for the
                MethodDefectPredictor clone; nothing is cloned when this is
                falsy.
            use_single_process: Stored on the instance as-is.
            skip_feature_importance: Stored on the instance as-is.

        Raises:
            AssertionError: If a model fails to load, an expected data file is
                missing after the download step, or the support file download
                reports failure.
        """
        self.model_name = model_name
        self.repo_dir = repo_dir

        self.model = Model.load(download_model(model_name))
        assert self.model is not None

        self.git_repo_dir = git_repo_dir
        if git_repo_dir:
            self.clone_git_repo(
                "hg::https://hg.mozilla.org/mozilla-central", git_repo_dir
            )

        self.method_defect_predictor_dir = method_defect_predictor_dir
        if method_defect_predictor_dir:
            # NOTE(review): the third argument looks like a pinned revision —
            # confirm against clone_git_repo.
            self.clone_git_repo(
                "https://github.com/lucapascarella/MethodDefectPredictor",
                method_defect_predictor_dir,
                "8cc47f47ffb686a29324435a0151b5fabd37f865",
            )

        self.use_single_process = use_single_process
        self.skip_feature_importance = skip_feature_importance

        if model_name == "regressor":
            self.use_test_history = False

            # Only decompress when download_check_etag reports a change; the
            # previously decompressed file is reused otherwise.
            model_data_X_path = f"{model_name}model_data_X"
            updated = download_check_etag(
                URL.format(model_name=model_name, file_name=f"{model_data_X_path}.zst")
            )
            if updated:
                zstd_decompress(model_data_X_path)
            assert os.path.exists(model_data_X_path), "Decompressed X dataset exists"

            model_data_y_path = f"{model_name}model_data_y"
            updated = download_check_etag(
                URL.format(model_name=model_name, file_name=f"{model_data_y_path}.zst")
            )
            if updated:
                zstd_decompress(model_data_y_path)
            assert os.path.exists(model_data_y_path), "Decompressed y dataset exists"

            # NOTE(security): unpickling runs arbitrary code from the file, so
            # the dataset URL must point to a trusted source.
            with open(model_data_X_path, "rb") as fb:
                self.X = to_array(pickle.load(fb))

            with open(model_data_y_path, "rb") as fb:
                self.y = to_array(pickle.load(fb))

            past_bugs_by_function_path = "data/past_fixed_bugs_by_function.json"
            download_check_etag(
                PAST_BUGS_BY_FUNCTION_URL, path=f"{past_bugs_by_function_path}.zst"
            )
            zstd_decompress(past_bugs_by_function_path)
            assert os.path.exists(past_bugs_by_function_path)
            with open(past_bugs_by_function_path, "r") as f:
                self.past_bugs_by_function = json.load(f)

        if model_name == "testlabelselect":
            self.use_test_history = True
            # Keep the download outside the assert statement: asserts are
            # stripped under `python -O`, which would silently skip the download.
            downloaded = db.download_support_file(
                test_scheduling.TEST_LABEL_SCHEDULING_DB,
                test_scheduling.PAST_FAILURES_LABEL_DB,
            )
            assert downloaded, "Support file download failed"
            self.past_failures_data = test_scheduling.get_past_failures("label", True)

            self.testfailure_model = cast(
                TestFailureModel, TestFailureModel.load(download_model("testfailure"))
            )
            assert self.testfailure_model is not None
Exemplo n.º 8
0
    def classify(
        self,
        items,
        probabilities=False,
        importances=False,
        importance_cutoff=0.15,
        background_dataset=None,
    ):
        """Classify one or more items, optionally explaining the prediction.

        Args:
            items: A single item or a list of items; the first one must be a
                dict or a tuple.
            probabilities: When true, use ``predict_proba`` instead of
                ``predict``.
            importances: When true, also compute SHAP based feature importances.
            importance_cutoff: Forwarded to ``get_important_features``.
            background_dataset: Optional background data handed to the SHAP
                explainer.

        Returns:
            The predictions, or a ``(predictions, details)`` tuple when
            ``importances`` is true, where ``details`` holds the "importances"
            and "feature_legend" entries.
        """
        assert items is not None
        assert (
            self.extraction_pipeline is not None and self.clf is not None
        ), "The module needs to be initialized first"

        if not isinstance(items, list):
            items = [items]

        assert isinstance(items[0], (dict, tuple))

        X = self.extraction_pipeline.transform(lambda: items)
        predict = self.clf.predict_proba if probabilities else self.clf.predict
        classes = self.overwrite_classes(items, predict(X), probabilities)

        if not importances:
            return classes

        if background_dataset is not None:
            explainer = shap.TreeExplainer(
                self.clf,
                to_array(background_dataset),
                feature_dependence="independent",
            )
        else:
            explainer = shap.TreeExplainer(self.clf)
        shap_values = explainer.shap_values(to_array(X))

        important_features = self.get_important_features(
            importance_cutoff, shap_values)
        important_features["values"] = X

        # Workaround: handle multi class case for force_plot to work correctly
        pred_class_index = 0
        if len(classes[0]) > 2:
            pred_class_index = classes.argmax(axis=-1)[0]
            explainer.expected_value = explainer.expected_value[
                pred_class_index]
            shap_values = shap_values[pred_class_index]

        pred_class = self.class_names[pred_class_index]
        feature_names = self.get_human_readable_feature_names()

        # Legend keys are 1-based positions among the predicted class' top
        # features.
        feature_legend = {}
        class_features = important_features["classes"][pred_class][0]
        for position, (_, index, _) in enumerate(class_features, start=1):
            feature_legend[str(position)] = feature_names[int(index)]

        details = {
            "importances": important_features,
            "feature_legend": feature_legend,
        }
        return classes, details
Exemplo n.º 9
0
    def __init__(
        self, model_name, cache_root, git_repo_dir, method_defect_predictor_dir
    ):
        """Load the requested model and prepare the data and repositories it needs.

        Args:
            model_name: Name of the model to download and load. The values
                "regressor" and "testselect" trigger additional setup.
            cache_root: Existing directory; ``self.repo_dir`` is set to its
                "mozilla-central" subdirectory.
            git_repo_dir: Target directory for the gecko-dev clone; nothing is
                cloned when this is falsy.
            method_defect_predictor_dir: Target directory for the
                MethodDefectPredictor clone; nothing is cloned when this is
                falsy.

        Raises:
            AssertionError: If ``cache_root`` is not a directory, a model fails
                to load, an expected data file is missing after the download
                step, or the support file download reports failure.
        """
        self.model_name = model_name
        self.cache_root = cache_root

        assert os.path.isdir(cache_root), f"Cache root {cache_root} is not a dir."
        self.repo_dir = os.path.join(cache_root, "mozilla-central")

        self.model = download_and_load_model(model_name)
        assert self.model is not None

        self.git_repo_dir = git_repo_dir
        if git_repo_dir:
            self.clone_git_repo("https://github.com/mozilla/gecko-dev", git_repo_dir)

        self.method_defect_predictor_dir = method_defect_predictor_dir
        if method_defect_predictor_dir:
            # NOTE(review): the third argument looks like a pinned revision —
            # confirm against clone_git_repo.
            self.clone_git_repo(
                "https://github.com/lucapascarella/MethodDefectPredictor",
                method_defect_predictor_dir,
                "fa5269b959d8ddf7e97d1e92523bb64c17f9bbcd",
            )

        if model_name == "regressor":
            self.use_test_history = False

            # Only decompress when download_check_etag reports a change; the
            # previously decompressed file is reused otherwise.
            model_data_X_path = f"{model_name}model_data_X"
            updated = download_check_etag(
                URL.format(model_name=model_name, file_name=f"{model_data_X_path}.zst")
            )
            if updated:
                zstd_decompress(model_data_X_path)
            assert os.path.exists(model_data_X_path), "Decompressed X dataset exists"

            model_data_y_path = f"{model_name}model_data_y"
            updated = download_check_etag(
                URL.format(model_name=model_name, file_name=f"{model_data_y_path}.zst")
            )
            if updated:
                zstd_decompress(model_data_y_path)
            assert os.path.exists(model_data_y_path), "Decompressed y dataset exists"

            self.X = to_array(joblib.load(model_data_X_path))
            self.y = to_array(joblib.load(model_data_y_path))

            past_bugs_by_function_path = "data/past_bugs_by_function.pickle"
            download_check_etag(
                PAST_BUGS_BY_FUNCTION_URL, path=f"{past_bugs_by_function_path}.zst"
            )
            zstd_decompress(past_bugs_by_function_path)
            assert os.path.exists(past_bugs_by_function_path)
            # NOTE(security): unpickling runs arbitrary code from the file, so
            # PAST_BUGS_BY_FUNCTION_URL must point to a trusted source.
            with open(past_bugs_by_function_path, "rb") as f:
                self.past_bugs_by_function = pickle.load(f)

        if model_name == "testselect":
            self.use_test_history = True
            # Keep the download outside the assert statement: asserts are
            # stripped under `python -O`, which would silently skip the download.
            downloaded = db.download_support_file(
                test_scheduling.TEST_SCHEDULING_DB, test_scheduling.PAST_FAILURES_DB
            )
            assert downloaded, "Support file download failed"
            self.past_failures_data = test_scheduling.get_past_failures()

            self.testfailure_model = download_and_load_model("testfailure")
            assert self.testfailure_model is not None