Example #1
def test_import_all_models():
    """ Try loading all defined models to ensure that their full qualified
    names are still good
    """

    for model_name in MODELS.keys():
        print("Try loading model", model_name)
        get_model_class(model_name)
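Example #1 iterates over the MODELS registry and resolves each name with get_model_class; the docstring's mention of "fully qualified names" suggests MODELS maps model names to dotted class paths. A minimal sketch of what such a lookup could look like — the registry entry is hypothetical and the real bugbug implementation may differ in details:

import importlib

# Hypothetical registry entry, for illustration only; the real MODELS dict
# in bugbug contains many more models.
MODELS = {
    "component": "bugbug.models.component.ComponentModel",
}


def get_model_class(model_name):
    if model_name not in MODELS:
        raise ValueError(f"Unknown model: {model_name}")

    # Split "package.module.ClassName" into a module path and a class name,
    # import the module lazily, and return the class object.
    module_name, class_name = MODELS[model_name].rsplit(".", 1)
    module = importlib.import_module(module_name)
    return getattr(module, class_name)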
Example #2
def test_import_all_models():
    """Try loading all defined models to ensure that their full qualified
    names are still good
    """

    for model_name in MODELS:
        if model_name == "component_nn" and not importlib.util.find_spec(
                "tensorflow"):
            continue

        print("Try loading model", model_name)
        get_model_class(model_name)
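Example #2 extends the same test to skip the TensorFlow-backed model when TensorFlow is not installed. importlib.util.find_spec checks whether a package is importable without actually importing it, which avoids the heavy TensorFlow startup cost:

import importlib.util

# find_spec returns a ModuleSpec if the package is importable and None
# otherwise, without importing it.
if importlib.util.find_spec("tensorflow") is None:
    print("tensorflow is not installed; skipping neural-network models")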
Example #3
    def go(self, model_name):
        # Download datasets that were built by bugbug_data.
        os.makedirs("data", exist_ok=True)

        model_class = get_model_class(model_name)

        if issubclass(model_class, model.BugModel) or issubclass(
            model_class, model.BugCoupleModel
        ):
            self.download_db("bugs")

        if issubclass(model_class, model.CommitModel):
            self.download_db("commits")

        logger.info(f"Training *{model_name}* model")

        model_obj = model_class()
        model_obj.train()

        logger.info(f"Training done")

        model_file_name = f"{model_name}model"
        assert os.path.exists(model_file_name)
        self.compress_file(model_file_name)

        logger.info(f"Model compressed")
Example #4
def main(args):
    if args.goal == "component":
        if args.classifier == "default":
            model_class_name = "component"
        else:
            model_class_name = "component_nn"
    else:
        model_class_name = args.goal

    model_class = get_model_class(model_class_name)

    if args.train:
        db.download(bugzilla.BUGS_DB)
        db.download(repository.COMMITS_DB)

        historical_supported_tasks = [
            "defect",
            "bugtype",
            "defectenhancementtask",
            "regression",
        ]

        if args.goal in historical_supported_tasks:
            model = model_class(args.lemmatization, args.historical)
        elif args.goal == "duplicate":
            model = model_class(
                args.training_set_size, args.lemmatization, args.cleanup_urls
            )
        else:
            model = model_class(args.lemmatization)
        model.train()
Example #5
    def go(self, model_name):
        # Download datasets that were built by bugbug_data.
        os.makedirs("data", exist_ok=True)

        model_class = get_model_class(model_name)
        model_obj = model_class()

        if (isinstance(model_obj, model.BugModel)
                or isinstance(model_obj, model.BugCoupleModel)
                or (hasattr(model_obj, "bug_data") and model_obj.bug_data)):
            db.download(bugzilla.BUGS_DB, force=True)

        if isinstance(model_obj, model.CommitModel):
            db.download(repository.COMMITS_DB, force=True)

        logger.info(f"Training *{model_name}* model")
        metrics = model_obj.train()

        # Save the metrics as a file that can be uploaded as an artifact.
        metric_file_path = "metrics.json"
        with open(metric_file_path, "w") as metric_file:
            json.dump(metrics, metric_file, cls=CustomJsonEncoder)

        logger.info(f"Training done")

        model_file_name = f"{model_name}model"
        assert os.path.exists(model_file_name)
        zstd_compress(model_file_name)

        logger.info(f"Model compressed")
Example #6
def classify_bugs(model_name, classifier, bug_id):
    if classifier != "default":
        assert (
            model_name in MODELS_WITH_TYPE
        ), f"{classifier} is not a valid classifier type for {model_name}"

        model_file_name = f"{model_name}{classifier}model"
        model_name = f"{model_name}_{classifier}"
    else:
        model_file_name = f"{model_name}model"

    if not os.path.exists(model_file_name):
        logger.info(f"{model_file_name} does not exist. Downloading the model....")
        try:
            download_check_etag(
                f"https://index.taskcluster.net/v1/task/project.relman.bugbug.train_{model_name}.latest/artifacts/public/{model_file_name}.zst",
                f"{model_file_name}.zst",
            )
        except requests.HTTPError:
            logger.error(
                f"A pre-trained model is not available, you will need to train it yourself using the trainer script"
            )
            raise SystemExit(1)

        zstd_decompress(model_file_name)
        assert os.path.exists(model_file_name), "Decompressed file doesn't exist"

    model_class = get_model_class(model_name)
    model = model_class.load(model_file_name)

    if bug_id:
        bugs = bugzilla.get(bug_id).values()
        assert bugs, f"A bug with a bug id of {bug_id} was not found"
    else:
        bugs = bugzilla.get_bugs()

    for bug in bugs:
        print(
            f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug["id"]} - {bug["summary"]} '
        )

        if model.calculate_importance:
            probas, importance = model.classify(
                bug, probabilities=True, importances=True
            )

            model.print_feature_importances(
                importance["importances"], class_probabilities=probas
            )

            with open("importance.html", "w") as f:
                f.write(importance["html"])
        else:
            probas = model.classify(bug, probabilities=True, importances=False)

        if np.argmax(probas) == 1:
            print(f"Positive! {probas}")
        else:
            print(f"Negative! {probas}")
        input()
Example #7
    def go(self, model_name):
        # Download datasets that were built by bugbug_data.
        os.makedirs("data", exist_ok=True)

        model_class = get_model_class(model_name)

        if issubclass(model_class, model.BugModel) or issubclass(
            model_class, model.BugCoupleModel
        ):
            self.download_db("bugs")

        if issubclass(model_class, model.CommitModel):
            self.download_db("commits")

        logger.info(f"Training *{model_name}* model")

        model_obj = model_class()
        metrics = model_obj.train()

        # Save the metrics as a file that can be uploaded as an artifact.
        metric_file_path = "metrics.json"
        with open(metric_file_path, "w") as metric_file:
            json.dump(metrics, metric_file, cls=CustomJsonEncoder)

        logger.info(f"Training done")

        model_file_name = f"{model_name}model"
        assert os.path.exists(model_file_name)
        self.compress_file(model_file_name)

        logger.info(f"Model compressed")
Example #8
    def go(self, args):
        # Download datasets that were built by bugbug_data.
        os.makedirs("data", exist_ok=True)

        if args.classifier != "default":
            assert (
                args.model in MODELS_WITH_TYPE
            ), f"{args.classifier} is not a valid classifier type for {args.model}"

            model_name = f"{args.model}_{args.classifier}"
        else:
            model_name = args.model

        model_class = get_model_class(model_name)
        if args.model in HISTORICAL_SUPPORTED_TASKS:
            model_obj = model_class(args.lemmatization, args.historical)
        elif args.model == "duplicate":
            model_obj = model_class(args.training_set_size, args.lemmatization,
                                    args.cleanup_urls)
        else:
            model_obj = model_class(args.lemmatization)

        if (isinstance(model_obj, model.BugModel)
                or isinstance(model_obj, model.BugCoupleModel)
                or (hasattr(model_obj, "bug_data") and model_obj.bug_data)):
            if args.download_db:
                db.download(bugzilla.BUGS_DB)
            else:
                logger.info("Skipping download of the bug database")

        if isinstance(model_obj, model.CommitModel):
            if args.download_db:
                db.download(repository.COMMITS_DB)
            else:
                logger.info("Skipping download of the commit database")

        logger.info(f"Training *{model_name}* model")
        metrics = model_obj.train(limit=args.limit)

        # Save the metrics as a file that can be uploaded as an artifact.
        metric_file_path = "metrics.json"
        with open(metric_file_path, "w") as metric_file:
            json.dump(metrics, metric_file, cls=CustomJsonEncoder)

        logger.info(f"Training done")

        model_file_name = f"{model_name}model"
        assert os.path.exists(model_file_name)
        zstd_compress(model_file_name)

        logger.info(f"Model compressed")

        if model_obj.store_dataset:
            assert os.path.exists(f"{model_file_name}_data_X")
            zstd_compress(f"{model_file_name}_data_X")
            assert os.path.exists(f"{model_file_name}_data_y")
            zstd_compress(f"{model_file_name}_data_y")
Example #9
    def load_model(self, model_name):
        model_path = f"{model_name}model"
        if not os.path.exists(model_path):
            download_check_etag(
                URL.format(model_name=model_name,
                           file_name=f"{model_path}.zst"))
            zstd_decompress(model_path)
            assert os.path.exists(model_path), "Decompressed model doesn't exist"

        return get_model_class(model_name).load(model_path)
Example #10
    def go(self, args):
        # Download datasets that were built by bugbug_data.
        os.makedirs("data", exist_ok=True)

        if args.classifier != "default":
            assert (
                args.model in MODELS_WITH_TYPE
            ), f"{args.classifier} is not a valid classifier type for {args.model}"

            model_name = f"{args.model}_{args.classifier}"
        else:
            model_name = args.model

        model_class = get_model_class(model_name)
        if args.model in HISTORICAL_SUPPORTED_TASKS:
            model_obj = model_class(args.lemmatization, args.historical)
        elif args.model == "regressor":
            model_obj = model_class(args.lemmatization, args.interpretable)
        elif args.model == "duplicate":
            model_obj = model_class(
                args.training_set_size, args.lemmatization, args.cleanup_urls
            )
        else:
            model_obj = model_class(args.lemmatization)

        if args.download_db:
            for required_db in model_obj.training_dbs:
                assert db.download(required_db)

            if args.download_eval:
                model_obj.download_eval_dbs()
        else:
            logger.info("Skipping download of the databases")

        logger.info(f"Training *{model_name}* model")
        metrics = model_obj.train(limit=args.limit)

        # Save the metrics as a file that can be uploaded as an artifact.
        metric_file_path = "metrics.json"
        with open(metric_file_path, "w") as metric_file:
            json.dump(metrics, metric_file, cls=CustomJsonEncoder)

        logger.info("Training done")

        model_file_name = f"{model_name}model"
        assert os.path.exists(model_file_name)
        zstd_compress(model_file_name)

        logger.info("Model compressed")

        if model_obj.store_dataset:
            assert os.path.exists(f"{model_file_name}_data_X")
            zstd_compress(f"{model_file_name}_data_X")
            assert os.path.exists(f"{model_file_name}_data_y")
            zstd_compress(f"{model_file_name}_data_y")
Example #11
def classify_bugs(model_name: str, classifier: str, bug_id: int) -> None:
    if classifier != "default":
        assert (
            model_name in MODELS_WITH_TYPE
        ), f"{classifier} is not a valid classifier type for {model_name}"

        model_file_name = f"{model_name}{classifier}model"
        model_name = f"{model_name}_{classifier}"
    else:
        model_file_name = f"{model_name}model"

    if not os.path.exists(model_file_name):
        logger.info(
            f"{model_file_name} does not exist. Downloading the model....")
        try:
            download_model(model_name)
        except requests.HTTPError:
            logger.error(
                "A pre-trained model is not available, you will need to train it yourself using the trainer script"
            )
            raise SystemExit(1)

    model_class = get_model_class(model_name)
    model = model_class.load(model_file_name)

    if bug_id:
        bugs = bugzilla.get(bug_id).values()
        assert bugs, f"A bug with a bug id of {bug_id} was not found"
    else:
        assert db.download(bugzilla.BUGS_DB)
        bugs = bugzilla.get_bugs()

    for bug in bugs:
        print(
            f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug["id"]} - {bug["summary"]} '
        )

        if model.calculate_importance:
            probas, importance = model.classify(bug,
                                                probabilities=True,
                                                importances=True)

            model.print_feature_importances(importance["importances"],
                                            class_probabilities=probas)
        else:
            probas = model.classify(bug, probabilities=True, importances=False)

        probability = probas[0]
        pred_index = np.argmax(probability)
        if len(probability) > 2:
            pred_class = model.le.inverse_transform([pred_index])[0]
        else:
            pred_class = "Positive" if pred_index == 1 else "Negative"
        print(f"{pred_class} {probability}")
        input()
Example #12
def classify_issues(owner: str, repo: str, retrieve_events: bool,
                    model_name: str, issue_number: int) -> None:

    model_file_name = f"{model_name}model"

    if not os.path.exists(model_file_name):
        logger.info(
            f"{model_file_name} does not exist. Downloading the model....")
        try:
            download_model(model_name)
        except requests.HTTPError:
            logger.error(
                "A pre-trained model is not available, you will need to train it yourself using the trainer script"
            )
            raise SystemExit(1)

    model_class = get_model_class(model_name)
    model = model_class.load(model_file_name)

    if issue_number:
        issue = github.fetch_issue_by_number(owner, repo, issue_number,
                                             retrieve_events)
        assert issue, f"An issue with a number of {issue_number} was not found"
        issues = iter([issue])
    else:
        assert db.download(github.GITHUB_ISSUES_DB)
        issues = github.get_issues()

    for issue in issues:
        print(f'{issue["url"]} - {issue["title"]} ')

        if model.calculate_importance:
            probas, importance = model.classify(issue,
                                                probabilities=True,
                                                importances=True)

            model.print_feature_importances(importance["importances"],
                                            class_probabilities=probas)
        else:
            probas = model.classify(issue,
                                    probabilities=True,
                                    importances=False)

        probability = probas[0]
        pred_index = np.argmax(probability)
        if len(probability) > 2:
            pred_class = model.le.inverse_transform([pred_index])[0]
        else:
            pred_class = "Positive" if pred_index == 1 else "Negative"
        print(f"{pred_class} {probability}")
        input()
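Examples #11 and #12 decode multiclass predictions through model.le.inverse_transform, which suggests le is a scikit-learn LabelEncoder fitted on the class labels during training. A hedged illustration of that decoding step with a stand-in encoder:

import numpy as np
from sklearn.preprocessing import LabelEncoder

# Stand-in for the encoder a trained model would expose as model.le.
le = LabelEncoder().fit(["defect", "enhancement", "task"])

probability = np.array([0.1, 0.7, 0.2])  # one row of class probabilities
pred_index = np.argmax(probability)
print(le.inverse_transform([pred_index])[0])  # -> "enhancement"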
Example #13
    def go(self, args):
        # Download datasets that were built by bugbug_data.
        os.makedirs("data", exist_ok=True)

        if args.classifier != "default":
            assert (
                args.model in MODELS_WITH_TYPE
            ), f"{args.classifier} is not a valid classifier type for {args.model}"

            model_name = f"{args.model}_{args.classifier}"
        else:
            model_name = args.model

        model_class = get_model_class(model_name)
        parameter_names = set(
            inspect.signature(model_class.__init__).parameters)
        parameters = {
            key: value
            for key, value in vars(args).items() if key in parameter_names
        }
        model_obj = model_class(**parameters)

        if args.download_db:
            for required_db in model_obj.training_dbs:
                assert db.download(required_db)

            if args.download_eval:
                model_obj.download_eval_dbs()
        else:
            logger.info("Skipping download of the databases")

        logger.info(f"Training *{model_name}* model")
        metrics = model_obj.train(limit=args.limit)

        # Save the metrics as a file that can be uploaded as an artifact.
        metric_file_path = "metrics.json"
        with open(metric_file_path, "w") as metric_file:
            json.dump(metrics, metric_file, cls=CustomJsonEncoder)

        logger.info("Training done")

        model_file_name = f"{model_name}model"
        assert os.path.exists(model_file_name)
        zstd_compress(model_file_name)

        logger.info("Model compressed")

        if model_obj.store_dataset:
            assert os.path.exists(f"{model_file_name}_data_X")
            zstd_compress(f"{model_file_name}_data_X")
            assert os.path.exists(f"{model_file_name}_data_y")
            zstd_compress(f"{model_file_name}_data_y")
Example #14
def generate_sheet(model_name, token, days, threshold):
    model_file_name = f"{model_name}model"

    assert os.path.exists(
        model_file_name
    ), f"{model_file_name} does not exist. Train the model with trainer.py first."

    model_class = get_model_class(model_name)
    model = model_class.load(model_file_name)

    today = datetime.utcnow()
    start_date = today - timedelta(days)
    bugzilla.set_token(token)
    bug_ids = bugzilla.get_ids_between(start_date, today)
    bugs = bugzilla.get(bug_ids)

    print(f"Classifying {len(bugs)} bugs...")

    rows = [["Bug", f"{model_name}(model)", model_name, "Title"]]

    for bug in bugs.values():
        p = model.classify(bug, probabilities=True)
        probability = p[0]
        if len(probability) > 2:
            index = np.argmax(probability)
            prediction = model.class_names[index]
        else:
            prediction = "y" if probability[1] >= threshold else "n"

        rows.append(
            [
                f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug["id"]}',
                prediction,
                "",
                bug["summary"],
            ]
        )

    os.makedirs("sheets", exist_ok=True)
    with open(
        os.path.join(
            "sheets",
            f'{model_name}-{datetime.utcnow().strftime("%Y-%m-%d")}-labels.csv',
        ),
        "w",
    ) as f:
        writer = csv.writer(f)
        writer.writerows(rows)
Example #15
def classify_bugs(model_name, classifier):
    if classifier != "default":
        assert (
            model_name in MODELS_WITH_TYPE
        ), f"{classifier} is not a valid classifier type for {model_name}"

        model_file_name = f"{model_name}{classifier}model"
        model_name = f"{model_name}_{classifier}"
    else:
        model_file_name = f"{model_name}model"

    assert os.path.exists(
        model_file_name
    ), f"{model_file_name} does not exist. Train the model with trainer.py first."

    model_class = get_model_class(model_name)
    model = model_class.load(model_file_name)

    for bug in bugzilla.get_bugs():
        print(
            f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug["id"]} - {bug["summary"]} '
        )

        if model.calculate_importance:
            probas, importance = model.classify(
                bug, probabilities=True, importances=True
            )

            feature_names = model.get_human_readable_feature_names()

            model.print_feature_importances(
                importance["importances"], feature_names, class_probabilities=probas
            )
        else:
            probas = model.classify(bug, probabilities=True, importances=False)

        if np.argmax(probas) == 1:
            print(f"Positive! {probas}")
        else:
            print(f"Negative! {probas}")
        input()
Example #16
def generate_sheet(model_name, token):
    model_file_name = f"{model_name}model"

    assert os.path.exists(
        model_file_name
    ), f"{model_file_name} does not exist. Train the model with trainer.py first."

    model_class = get_model_class(model_name)
    model = model_class.load(model_file_name)

    today = datetime.utcnow()
    a_week_ago = today - timedelta(7)
    bugzilla.set_token(token)
    bug_ids = bugzilla.get_ids_between(a_week_ago, today)
    bugs = bugzilla.get(bug_ids)

    print(f"Classifying {len(bugs)} bugs...")

    rows = [["Bug", f"{model_name}(model)", model_name, "Title"]]

    for bug in bugs.values():
        p = model.classify(bug, probabilities=True)
        rows.append([
            f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug["id"]}',
            "y" if p[0][1] >= 0.7 else "n",
            "",
            bug["summary"],
        ])

    os.makedirs("sheets", exist_ok=True)
    with open(
            os.path.join(
                "sheets",
                f'{model_name}-{datetime.utcnow().strftime("%Y-%m-%d")}-labels.csv',
            ),
            "w",
    ) as f:
        writer = csv.writer(f)
        writer.writerows(rows)
Example #17
    def go(self, model_name):
        # Download datasets that were built by bugbug_data.
        os.makedirs("data", exist_ok=True)

        # Bugs.json
        logger.info("Downloading bugs database")
        bugs_url = BASE_URL.format("bugs")
        urlretrieve(f"{bugs_url}/bugs.json.xz", "data/bugs.json.xz")
        logger.info("Decompressing bugs database")
        self.decompress_file("data/bugs.json")

        logger.info(f"Training *{model_name}* model")

        model_class = get_model_class(model_name)
        model = model_class()
        model.train()

        logger.info(f"Training done")

        model_file_name = f"{model_name}model"
        assert os.path.exists(model_file_name)
        self.compress_file(model_file_name)

        logger.info(f"Model compressed")
Example #18
    args = parser.parse_args()

    model_file_name = "{}{}model".format(
        args.goal, "" if args.classifier == "default" else args.classifier)

    model_class_name = args.goal

    if args.goal == "component":
        if args.classifier == "default":
            model_class_name = "component"
        elif args.classifier == "nn":
            model_class_name = "component_nn"
        else:
            raise ValueError(f"Unknown value {args.classifier}")

    model_class = get_model_class(model_class_name)

    if args.train:
        db.download(bugzilla.BUGS_DB)
        db.download(repository.COMMITS_DB)

        if args.historical:
            model = model_class(args.lemmatization, args.historical)
        else:
            model = model_class(args.lemmatization)
        model.train()
    else:
        model = model_class.load(model_file_name)

    if args.classify:
        for bug in bugzilla.get_bugs():
Example #19
def download_and_load_model(model_name):
    path = download_model(model_name)
    return get_model_class(model_name).load(path)
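Example #19 condenses the download/decompress/load sequence of Examples #9 and #24 into a single helper. A typical call site might look like this; the "defect" name is illustrative, and network access is needed on first use to fetch the pre-trained artifact:

# "defect" is a hypothetical model name here; any key of MODELS would do.
model = download_and_load_model("defect")
print(f"Loaded {type(model).__name__}")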
Example #20
def main(args):
    model_file_name = "{}{}model".format(
        args.goal, "" if args.classifier == "default" else args.classifier)

    if args.goal == "component":
        if args.classifier == "default":
            model_class_name = "component"
        else:
            model_class_name = "component_nn"
    else:
        model_class_name = args.goal

    model_class = get_model_class(model_class_name)

    if args.train:
        db.download(bugzilla.BUGS_DB)
        db.download(repository.COMMITS_DB)

        historical_supported_tasks = [
            "defect",
            "bugtype",
            "defectenhancementtask",
            "regression",
        ]

        if args.goal in historical_supported_tasks:
            model = model_class(args.lemmatization, args.historical)
        elif args.goal == "duplicate":
            model = model_class(args.training_set_size, args.lemmatization)
        else:
            model = model_class(args.lemmatization)
        model.train()
    else:
        model = model_class.load(model_file_name)

    if args.classify:
        for bug in bugzilla.get_bugs():
            print(
                f'https://bugzilla.mozilla.org/show_bug.cgi?id={ bug["id"] } - { bug["summary"]} '
            )

            if model.calculate_importance:
                probas, importance = model.classify(bug,
                                                    probabilities=True,
                                                    importances=True)

                feature_names = model.get_feature_names()
                for i, (imp, index,
                        is_positive) in enumerate(importance["importances"]):
                    print(
                        f'{i + 1}. \'{feature_names[int(index)]}\' ({"+" if is_positive else "-"}{imp})'
                    )
            else:
                probas = model.classify(bug,
                                        probabilities=True,
                                        importances=False)

            if np.argmax(probas) == 1:
                print(f"Positive! {probas}")
            else:
                print(f"Negative! {probas}")
            input()

    if args.generate_sheet:
        assert (args.token is not None
                ), "A Bugzilla token should be set in order to download bugs"
        today = datetime.utcnow()
        a_week_ago = today - timedelta(7)
        bugzilla.set_token(args.token)
        bugs = bugzilla.download_bugs_between(a_week_ago, today)

        print(f"Classifying {len(bugs)} bugs...")

        rows = [["Bug", f"{args.goal}(model)", args.goal, "Title"]]

        for bug in bugs:
            p = model.classify(bug, probabilities=True)
            rows.append([
                f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug["id"]}',
                "y" if p[0][1] >= 0.7 else "n",
                "",
                bug["summary"],
            ])

        os.makedirs("sheets", exist_ok=True)
        with open(
                os.path.join(
                    "sheets",
                    f'{args.goal}-{datetime.utcnow().strftime("%Y-%m-%d")}-labels.csv',
                ),
                "w",
        ) as f:
            writer = csv.writer(f)
            writer.writerows(rows)
Example #21
def test_component_is_bugmodel():
    model_class = get_model_class("component")
    assert issubclass(model_class, model.BugModel)
    model_class = get_model_class("regression")
    assert issubclass(model_class, model.BugModel)
Example #22
def test_backout_is_commitmodel():
    model_class = get_model_class("backout")
    assert issubclass(model_class, model.CommitModel)
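Examples #21 and #22 pin down the class hierarchy one model at a time; with pytest the same checks can be parametrized so each pairing is reported as its own test case. A sketch, with import paths assumed from the usage in the examples above:

import pytest

from bugbug import model
from bugbug.models import get_model_class  # assumed import path


@pytest.mark.parametrize(
    "model_name, expected_base",
    [
        ("component", model.BugModel),
        ("regression", model.BugModel),
        ("backout", model.CommitModel),
    ],
)
def test_model_base_class(model_name, expected_base):
    assert issubclass(get_model_class(model_name), expected_base)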
Example #23
def main(args):
    model_file_name = "{}{}model".format(
        args.goal, "" if args.classifier == "default" else args.classifier)

    if args.goal == "component":
        if args.classifier == "default":
            model_class_name = "component"
        else:
            model_class_name = "component_nn"
    else:
        model_class_name = args.goal

    model_class = get_model_class(model_class_name)

    if args.train:
        db.download(bugzilla.BUGS_DB)
        db.download(repository.COMMITS_DB)

        historical_supported_tasks = [
            "defect",
            "bugtype",
            "defectenhancementtask",
            "regression",
        ]

        if args.goal in historical_supported_tasks:
            model = model_class(args.lemmatization, args.historical)
        elif args.goal == "duplicate":
            model = model_class(args.training_set_size, args.lemmatization,
                                args.cleanup_urls)
        else:
            model = model_class(args.lemmatization)
        model.train()
    else:
        model = model_class.load(model_file_name)

    if args.classify:
        for bug in bugzilla.get_bugs():
            print(
                f'https://bugzilla.mozilla.org/show_bug.cgi?id={ bug["id"] } - { bug["summary"]} '
            )

            if model.calculate_importance:
                probas, importance = model.classify(bug,
                                                    probabilities=True,
                                                    importances=True)

                feature_names = model.get_human_readable_feature_names()

                model.print_feature_importances(importance["importances"],
                                                feature_names,
                                                class_probabilities=probas)
            else:
                probas = model.classify(bug,
                                        probabilities=True,
                                        importances=False)

            if np.argmax(probas) == 1:
                print(f"Positive! {probas}")
            else:
                print(f"Negative! {probas}")
            input()
Example #24
    def __init__(self, model_name, cache_root, git_repo_dir,
                 method_defect_predictor_dir):
        self.model_name = model_name
        self.cache_root = cache_root

        assert os.path.isdir(
            cache_root), f"Cache root {cache_root} is not a dir."
        self.repo_dir = os.path.join(cache_root, "mozilla-central")

        model_path = f"{model_name}model"
        if not os.path.exists(model_path):
            download_check_etag(URL.format(model_name=model_name),
                                f"{model_path}.zst")
            zstd_decompress(model_path)
            assert os.path.exists(model_path), "Decompressed model doesn't exist"

        self.model = get_model_class(model_name).load(model_path)

        self.git_repo_dir = git_repo_dir
        if git_repo_dir:
            self.clone_git_repo("https://github.com/mozilla/gecko-dev",
                                git_repo_dir)

        self.method_defect_predictor_dir = method_defect_predictor_dir
        if method_defect_predictor_dir:
            self.clone_git_repo(
                "https://github.com/lucapascarella/MethodDefectPredictor",
                method_defect_predictor_dir,
                "fa5269b959d8ddf7e97d1e92523bb64c17f9bbcd",
            )

        if model_name == "regressor":
            self.use_test_history = False

            model_data_X_path = f"{model_name}model_data_X"
            if not os.path.exists(model_data_X_path):
                download_check_etag(
                    URL.format(f"{model_data_X_path}.zst"),
                    f"{model_data_X_path}.zst",
                )
                zstd_decompress(model_data_X_path)
                assert os.path.exists(
                    model_data_X_path), "Decompressed X dataset doesn't exist"

            model_data_y_path = f"{model_name}model_data_y"
            if not os.path.exists(model_data_y_path):
                download_check_etag(
                    URL.format(f"{model_data_y_path}.zst"),
                    f"{model_data_y_path}.zst",
                )
                zstd_decompress(model_data_y_path)
                assert os.path.exists(
                    model_data_y_path), "Decompressed y dataset doesn't exist"

            self.X = to_array(joblib.load(model_data_X_path))
            self.y = to_array(joblib.load(model_data_y_path))

        if model_name == "testselect":
            self.use_test_history = True
            assert db.download_support_file(test_scheduling.TEST_SCHEDULING_DB,
                                            test_scheduling.PAST_FAILURES_DB)
            self.past_failures_data = test_scheduling.get_past_failures()
Example #25
def parse_args(args):
    description = "Train the models"
    main_parser = argparse.ArgumentParser(description=description)

    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument(
        "--limit",
        type=int,
        help="Only train on a subset of the data, used mainly for integrations tests",
    )
    parser.add_argument(
        "--no-download",
        action="store_false",
        dest="download_db",
        help="Do not download databases, uses whatever is on disk",
    )
    parser.add_argument(
        "--download-eval",
        action="store_true",
        dest="download_eval",
        help="Download databases and database support files required at runtime (e.g. if the model performs custom evaluations)",
    )
    parser.add_argument(
        "--lemmatization",
        help="Perform lemmatization (using spaCy)",
        action="store_true",
    )
    parser.add_argument(
        "--classifier",
        help="Type of the classifier. Only used for component classification.",
        choices=["default", "nn"],
        default="default",
    )

    subparsers = main_parser.add_subparsers(title="model", dest="model", required=True)

    for model_name in MODELS:
        subparser = subparsers.add_parser(
            model_name, parents=[parser], help=f"Train {model_name} model"
        )

        try:
            model_class_init = get_model_class(model_name).__init__
        except ImportError:
            continue

        for parameter in inspect.signature(model_class_init).parameters.values():
            if parameter.name == "self":
                continue

            # Skip parameters handled by the base class (TODO: add them to the common argparser and skip them automatically without hardcoding by inspecting the base class)
            if parameter.name == "lemmatization":
                continue

            parameter_type = parameter.annotation
            if parameter_type is inspect.Parameter.empty:
                parameter_type = type(parameter.default)
            assert parameter_type is not None

            if parameter_type == bool:
                subparser.add_argument(
                    f"--{parameter.name}"
                    if parameter.default is False
                    else f"--no-{parameter.name}",
                    action="store_true"
                    if parameter.default is False
                    else "store_false",
                    dest=parameter.name,
                )
            else:
                subparser.add_argument(
                    f"--{parameter.name}",
                    default=parameter.default,
                    dest=parameter.name,
                    type=int,
                )

    return main_parser.parse_args(args)
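The parser built in Example #25 pairs with the introspection-driven go(args) of Example #13. A minimal entry point could look like this, assuming a hypothetical Trainer class that holds that go method:

import sys

if __name__ == "__main__":
    args = parse_args(sys.argv[1:])
    Trainer().go(args)  # Trainer: hypothetical class wrapping go() above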