Exemplo n.º 1
0
def get_top_terms(issues: DataFrame, metric: str) -> dict:
    """Score every term of the bug descriptions against a metric.

    Parameters:
    ----------
    issues:
        Bug reports. Texts are read from the "Description_tr" column and
        target values from the column named by ``metric``.
    metric:
        Name of the column in ``issues`` used as the chi-squared target.

    Returns:
    ----------
        Mapping of each vectorizer feature name to its chi-squared score.
    """
    vectorizer = StemmedTfidfVectorizer(stop_words=get_stop_words(issues))
    term_matrix = vectorizer.fit_transform(issues["Description_tr"])

    # k="all" keeps every feature: the selector is used only for its scores.
    scorer = SelectKBest(score_func=feature_selection.chi2, k="all")
    scorer.fit_transform(term_matrix, issues[metric])

    terms = vectorizer.get_feature_names()
    return {term: score for term, score in zip(terms, scorer.scores_)}
Exemplo n.º 2
0
def calculate_frequently_terms(df: pd.DataFrame) -> list:
    """Rank the terms of the bug descriptions by their IDF weight.

    Parameters
    ----------
    df:
        Bug reports. Texts are read from the "Description_tr" column.

    Returns
    -------
        List of at most 100 terms ordered by descending IDF weight. Terms
        with equal weight keep the vectorizer's feature order, so the
        result is reproducible between runs. If fitting the vectorizer
        raises ValueError, the message string
        "Oops! Too little data to calculate." is returned instead of a list.
    """
    descriptions = df["Description_tr"]

    sw = get_stop_words(df)
    tfidf = StemmedTfidfVectorizer(stop_words=sw)

    try:
        tfidf.fit_transform(descriptions)
    except ValueError:
        # Kept as a return value (not an exception) because existing
        # callers receive this message in place of the list.
        return "Oops! Too little data to calculate."

    weights = dict(zip(tfidf.get_feature_names(), tfidf.idf_))

    # Sort the items directly: sorted() is stable (also with reverse=True),
    # so ties stay in feature order. Routing the items through a set first
    # made the order of ties depend on string hashing, which varies per run.
    # NOTE(review): descending IDF appears to put the rarest terms first,
    # which seems at odds with the function name - confirm the intended
    # direction before changing it.
    ranked = sorted(weights.items(), key=lambda pair: pair[1], reverse=True)
    return [term for term, _ in ranked[:100]]
Exemplo n.º 3
0
def train(
    instance: Model,
    issues: DataFrame,
    areas_of_testing: List[str],
    resolution: List[str],
):
    """Train models and store them together with their parameters.

    Parameters:
    ----------
    instance:
        Instance of User model.
    issues:
        Bug reports.
    areas_of_testing:
        areas of testing.
    resolution:
        resolution.

    Raises:
    ----------
    LittleDataToAnalyze
        When check_bugs_count rejects the resolved bug reports.
    ResolutionElementsMissed
        When compare_resolutions reports missing resolution elements.
    SmallNumberRepresentatives
        When requested classes were dropped by filter_classes
        (a leftover of exactly {"Other"} is tolerated).
    InconsistentGivenData
        When a ValueError is raised while preparing or running
        the training tasks.
    """
    def _make_smote(target):
        """Builds a SMOTE instance whose k_neighbors fits the target."""
        smt = SMOTE(
            ratio="minority",
            random_state=0,
            kind="borderline1",
            n_jobs=4,
        )
        smt.k_neighbors = get_k_neighbors(target)
        return smt

    def _params_producer() -> Tuple[Series, Series, SMOTE, str]:
        """Generates parameters for imbalance training.

        Yields:
        ----------
            Bugs description, classes codes, SMOTE instance and model name.
        """
        for metric in filtered_classes:
            if metric in ["Priority", "Time to Resolve"]:
                # Multiclass metrics: train only on rows whose class
                # survived filtering, using the encoded "<metric>_codes".
                filtered_df = issues[issues[metric].isin(
                    filtered_classes[metric])]
                classes_codes = filtered_df[metric.lower() + "_codes"]
                smt = _make_smote(classes_codes)
                model_name = metric.split("_")[0]

                yield filtered_df.Description_tr, classes_codes, smt, model_name
            else:
                # Remaining metrics: one model per class, read from the
                # column named after the class ("Resolution_" prefixed
                # for resolutions).
                for class_ in filtered_classes[metric]:
                    df_index = ("Resolution_" +
                                class_ if metric == "Resolution" else class_)
                    smt = _make_smote(issues[df_index])

                    yield issues.Description_tr, issues[df_index], smt, class_

    # Keep only resolved bugs. The former extra term
    # `issues["Resolved"].isna() is not True` was an identity test on a
    # Series (always True) and therefore filtered nothing.
    issues = issues[(issues["Resolution"] != "Unresolved")
                    & (issues["Resolved"].notnull())]
    issues = issues.reset_index()

    if not check_bugs_count(issues):
        raise LittleDataToAnalyze

    issues = encode_series(issues)
    filtered_classes = filter_classes(issues, areas_of_testing, resolution)

    # TODO: remove unnecessary resolution verification
    # when settings will be linked to data
    missing_resolutions = compare_resolutions(issues, resolution)
    if missing_resolutions:
        raise ResolutionElementsMissed(
            f"Oops! These Resolution elements are missed: {missing_resolutions}. Models can't be trained."
        )

    # Requested classes that did not survive filter_classes.
    requested = set(resolution).union(set(areas_of_testing))
    retained = set(filtered_classes.get("Resolution")).union(
        set(filtered_classes.get("areas_of_testing")))
    filtered_elements = requested.difference(retained)

    if filtered_elements and filtered_elements != {"Other"}:
        raise SmallNumberRepresentatives(
            f"Oops! Too little number of class representatives for: {filtered_elements}. Models can't be trained."
        )

    svm_imb = SVC(gamma=2, C=1, probability=True, class_weight="balanced")
    sw = get_stop_words(issues)
    tfidf = StemmedTfidfVectorizer(stop_words=sw)

    try:
        with concurrent.futures.ProcessPoolExecutor() as executor:
            futures = [
                executor.submit(
                    train_imbalance,
                    description,
                    classes,
                    tfidf,
                    smote,
                    chi2,
                    50,
                    svm_imb,
                    model_name,
                )
                for description, classes, smote, model_name in
                _params_producer()
            ]
            # Read every result exactly once. Iterating as_completed()
            # twice does not guarantee the same order on both passes, so
            # models and params could end up paired with the wrong task.
            results = [
                future.result()
                for future in concurrent.futures.as_completed(futures)
            ]
    except ValueError as error:
        raise InconsistentGivenData from error

    models = [result[0] for result in results]
    params = [result[1] for result in results]

    save_models(user=instance, models=models)

    filtered_classes["Time to Resolve"] = stringify_ttr_intervals(
        filtered_classes["Time to Resolve"])
    filtered_classes["binary"] = [0, 1]

    save_training_parameters(user=instance,
                             classes=filtered_classes,
                             params=params)

    resolutions = [
        "Resolution_" + resol for resol in filtered_classes["Resolution"]
    ]

    save_top_terms(
        user=instance,
        issues=issues,
        resolutions=resolutions,
        priorities=filtered_classes["Priority"],
        areas_of_testing=filtered_classes["areas_of_testing"],
    )
Exemplo n.º 4
0
def train(instance: Model, df: pd.DataFrame, areas_of_testing: list,
          resolution: list) -> None:
    """ Train models and store them together with their parameters.

    Nothing is returned; the results are handed to save_models,
    save_training_parameters and save_top_terms.

    Parameters:
    ----------
    instance:
        Instance of User model;
    df:
        Bug reports;
    areas_of_testing:
        areas of testing;
    resolution:
        resolution.

    Raises:
    ----------
    LittleDataToAnalyze
        When check_bugs_count rejects the resolved bug reports.
    ResolutionElementsMissed
        When compare_resolutions reports missing resolution elements or
        requested resolutions were dropped by filter_classes.
    InconsistentGivenData
        When a ValueError is raised while preparing or running
        the training tasks.
    """
    def _make_smote(target):
        """ Builds a SMOTE instance whose k_neighbors fits the target."""
        smt = SMOTE(
            ratio="minority",
            random_state=0,
            kind="borderline1",
            n_jobs=4,
        )
        smt.k_neighbors = get_k_neighbors(target)
        return smt

    def _params_producer() -> tuple:
        """ Generates parameters for imbalance training.

        Yields:
        ----------
            Bugs description, classes codes, SMOTE instance and model name.
        """
        for metric in filtered_classes:
            if metric == "Priority" or metric == "Time to Resolve":
                filtered_df = df[df[metric].isin(filtered_classes[metric])]
                classes_codes = filtered_df[metric.lower() + "_codes"]
                # k_neighbors is derived from the same filtered codes that
                # SMOTE will resample, not from the unfiltered frame.
                smt = _make_smote(classes_codes)
                model_name = metric.split("_")[0]

                yield filtered_df.Description_tr, classes_codes, smt, model_name
            else:
                # Remaining metrics: one model per class, read from the
                # column named after the class ("Resolution_" prefixed
                # for resolutions).
                for class_ in filtered_classes[metric]:
                    df_index = ("Resolution_" +
                                class_ if metric == "Resolution" else class_)
                    smt = _make_smote(df[df_index])

                    yield df.Description_tr, df[df_index], smt, class_

    # Keep only resolved bugs. The former extra term
    # `df["Resolved"].isna() is not True` was an identity test on a
    # Series (always True) and therefore filtered nothing.
    df = df[(df["Resolution"] != "Unresolved")
            & (df["Resolved"].notnull())]
    df = df.reset_index()

    if not check_bugs_count(df):
        raise LittleDataToAnalyze

    df = encode_series(df)
    filtered_classes = filter_classes(df, areas_of_testing, resolution)

    # TODO: remove unnecessary resolution verification
    # when settings will be linked to data
    missing_resolutions = compare_resolutions(df, resolution)
    if missing_resolutions:
        raise ResolutionElementsMissed(
            f"Oops! These Resolution elements are missed: {missing_resolutions}. Models can't be trained."
        )

    # Requested resolutions that did not survive filter_classes.
    filtered_resolutions = set(resolution).difference(
        set(filtered_classes.get("Resolution")))
    if filtered_resolutions:
        raise ResolutionElementsMissed(
            f"Oops! These Resolution elements are missed: {filtered_resolutions}. Models can't be trained."
        )

    svm_imb = SVC(gamma=2, C=1, probability=True, class_weight="balanced")

    sw = get_stop_words(df)
    tfidf = StemmedTfidfVectorizer(stop_words=sw)

    try:
        with Pool() as pool:
            pending = [
                pool.apply_async(
                    train_imbalance,
                    args=(
                        description,
                        classes,
                        tfidf,
                        smote,
                        chi2,
                        50,
                        svm_imb,
                        model_name,
                    ),
                ) for description, classes, smote, model_name in
                _params_producer()
            ]
            # Results must be fetched while the pool is still open.
            models = [task.get() for task in pending]
    except ValueError as error:
        raise InconsistentGivenData from error

    save_models(models, instance)

    filtered_classes["Time to Resolve"] = stringify_ttr_intervals(
        filtered_classes["Time to Resolve"])
    filtered_classes["binary"] = [0, 1]

    save_training_parameters(get_models_dir(instance), filtered_classes)

    resolutions = [
        "Resolution_" + resol for resol in filtered_classes["Resolution"]
    ]
    save_top_terms(
        get_models_dir(instance),
        df,
        resolutions,
        filtered_classes["Priority"],
        filtered_classes["areas_of_testing"],
    )