def get_top_terms(issues: DataFrame, metric: str) -> dict:
    """Score TF-IDF terms against *metric* with the chi-squared test.

    Parameters
    ----------
    issues:
        Bug reports.
    metric:
        Column name whose values are used as the target for scoring.

    Returns
    -------
    dict
        Mapping of term -> chi2 score for every feature.
    """
    sw = get_stop_words(issues)
    tfidf = StemmedTfidfVectorizer(stop_words=sw)
    tfs = tfidf.fit_transform(issues["Description_tr"])
    y = issues[metric]
    # k="all" keeps every feature.  Only the fitted scores_ are used, so
    # fit() is enough — the old code called fit_transform() and threw the
    # transformed matrix away.
    selector = SelectKBest(score_func=feature_selection.chi2, k="all")
    selector.fit(tfs, y)
    return dict(zip(tfidf.get_feature_names(), selector.scores_))
def calculate_frequently_terms(df: pd.DataFrame) -> list:
    """Calculate the most frequently used terms.

    Parameters
    ----------
    df:
        Bug reports.

    Returns
    -------
    list
        The first 100 terms, ordered by descending IDF weight.
        NOTE(review): when there is too little data this returns an
        apology *string* instead of a list.  Callers appear to display
        that message, so the behavior is preserved even though it
        contradicts the annotation — confirm before tightening.
    """
    descriptions = df["Description_tr"]
    sw = get_stop_words(df)
    tfidf = StemmedTfidfVectorizer(stop_words=sw)
    try:
        tfidf.fit_transform(descriptions)
    except ValueError:
        return "Oops! Too little data to calculate."
    # Pair each feature with its IDF weight and sort by weight, descending.
    # The previous implementation wrapped the dict items in a set and built
    # an OrderedDict only to slice its keys; feature names are unique, so
    # sorting the pairs directly is equivalent and simpler.
    weighted = sorted(
        zip(tfidf.get_feature_names(), tfidf.idf_),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return [term for term, _ in weighted[:100]]
def train(
    instance: Model,
    issues: DataFrame,
    areas_of_testing: List[str],
    resolution: List[str],
):
    """Train models.

    Parameters
    ----------
    instance:
        Instance of User model.
    issues:
        Bug reports.
    areas_of_testing:
        Areas of testing.
    resolution:
        Resolution.

    Raises
    ------
    LittleDataToAnalyze
        If there are too few bug reports to train on.
    ResolutionElementsMissed
        If requested resolution elements are absent from the data.
    SmallNumberRepresentatives
        If some requested classes have too few representatives.
    InconsistentGivenData
        If training fails on inconsistent input data.
    """

    def _params_producer() -> Tuple[Series, Series, SMOTE, str]:
        """Generate parameters for imbalance training.

        Yields
        ------
        Bug descriptions, class codes, SMOTE instance and model name.
        """
        for metric in filtered_classes.keys():
            if metric in ["Priority", "Time to Resolve"]:
                filtered_df = issues[issues[metric].isin(
                    filtered_classes[metric])]
                smt = SMOTE(
                    ratio="minority",
                    random_state=0,
                    kind="borderline1",
                    n_jobs=4,
                )
                codes_column = metric.lower() + "_codes"
                smt.k_neighbors = get_k_neighbors(filtered_df[codes_column])
                classes_codes = filtered_df[codes_column]
                model_name = metric.split("_")[0]
                yield filtered_df.Description_tr, classes_codes, smt, model_name
            else:
                for class_ in filtered_classes[metric]:
                    df_index = ("Resolution_" + class_
                                if metric == "Resolution" else class_)
                    smt = SMOTE(
                        ratio="minority",
                        random_state=0,
                        kind="borderline1",
                        n_jobs=4,
                    )
                    smt.k_neighbors = get_k_neighbors(issues[df_index])
                    yield issues.Description_tr, issues[df_index], smt, class_

    # The old filter also had `issues["Resolved"].isna() is not True`, which
    # compared a Series to the True singleton with `is` — always truthy, a
    # no-op.  notnull() already excludes missing values, so it is dropped.
    issues = issues[(issues["Resolution"] != "Unresolved")
                    & (issues["Resolved"].notnull())]
    issues = issues.reset_index()
    if not check_bugs_count(issues):
        raise LittleDataToAnalyze
    issues = encode_series(issues)
    filtered_classes = filter_classes(issues, areas_of_testing, resolution)
    # TODO: remove unnecessary resolution verification
    # when settings will be linked to data
    missing_resolutions = compare_resolutions(issues, resolution)
    if missing_resolutions:
        raise ResolutionElementsMissed(
            f"Oops! These Resolution elements are missed: {missing_resolutions}. Models can't be trained."
        )
    filtered_elements = (set(resolution).union(
        set(areas_of_testing))).difference(
            set(filtered_classes.get("Resolution")).union(
                set(filtered_classes.get("areas_of_testing"))))
    if filtered_elements and filtered_elements != {"Other"}:
        raise SmallNumberRepresentatives(
            f"Oops! Too little number of class representatives for: {filtered_elements}. Models can't be trained."
        )
    svm_imb = SVC(gamma=2, C=1, probability=True, class_weight="balanced")
    sw = get_stop_words(issues)
    tfidf = StemmedTfidfVectorizer(stop_words=sw)
    try:
        with concurrent.futures.ProcessPoolExecutor() as executor:
            futures = [
                executor.submit(
                    train_imbalance,
                    description,
                    classes,
                    tfidf,
                    smote,
                    chi2,
                    50,
                    svm_imb,
                    model_name,
                )
                for description, classes, smote, model_name in _params_producer()
            ]
            # Collect each (model, params) pair in ONE pass so models[i]
            # and params[i] always come from the same future.  The old code
            # iterated as_completed() twice over a set, so the two lists
            # matched up only by accident of identical iteration order.
            models = []
            params = []
            for future in concurrent.futures.as_completed(futures):
                model, param = future.result()
                models.append(model)
                params.append(param)
    except ValueError:
        raise InconsistentGivenData
    save_models(user=instance, models=models)
    filtered_classes["Time to Resolve"] = stringify_ttr_intervals(
        filtered_classes["Time to Resolve"])
    filtered_classes["binary"] = [0, 1]
    save_training_parameters(user=instance,
                             classes=filtered_classes,
                             params=params)
    resolutions = [
        "Resolution_" + resol for resol in filtered_classes["Resolution"]
    ]
    save_top_terms(
        user=instance,
        issues=issues,
        resolutions=resolutions,
        priorities=filtered_classes["Priority"],
        areas_of_testing=filtered_classes["areas_of_testing"],
    )
def train(instance: Model, df: pd.DataFrame, areas_of_testing: list,
          resolution: list) -> dict:
    """Train models.

    Parameters
    ----------
    instance:
        Instance of User model.
    df:
        Bug reports.
    areas_of_testing:
        Areas of testing.
    resolution:
        Resolution.

    Returns
    -------
    dict
        Valid classes.
        NOTE(review): no return statement is visible in this body despite
        the annotation and docstring — confirm whether `filtered_classes`
        was meant to be returned.

    Raises
    ------
    LittleDataToAnalyze
        If there are too few bug reports to train on.
    ResolutionElementsMissed
        If requested resolution elements are absent from the data.
    InconsistentGivenData
        If training fails on inconsistent input data.
    """

    def _params_producer() -> tuple:
        """Generate parameters for imbalance training.

        Yields
        ------
        Bug descriptions, class codes, SMOTE instance and model name.
        """
        for metric in filtered_classes.keys():
            if metric in ("Priority", "Time to Resolve"):
                filtered_df = df[df[metric].isin(filtered_classes[metric])]
                smt = SMOTE(
                    ratio="minority",
                    random_state=0,
                    kind="borderline1",
                    n_jobs=4,
                )
                codes_column = metric.lower() + "_codes"
                # BUGFIX: k_neighbors was computed on the unfiltered frame
                # (df) while classes_codes came from filtered_df; the
                # sibling implementation derives both from filtered_df.
                smt.k_neighbors = get_k_neighbors(filtered_df[codes_column])
                classes_codes = filtered_df[codes_column]
                model_name = metric.split("_")[0]
                yield filtered_df.Description_tr, classes_codes, smt, model_name
            else:
                for class_ in filtered_classes[metric]:
                    df_index = ("Resolution_" + class_
                                if metric == "Resolution" else class_)
                    smt = SMOTE(
                        ratio="minority",
                        random_state=0,
                        kind="borderline1",
                        n_jobs=4,
                    )
                    smt.k_neighbors = get_k_neighbors(df[df_index])
                    yield df.Description_tr, df[df_index], smt, class_

    # `df["Resolved"].isna() is not True` compared a Series to the True
    # singleton with `is` — always truthy, a no-op — and notnull() already
    # excludes missing values, so the dead condition is dropped.
    df = df[(df["Resolution"] != "Unresolved") & (df["Resolved"].notnull())]
    df = df.reset_index()
    if not check_bugs_count(df):
        raise LittleDataToAnalyze
    df = encode_series(df)
    filtered_classes = filter_classes(df, areas_of_testing, resolution)
    # TODO: remove unnecessary resolution verification
    # when settings will be linked to data
    missing_resolutions = compare_resolutions(df, resolution)
    if missing_resolutions:
        raise ResolutionElementsMissed(
            f"Oops! These Resolution elements are missed: {missing_resolutions}. Models can't be trained."
        )
    filtered_resolutions = set(resolution).difference(
        set(filtered_classes.get("Resolution")))
    if filtered_resolutions:
        raise ResolutionElementsMissed(
            f"Oops! These Resolution elements are missed: {filtered_resolutions}. Models can't be trained."
        )
    svm_imb = SVC(gamma=2, C=1, probability=True, class_weight="balanced")
    sw = get_stop_words(df)
    tfidf = StemmedTfidfVectorizer(stop_words=sw)
    try:
        with Pool() as pool:
            async_results = [
                pool.apply_async(
                    train_imbalance,
                    args=(
                        description,
                        classes,
                        tfidf,
                        smote,
                        chi2,
                        50,
                        svm_imb,
                        model_name,
                    ),
                )
                for description, classes, smote, model_name in _params_producer()
            ]
            models = [result.get() for result in async_results]
    except ValueError:
        raise InconsistentGivenData
    save_models(models, instance)
    filtered_classes["Time to Resolve"] = stringify_ttr_intervals(
        filtered_classes["Time to Resolve"])
    filtered_classes["binary"] = [0, 1]
    save_training_parameters(get_models_dir(instance), filtered_classes)
    resolutions = [
        "Resolution_" + resol for resol in filtered_classes["Resolution"]
    ]
    save_top_terms(
        get_models_dir(instance),
        df,
        resolutions,
        filtered_classes["Priority"],
        filtered_classes["areas_of_testing"],
    )