def retrieve_labelled_instances(dataset, refactoring: LowLevelRefactoring):
    """
    This method retrieves all the labelled instances for a given refactoring and dataset.
    It performs the following pipeline:
      1. Get all refactored and non-refactored instances from the db.
      2. Merge them into a single dataset, having 1=true and 0=false as labels.
      3. Remove possible NAs (the data collection process is tough; bad data might have made it through).
      4. Shuffle the dataset (good practice).
      5. Balance the dataset (if configured).
      6. Scale the feature values (if configured).
      7. Perform feature reduction (if configured).

    :param dataset: a string containing the name of the dataset to be retrieved
    :param refactoring: the refactoring object, containing the refactoring to be retrieved
    :return:
        features: an array with the features of the instances
        x: a dataframe with the feature values
        y: the label (1=true, a refactoring has happened, 0=false, no refactoring has happened)
        scaler: the scaler object used in the scaling process.
    """
    # get all refactoring examples we have in our dataset
    refactored_instances = refactoring.get_refactored_instances(dataset)
    # load non-refactoring examples
    non_refactored_instances = refactoring.get_non_refactored_instances(dataset)

    log("raw number of refactoring instances: {}".format(refactored_instances.shape[0]))
    log("raw number of non-refactoring instances: {}".format(non_refactored_instances.shape[0]))

    # if there's still a row with NAs, drop it, as it will cause a failure later on.
    refactored_instances = refactored_instances.dropna()
    non_refactored_instances = non_refactored_instances.dropna()

    log("refactoring instances (after dropping NAs): {}".format(refactored_instances.shape[0]))
    log("non-refactoring instances (after dropping NAs): {}".format(non_refactored_instances.shape[0]))

    assert refactored_instances.shape[0] > 0, "No refactorings found"

    # set the prediction variable as true and false in the datasets
    refactored_instances["prediction"] = 1
    non_refactored_instances["prediction"] = 0

    # if it's a test run, we reduce the sample randomly
    if TEST:
        refactored_instances = refactored_instances.sample(frac=0.1)
        non_refactored_instances = non_refactored_instances.sample(frac=0.1)

    # now, combine both datasets (with both TRUE and FALSE predictions)
    assert non_refactored_instances.shape[1] == refactored_instances.shape[1], \
        "number of columns differs between the two datasets"
    merged_dataset = pd.concat([refactored_instances, non_refactored_instances])

    # separate the x from the y (as required by the scikit-learn API)
    x = merged_dataset.drop("prediction", axis=1)
    y = merged_dataset["prediction"]

    # class-level refactoring is the only one with process and ownership metrics
    if USE_PROCESS_AND_AUTHORSHIP_METRICS and not refactoring.refactoring_level() == 'class':
        x = x.drop(["authorOwnership", "bugFixCount", "linesAdded", "linesDeleted",
                    "qtyMajorAuthors", "qtyMinorAuthors", "qtyOfAuthors", "qtyOfCommits",
                    "refactoringsInvolved"], axis=1)

    # the number of default fields and methods is always 0,
    # so remove these columns from the data
    x = x.drop(["classNumberOfDefaultFields", "classNumberOfDefaultMethods"], axis=1)

    # balance the dataset, as we have way more non-refactored examples than refactored examples;
    # for now, we basically perform under-sampling
    if BALANCE_DATASET:
        log("instances before balancing: {}".format(Counter(y)))
        x, y = perform_balancing(x, y)
        assert x.shape[0] == y.shape[0], "Balancing did not work, x and y have different shapes."
        log("instances after balancing: {}".format(Counter(y)))

    # apply some scaling to speed up the algorithm
    scaler = None
    if SCALE_DATASET:
        x, scaler = perform_scaling(x)

    # let's reduce the number of features in the set
    if FEATURE_REDUCTION:
        x = perform_feature_reduction(x, y)

    return x.columns.values, x, y, scaler

def retrieve_ordered_labelled_instances(dataset, refactoring: LowLevelRefactoring):
    """
    This method retrieves all the labelled instances for a given refactoring and dataset.
    It performs the same pipeline as above, but the training data always precedes the test data:
    the test set is the last ORDERED_DATA_TEST_SPLIT fraction of the (ordered) data.

    :param dataset: a string containing the name of the dataset to be retrieved
    :param refactoring: the refactoring object, containing the refactoring to be retrieved
    :return:
        features: an array with the features of the instances
        x_train: a dataframe with the feature values
        y_train: the label (1=true, a refactoring has happened, 0=false, no refactoring has happened)
        x_test: same, but for test
        y_test: same, but for test
        scaler: the scaler object used in the scaling process.
    """
    # get all refactoring examples we have in our dataset
    refactored_instances = refactoring.get_refactored_instances(dataset)
    # load non-refactoring examples
    non_refactored_instances = refactoring.get_non_refactored_instances(dataset)

    log("raw number of refactoring instances: {}".format(refactored_instances.shape[0]))
    log("raw number of non-refactoring instances: {}".format(non_refactored_instances.shape[0]))

    # if there's still a row with NAs, drop it, as it will cause a failure later on.
    refactored_instances = refactored_instances.dropna()
    non_refactored_instances = non_refactored_instances.dropna()

    log("refactoring instances (after dropping NAs): {}".format(refactored_instances.shape[0]))
    log("non-refactoring instances (after dropping NAs): {}".format(non_refactored_instances.shape[0]))

    assert refactored_instances.shape[0] > 0, "No refactorings found"

    # set the prediction variable as true and false in the datasets
    refactored_instances["prediction"] = 1
    non_refactored_instances["prediction"] = 0

    # if it's a test run, we reduce the sample randomly
    if TEST:
        refactored_instances = refactored_instances.sample(frac=0.1)
        non_refactored_instances = non_refactored_instances.sample(frac=0.1)

    # now, combine both datasets (with both TRUE and FALSE predictions)
    assert non_refactored_instances.shape[1] == refactored_instances.shape[1], \
        "number of columns differs between the two datasets"

    # class-level refactoring is the only one with process and ownership metrics
    if USE_PROCESS_AND_AUTHORSHIP_METRICS and not refactoring.refactoring_level() == 'class':
        process_and_authorship_columns = ["authorOwnership", "bugFixCount", "linesAdded",
                                          "linesDeleted", "qtyMajorAuthors", "qtyMinorAuthors",
                                          "qtyOfAuthors", "qtyOfCommits", "refactoringsInvolved"]
        refactored_instances = refactored_instances.drop(process_and_authorship_columns, axis=1)
        non_refactored_instances = non_refactored_instances.drop(process_and_authorship_columns, axis=1)

    # the number of default fields and methods is always 0,
    # so remove these columns from the data
    refactored_instances = refactored_instances.drop(
        ["classNumberOfDefaultFields", "classNumberOfDefaultMethods"], axis=1)
    non_refactored_instances = non_refactored_instances.drop(
        ["classNumberOfDefaultFields", "classNumberOfDefaultMethods"], axis=1)

    # split both refactored and non-refactored instances into train and test;
    # note: no shuffling, so as to keep the ordering we get from the database
    r_x = refactored_instances.drop("prediction", axis=1)
    r_y = refactored_instances["prediction"]

    # apply some scaling to speed up the algorithm
    scaler = None
    if SCALE_DATASET:
        r_x, scaler = perform_scaling(r_x)

    split_line = int((1.0 - ORDERED_DATA_TEST_SPLIT) * len(r_x))
    r_x_train = r_x.iloc[:split_line]
    r_x_test = r_x.iloc[split_line:]
    r_y_train = r_y.iloc[:split_line]
    r_y_test = r_y.iloc[split_line:]

    # now for the non-refactored data
    nr_x = non_refactored_instances.drop("prediction", axis=1)
    nr_y = non_refactored_instances["prediction"]

    if SCALE_DATASET:
        nr_x = pd.DataFrame(scaler.transform(nr_x), columns=nr_x.columns)

    split_line_rn = int((1.0 - ORDERED_DATA_TEST_SPLIT) * len(nr_x))
    nr_x_train = nr_x.iloc[:split_line_rn]
    nr_x_test = nr_x.iloc[split_line_rn:]
    nr_y_train = nr_y.iloc[:split_line_rn]
    nr_y_test = nr_y.iloc[split_line_rn:]

    # combine the refactoring and non-refactoring data
    merged_x_train = pd.concat([r_x_train, nr_x_train])
    merged_y_train = pd.concat([r_y_train, nr_y_train])
    merged_x_test = pd.concat([r_x_test, nr_x_test])
    merged_y_test = pd.concat([r_y_test, nr_y_test])

    # balance the dataset, as we have way more non-refactored examples than refactored examples;
    # for now, we basically perform under-sampling
    if BALANCE_DATASET:
        log("train instances before balancing: {}".format(Counter(merged_y_train)))
        merged_x_train, merged_y_train = perform_balancing(merged_x_train, merged_y_train)
        assert merged_x_train.shape[0] == merged_y_train.shape[0], \
            "Undersampling did not work, x and y have different shapes."
        log("train instances after balancing: {}".format(Counter(merged_y_train)))

    # for the test set, we always apply random under-sampling
    log("test instances before balancing: {}".format(Counter(merged_y_test)))
    merged_x_test, merged_y_test = perform_balancing(merged_x_test, merged_y_test, "random")
    assert merged_x_test.shape[0] == merged_y_test.shape[0], \
        "Balancing did not work, x and y have different shapes."
    log("test instances after balancing: {}".format(Counter(merged_y_test)))

    # TODO: let's reduce the number of features in the set
    # if FEATURE_REDUCTION:
    #     x = perform_feature_reduction(x, y)

    return r_x.columns.values, merged_x_train, merged_y_train, merged_x_test, merged_y_test, scaler

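# Illustrative sketch (hypothetical helper, not used above): the ordered split keeps the first
# (1 - ORDERED_DATA_TEST_SPLIT) fraction of rows for training and the remaining rows for testing,
# without shuffling, so training data always precedes test data.
def _sketch_ordered_split(frame, test_ratio):
    # rows are assumed to be ordered as they come from the database (e.g. by commit date)
    split_line = int((1.0 - test_ratio) * len(frame))
    return frame.iloc[:split_line], frame.iloc[split_line:]
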
def check_model_performance(refactoring_level, counts_function, get_refactored_function,
                            get_non_refactored_function):
    log("Starting cross model analysis at " + refactoring_level)

    counts = counts_function("")

    for d1 in DATASETS:  # d1 being the model we load
        for d2 in DATASETS:  # d2 being the dataset we'll try to predict
            if d1 == d2 or d1 == '' or d2 == '':
                continue

            for refactoring_name in counts["refactoring"].values:
                refactored_instances = get_refactored_function(refactoring_name, d2)
                non_refactored_instances = get_non_refactored_function(d2)

                # if there's still a row with NAs, drop it, as it will cause a failure later on.
                refactored_instances = refactored_instances.dropna()
                non_refactored_instances = non_refactored_instances.dropna()

                # set the prediction variable as true and false in the datasets
                refactored_instances["prediction"] = 1
                non_refactored_instances["prediction"] = 0

                merged_dataset = pd.concat([refactored_instances, non_refactored_instances])

                # shuffle the array
                # (not really necessary, though, as this dataset is entirely for test)
                merged_dataset = shuffle(merged_dataset)

                # separate the x from the y (as required by the scikit-learn API)
                x = merged_dataset.drop("prediction", axis=1)
                y = merged_dataset["prediction"]

                # drop process and ownership metrics, if not class level
                if not refactoring_level == 'class-level':
                    x = x.drop(["authorOwnership", "bugFixCount", "linesAdded", "linesDeleted",
                                "qtyMajorAuthors", "qtyMinorAuthors", "qtyOfAuthors",
                                "qtyOfCommits", "refactoringsInvolved"], axis=1)

                # drop 'default fields' and 'default methods' as
                # they were not properly collected during the collection phase
                x = x.drop(["classNumberOfDefaultFields", "classNumberOfDefaultMethods"], axis=1)

                # balance the datasets
                balanced_x, balanced_y = perform_balancing(x, y)
                log("instances after balancing: {}".format(Counter(balanced_y)))

                for model_name in MODELS:
                    try:
                        log("Refactoring %s, model %s, dataset 1 %s, dataset 2 %s" %
                            (refactoring_name, model_name, d1, d2))

                        # scale it (as in the training of the model),
                        # using the scaler that was generated at training time
                        scaler = load_scaler("models", model_name, d1, refactoring_name)
                        balanced_x_2 = scaler.transform(balanced_x)

                        model_under_eval = load_model("models", model_name, d1, refactoring_name)

                        if model_name == 'deep-learning':
                            y_predicted = model_under_eval.predict_classes(balanced_x_2)
                        else:
                            y_predicted = model_under_eval.predict(balanced_x_2)

                        results = metrics.classification_report(balanced_y, y_predicted,
                                                                output_dict=True)
                        log(results)

                        log("CSV," + d1 + "," + d2 + "," + refactoring_name + "," + model_name + ","
                            + str(results["macro avg"]["precision"]) + ","
                            + str(results["macro avg"]["recall"]))

                        # TODO: log more info, like the entire confusion matrix

                    except Exception as e:
                        log("An error occurred while working on refactoring " + refactoring_name +
                            " model " + model_name)
                        log(e)
                        log(traceback.format_exc())

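# Illustrative sketch (an assumption, not the project's actual persistence code): load_model and
# load_scaler are assumed to deserialize objects saved with joblib during training, under a
# hypothetical "<directory>/<model>-<dataset>-<refactoring>-<suffix>.joblib" naming scheme.
def _sketch_load_joblib_artifact(directory, model_name, dataset, refactoring_name, suffix):
    import os
    from joblib import load
    file_name = "%s-%s-%s-%s.joblib" % (model_name, dataset, refactoring_name, suffix)
    return load(os.path.join(directory, file_name))
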
def retrieve_labelled_instances(datasets: Iterable[str],
                                refactoring: LowLevelRefactoring,
                                is_training_data: bool = True,
                                scaler=None):
    """
    This method retrieves all the labelled instances for a given refactoring and dataset.
    It performs the following pipeline:
      1. Get all refactored and non-refactored instances from the db.
      2. Merge them into a single dataset, having 1=true and 0=false as labels.
      3. Remove possible NAs (the data collection process is tough; bad data might have made it through).
      4. Shuffle the dataset (good practice).
      5. Balance the dataset (if configured).
      6. Scale the feature values (if configured).

    :param datasets: an iterable with the names of the datasets to be retrieved
    :param refactoring: the refactoring object, containing the refactoring to be retrieved
    :param is_training_data: whether this is training data (sample reduction is only applied to training data)
    :param scaler: a predefined scaler for this data
    :return:
        x: a dataframe with the feature values
        y: the label (1=true, a refactoring has happened, 0=false, no refactoring has happened)
        scaler: the scaler object used in the scaling process.
    """
    log(f"---- Retrieve labeled instances for datasets: {datasets} "
        f"and refactoring {refactoring.name()}")

    # get all refactoring examples we have in our dataset
    refactored_instances = refactoring.get_refactored_instances(datasets)
    # load non-refactoring examples
    non_refactored_instances = refactoring.get_non_refactored_instances(datasets)

    log(f"raw number of refactoring instances: {refactored_instances.shape[0]}")
    log(f"raw number of non-refactoring instances with K={refactoring.commit_threshold()}: "
        f"{non_refactored_instances.shape[0]}")

    # if there's still a row with NAs, drop it, as it will cause a failure later on.
    refactored_instances = refactored_instances.dropna()
    non_refactored_instances = non_refactored_instances.dropna()

    # test if any refactorings were found for the given refactoring type
    if refactored_instances.shape[0] == 0:
        log(f"No refactorings found for refactoring type: {refactoring.name()}")
        return None, None, None
    if non_refactored_instances.shape[0] == 0:
        log(f"No non-refactorings found for threshold: {refactoring.commit_threshold()}")
        return None, None, None

    log("refactoring instances (after dropping NAs): {}".format(refactored_instances.shape[0]))
    log("non-refactoring instances (after dropping NAs): {}".format(non_refactored_instances.shape[0]))

    assert non_refactored_instances.shape[0] > 0, \
        "Found no non-refactoring instances for level: " + refactoring.level()

    # set the prediction variable as true and false in the datasets
    refactored_instances["prediction"] = 1
    non_refactored_instances["prediction"] = 0

    # reduce the amount of training samples, if specified, while keeping the specified class balance
    if is_training_data and 0 < TRAINING_SAMPLE_RATIO < 1 and not BALANCE_DATASET:
        refactored_instances, non_refactored_instances = sample_reduction(
            refactored_instances, non_refactored_instances, TRAINING_SAMPLE_RATIO)

    refactored_instances = refactored_instances.drop_duplicates()
    non_refactored_instances = non_refactored_instances.drop_duplicates()
    log("refactoring instances (after dropping duplicates): {}".format(refactored_instances.shape[0]))
    log("non-refactoring instances (after dropping duplicates): {}".format(non_refactored_instances.shape[0]))

    # now, combine both datasets (with both TRUE and FALSE predictions)
    if non_refactored_instances.shape[1] != refactored_instances.shape[1]:
        raise ImportError("Number of columns differ from both datasets.")

    merged_dataset = pd.concat([refactored_instances, non_refactored_instances])

    # do we want to try the models without some metrics, e.g. process and authorship metrics?
    merged_dataset = merged_dataset.drop(DROP_METRICS, axis=1)

    # remove all instances with a -1 value in the process and authorship metrics
    # TODO: do this after the feature reduction, to simplify the query and to avoid dropping
    # instances that are only affected by faulty process and authorship metrics
    # that are not in the feature set
    if DROP_FAULTY_PROCESS_AND_AUTHORSHIP_METRICS and not DROP_PROCESS_AND_AUTHORSHIP_METRICS:
        log("Instance count before dropping faulty process metrics: {}".format(len(merged_dataset.index)))
        metrics = [metric for metric in PROCESS_AND_AUTHORSHIP_METRICS
                   if metric in merged_dataset.columns.values]
        query = " and ".join(["%s != -1" % metric for metric in metrics])
        merged_dataset = merged_dataset.query(query)
        log("Instance count after dropping faulty process metrics: {}".format(len(merged_dataset.index)))

    # separate the x from the y (as required by the scikit-learn API)
    y = merged_dataset["prediction"]
    x = merged_dataset.drop("prediction", axis=1)

    # balance the dataset, as we have way more non-refactored examples than refactored examples;
    # for now, we basically perform under-sampling
    if BALANCE_DATASET:
        log("instances before balancing: {}".format(Counter(y)))
        x, y = perform_balancing(x, y)
        assert x.shape[0] == y.shape[0], "Balancing did not work, x and y have different shapes."
        log("instances after balancing: {}".format(Counter(y)))
        # shuffle the data after balancing it, because some of the samplers order the data while balancing

    # apply some scaling to speed up the algorithm
    if SCALE_DATASET and scaler is None:
        x, scaler = perform_fit_scaling(x)
    elif SCALE_DATASET and scaler is not None:
        x = perform_scaling(x, scaler)

    log(f"Got {x.shape[0]} instances with {x.shape[1]} features for the datasets: {datasets} "
        f"at threshold {refactoring.commit_threshold()}.")

    return x, y, scaler

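# Illustrative sketch: how the "-1 means faulty" filter above behaves on a tiny, made-up frame.
# Only rows where every listed process metric differs from -1 survive the pandas query.
def _sketch_drop_faulty_process_metrics():
    import pandas as pd
    frame = pd.DataFrame({"qtyOfCommits": [3, -1, 7],
                          "bugFixCount": [0, 2, -1],
                          "prediction": [1, 0, 1]})
    metrics_present = [m for m in ["qtyOfCommits", "bugFixCount"] if m in frame.columns.values]
    query = " and ".join(["%s != -1" % m for m in metrics_present])
    return frame.query(query)  # keeps only the first row
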
def retrieve_labelled_instances(dataset,
                                refactoring: LowLevelRefactoring,
                                is_training_data: bool = True,
                                scaler=None,
                                allowed_features=None):
    """
    Same pipeline as the variants above, but the data can additionally be restricted to a
    predefined feature set (allowed_features) and scaled with an already-fitted scaler;
    balancing and feature reduction are only applied to training data.

    :return:
        features: an array with the features of the instances
        x: a dataframe with the feature values
        y: the label (1=true, a refactoring has happened, 0=false, no refactoring has happened)
        scaler: the scaler object used in the scaling process.
    """
    log("---- Retrieve labeled instances for dataset: %s" % dataset)

    # get all refactoring examples we have in our dataset
    refactored_instances = refactoring.get_refactored_instances(dataset)
    # load non-refactoring examples
    non_refactored_instances = refactoring.get_non_refactored_instances(dataset)

    log("raw number of refactoring instances: {}".format(refactored_instances.shape[0]), False)
    log("raw number of non-refactoring instances: {}".format(non_refactored_instances.shape[0]), False)

    # if there's still a row with NAs, drop it, as it will cause a failure later on.
    refactored_instances = refactored_instances.dropna()
    non_refactored_instances = non_refactored_instances.dropna()

    # test if any refactorings were found for the given refactoring type
    if refactored_instances.shape[0] == 0:
        log("No refactorings found for refactoring type: " + refactoring.name())
        return None, None, None, None

    # test if any non-refactorings were found for the given refactoring type
    if non_refactored_instances.shape[0] == 0:
        log("No non-refactorings found for refactoring type: " + refactoring.name())
        return None, None, None, None

    log("refactoring instances (after dropping NAs): {}".format(refactored_instances.shape[0]), False)
    log("non-refactoring instances (after dropping NAs): {}".format(non_refactored_instances.shape[0]), False)

    assert non_refactored_instances.shape[0] > 0, \
        "Found no non-refactoring instances for level: " + refactoring.refactoring_level()

    # set the prediction variable as true and false in the datasets
    refactored_instances["prediction"] = 1
    non_refactored_instances["prediction"] = 0

    # if it's a test run, we reduce the sample randomly
    if TEST:
        refactored_instances = refactored_instances.sample(frac=0.1)
        non_refactored_instances = non_refactored_instances.sample(frac=0.1)

    # now, combine both datasets (with both TRUE and FALSE predictions)
    if non_refactored_instances.shape[1] != refactored_instances.shape[1]:
        raise ImportError("Number of columns differ from both datasets.")

    merged_dataset = pd.concat([refactored_instances, non_refactored_instances])

    # just to be sure, shuffle the dataset
    merged_dataset = merged_dataset.sample(frac=1, random_state=42)

    # do we want to try the models without some metrics, e.g. process and authorship metrics?
    merged_dataset = merged_dataset.drop(DROP_METRICS, axis=1)

    # separate the x from the y (as required by the scikit-learn API)
    x = merged_dataset.drop("prediction", axis=1)
    y = merged_dataset["prediction"]

    # balance the dataset, as we have way more non-refactored examples than refactored examples;
    # for now, we basically perform under-sampling
    if is_training_data and BALANCE_DATASET:
        log("instances before balancing: {}".format(Counter(y)), False)
        x, y = perform_balancing(x, y)
        assert x.shape[0] == y.shape[0], "Balancing did not work, x and y have different shapes."
        log("instances after balancing: {}".format(Counter(y)), False)

    # apply some scaling to speed up the algorithm
    if SCALE_DATASET and scaler is None:
        x, scaler = perform_fit_scaling(x)
    elif SCALE_DATASET and scaler is not None:
        x = perform_scaling(x, scaler)

    # let's reduce the number of features in the set
    if is_training_data and FEATURE_REDUCTION and allowed_features is None:
        x = perform_feature_reduction(x, y)
    # enforce the specified feature set
    elif allowed_features is not None:
        drop_list = [column for column in x.columns.values if column not in allowed_features]
        x = x.drop(drop_list, axis=1)
        assert x.shape[1] == len(allowed_features), \
            "Incorrect number of features for dataset " + dataset

    # remove all instances with a -1 value in the process and authorship metrics, after the feature
    # reduction, to simplify the query and to avoid dropping instances that are only affected by
    # faulty process and authorship metrics that are not in the feature set
    if DROP_FAULTY_PROCESS_AND_AUTHORSHIP_METRICS and not DROP_PROCESS_AND_AUTHORSHIP_METRICS:
        log("Instance count before dropping faulty process metrics: {}".format(len(merged_dataset.index)), False)
        metrics = [metric for metric in PROCESS_AND_AUTHORSHIP_METRICS if metric in x.columns.values]
        query = " and ".join(["%s != -1" % metric for metric in metrics])
        merged_dataset = merged_dataset.query(query)
        log("Instance count after dropping faulty process metrics: {}".format(len(merged_dataset.index)), False)

    log("Got %d instances with %d features for the dataset: %s." % (x.shape[0], x.shape[1], dataset))

    return x.columns.values, x, y, scaler
