def xtree(train_df, test_df):
    """Build a decision tree on the training data and patch the test data.

    List inputs (and, for the test set, a single string) are first turned
    into dataframes with ``list2dataframe``.  A ``pyC45`` decision tree is
    fitted on the training frame and passed, together with both frames, to
    ``Patches``.

    :param train_df: training data; a dataframe or a list accepted by
        ``list2dataframe``.
    :param test_df: test data; a dataframe, a list, or a single string.
    :return: whatever ``Patches(...).main()`` returns.
    """
    if isinstance(train_df, list):
        # Create a pandas dataframe of the training data.
        train_df = list2dataframe(train_df)

    # Kept as two independent checks, exactly like the original.
    if isinstance(test_df, list):
        test_df = list2dataframe(test_df)
    if isinstance(test_df, basestring):
        # A lone string is wrapped in a list before conversion.
        test_df = list2dataframe([test_df])

    # train_df = SMOTE(train_df, atleast=1000, atmost=1001)
    planner = Patches(train=None,
                      test=None,
                      trainDF=train_df,
                      testDF=test_df,
                      tree=pyC45.dtree(train_df))  # the decision tree
    return planner.main()
def transfer_lessons4():
    """Compare local and bellwether XTREE patches on every Apache project.

    Generator.  Projects for which ``proj in paths.bellw`` holds are
    skipped.  For each remaining project one dict
    ``{proj[:6]: [<pred_stats values...>, impact_local, impact_bellw]}``
    is yielded.
    """
    data = DefectData.get_all_projects()["Apache"]
    for proj, paths in data.iteritems():
        if proj in paths.bellw:
            continue

        key = proj[:6]
        res = {key: []}
        scores = res[key]

        # If training data.dat doesn't exist, create it.  The bellwether
        # frame is built on the first qualifying project and reused after.
        if "train_bellw" not in locals():
            train_bellw = list2dataframe(data[paths.bellw].data)

        train_local = list2dataframe(paths.data[:-1])
        test = list2dataframe(paths.data[-1])

        # NOTE(review): ``xtree.execute`` presumes ``xtree`` is an object or
        # module exposing ``execute`` -- confirm against the imports.
        patched_local = xtree.execute(train_local, test)
        patched_bellw = xtree.execute(train_bellw, test)

        # How good are the patches?
        pred_local, _ = rforest(train_bellw, patched_local)
        pred_bellw, _ = rforest(train_bellw, patched_bellw)
        # How good are the predictions?
        pred_plain, distr_plain = rforest(train_bellw, test)

        scores.extend(pred_stats(before=test[test.columns[-1]],
                                 after=pred_plain,
                                 distr=distr_plain))
        scores.append(impact(test, pred_local))
        scores.append(impact(test, pred_bellw))
        yield res
def transfer_lessons3():
    """Evaluate bellwether-trained XTREE patches on a split of each project.

    Generator.  Projects for which ``proj in paths.bellw`` holds are
    skipped.  For every other project the first part returned by
    ``train_test_split(..., test_size=0.8)`` is patched and scored, and a
    dict ``{proj[:6]: [...]}`` holding the ``pred_stats`` values followed by
    the ``impact`` values is yielded.
    """
    data = DefectData.get_all_projects()["Apache"]
    for proj, paths in data.iteritems():
        if proj in paths.bellw:
            continue

        key = proj[:6]
        res = {key: []}

        # If training data.dat doesn't exist, create it (built once, on the
        # first qualifying project, then reused).
        if "train" not in locals():
            train = list2dataframe(data[paths.bellw].data)

        test, _unused_split = train_test_split(list2dataframe(paths.data),
                                               test_size=0.8)
        # test = list2dataframe(paths.data.dat[-1])
        # validation = list2dataframe(paths.data.dat[:-1])

        # NOTE(review): ``xtree.execute`` presumes ``xtree`` is an object or
        # module exposing ``execute`` -- confirm against the imports.
        patched = xtree.execute(train, test)

        patch_pred, patch_distr = rforest(train, patched)  # patch quality
        plain_pred, plain_distr = rforest(train, test)  # prediction quality

        # The scorers receive one-element lists, as in the original.
        pred, pred2 = [patch_pred], [plain_pred]
        distr, distr2 = [patch_distr], [plain_distr]

        res[key].extend(pred_stats(before=test[test.columns[-1]],
                                   after=pred2,
                                   distr=distr2))
        res[key].extend(impact(test, pred))
        yield res
def planning():
    """Print, per project, how often XTREE's recommendations were heeded.

    For every train/test/validation split of a project, XTREE recommends
    changes for the test set.  Rows of the validation set whose
    ``category`` is 0 are then grouped by ``Name``.  For each module the
    fraction of metric columns whose validation value matches the
    recommendation is recorded:

    * inside the recommended range when the recommendation evaluates to a
      sortable sequence, or
    * equal to the recommended value otherwise.

    The 25th/50th/75th percentiles of those fractions are printed next to
    the first five characters of the project name.  Nothing is returned.
    """
    data = get_all_projects(features="processed")
    results = dict()
    for proj, paths in data.iteritems():
        results.update({proj: []})
        for train, test, validation in TrainTestValidate.split(paths.data):
            # Convert to pandas type dataframe
            train = list2dataframe(train)
            test = list2dataframe(test)
            validation = list2dataframe(validation)

            # Recommend changes with XTREE (first training column dropped).
            new = xtree(train[train.columns[1:]], test)

            # Have the changes been implemented?
            # Create a smaller dframe of all closed issues in validation set
            closed_in_validation = validation[
                validation['category'].isin([0])]

            # Group the smaller dframe and the patched dframe by file name
            modules = list(set(closed_in_validation["Name"].tolist()))

            heeded = []
            for module_name in modules:
                count = []
                module_name_new = new[new["Name"].isin([module_name])]
                module_name_val = closed_in_validation[
                    closed_in_validation["Name"].isin([module_name])]
                for col_name in module_name_val.columns[1:-1]:
                    aa = module_name_new[col_name]
                    bb = module_name_val[col_name]
                    try:
                        # NOTE(review): eval() runs whatever text the patched
                        # frame holds; only safe while that frame comes from
                        # our own planner.
                        ranges = sorted(eval(aa.values.tolist()[0]))
                        count.append(
                            any([
                                abs(ranges[0]) <= bbb <= abs(ranges[1])
                                for bbb in bb.tolist()
                            ]))
                    except TypeError:
                        # Recommendation is not a range: compare for equality.
                        count.append(
                            any([bbb == aa.values[0] for bbb in bb.tolist()]))
                    except IndexError:
                        # No recommendation (or a malformed range): skip.
                        pass
                if count:
                    # float() keeps this a true fraction even where "/" is
                    # Python 2 integer division (else it collapses to 0/1).
                    heeded.append(float(sum(count)) / len(count))

            results[proj] = heeded
            percentiles = np.percentile(results[proj], [25, 50, 75])
            print("{}\t{:0.2f}\t{:0.2f}\t{:0.2f}".format(proj[:5],
                                                         percentiles[0],
                                                         percentiles[1],
                                                         percentiles[2]))
            # Find the deltas between patched and smaller validation dframe
def changes(data=None):
    """Count the per-metric changes proposed by each planner.

    Generator.  For every project that is not part of its bellwether set,
    the bellwether data is split two ways with ``CrossValidation.split``.
    On each fold the Alves, Shatnawi, Oliveira and XTREE planners (XTREE
    with bellwether and with local training data) patch the project's last
    data set, and ``deltas_count`` of their change logs is recorded.

    :param data: project mapping; defaults to the "Apache" entry of
        ``DefectData.get_all_projects()``.
    :return: yields ``{proj[:6]: {planner_name: [deltas_count, ...]}}`` once
        per project, after all folds.
    """
    if data is None:
        data = DefectData.get_all_projects()["Apache"]

    for proj, paths in data.iteritems():
        # Make sure we don't test on the bellwether dataset
        if proj in paths.bellw:
            continue

        key = proj[:6]
        res = {
            key: {
                "xtree_local": [],
                "xtree_bellw": [],
                "alves": [],
                "olive": [],
                "shatw": []
            }
        }
        tally = res[key]

        bellw = list2dataframe(data[paths.bellw].data)
        test = list2dataframe(paths.data)
        test_local = list2dataframe(paths.data[-1])
        train_local = list2dataframe(paths.data[:-1])

        for train_bellw, validation in CrossValidation.split(bellw, ways=2):
            # Rows of the full test frame whose last column is positive
            # (built here but not consumed further in this function).
            all_rows = (test.iloc[n] for n in xrange(test.shape[0]))
            orig = DataFrame(
                [row.values.tolist() for row in all_rows if row[-1] > 0],
                columns=test.columns)

            patched_alves, changes_alves = alves(train_bellw, test_local)
            patched_shatw, changes_shatw = shatnawi(train_bellw, test_local)
            patched_olive, changes_olive = oliveira(train_bellw, test_local)
            patched_xtree, changes_xtree = xtree(train_bellw, test_local)
            patched_xtree_local, changes_xtree_local = xtree(train_local,
                                                             test_local)

            # How good are the patches from local lessons?
            recorded = (
                ("alves", changes_alves),
                ("olive", changes_olive),
                ("shatw", changes_shatw),
                ("xtree_bellw", changes_xtree),
                ("xtree_local", changes_xtree_local),
            )
            for planner, log in recorded:
                tally[planner].append(deltas_count(test.columns, log))

        yield res
def xgboost_grid_tuned(train, target):
    """Grid-search a gradient boosting classifier and score ``target``.

    ``train`` goes through ``list2dataframe`` (used as-is if that raises
    ``IOError``) and is then oversampled with ``SMOTE``.  Rows whose last
    column equals 0 have it rewritten to ``False`` before fitting.

    :param train: training data; last column is the class.
    :param target: dataframe to score; last column is excluded from the
        features.
    :return: ``(preds, distr)`` -- predicted labels and the second column of
        ``predict_proba``.
    """
    try:
        source = list2dataframe(train)
    except IOError:
        source = train
    source = SMOTE(source)

    # Tune with grid search
    param_grid = {
        "n_estimators": [80],  # , 40, 20],
        "learning_rate": [0.1],
        # "max_depth": [4, 6],
        # "min_samples_leaf": [3, 5, 9, 17],
        # "max_features": [1.0, 0.3, 0.1]
    }

    class_col = source.columns[-1]
    source.loc[source[class_col] == 0, class_col] = False
    feature_cols = source.columns[:-1]
    labels = list(source[class_col])

    search = GridSearchCV(GradientBoostingClassifier(), param_grid)
    clf = search.fit(source[feature_cols], labels)

    preds = clf.predict(target[target.columns[:-1]])
    distr = clf.predict_proba(target[target.columns[:-1]])[:, 1]
    return preds, distr
def rforest_grid_tuned(train, target):
    """Grid-search a random forest classifier and score ``target``.

    ``train`` goes through ``list2dataframe`` (used as-is if that raises
    ``IOError``) and is then oversampled with ``SMOTE``.  Rows whose last
    column equals 0 have it rewritten to ``False`` before fitting.

    :param train: training data; last column is the class.
    :param target: dataframe to score; last column is excluded from the
        features.
    :return: ``(preds, distr)`` -- predicted labels and the second column of
        ``predict_proba``.
    """
    base_forest = RandomForestClassifier(n_estimators=800,
                                         max_depth=6,
                                         min_samples_leaf=6,
                                         max_features=0.33)
    try:
        source = list2dataframe(train)
    except IOError:
        source = train
    source = SMOTE(source)

    # use a full grid over all parameters
    param_grid = {
        "max_depth": [3, None],
        "max_features": [1, 3, 10],
        "min_samples_split": [1, 3, 10],
        "min_samples_leaf": [1, 3, 10],
        "bootstrap": [True, False],
        "criterion": ["gini", "entropy"]
    }

    class_col = source.columns[-1]
    source.loc[source[class_col] == 0, class_col] = False
    feature_cols = source.columns[:-1]
    labels = list(source[class_col])

    clf = GridSearchCV(base_forest, param_grid).fit(source[feature_cols],
                                                    labels)

    preds = clf.predict(target[target.columns[:-1]])
    distr = clf.predict_proba(target[target.columns[:-1]])[:, 1]
    return preds, distr
def transfer_lessons(data=None):
    """Score local versus bellwether XTREE patches with ``xgboost``.

    Generator.  For each project outside its bellwether set, the bellwether
    data is split five ways.  On every fold XTREE patches are produced from
    local and from bellwether training data, and the fold's validation part
    trains the ``xgboost`` oracle that judges the patches and the unpatched
    test set.

    :param data: project mapping; defaults to the "Apache" entry of
        ``DefectData.get_all_projects()``.
    :return: yields ``{proj[:6]: {"pd", "pf", "local", "bellw"}}`` once per
        project; each value is a list with one entry per fold.
    """
    if data is None:
        data = DefectData.get_all_projects()["Apache"]

    for proj, paths in data.iteritems():
        # Make sure we don't test on the bellwether dataset
        if proj in paths.bellw:
            continue

        key = proj[:6]
        res = {key: {"pd": [], "pf": [], "local": [], "bellw": []}}
        board = res[key]

        bellw = list2dataframe(data[paths.bellw].data)
        for train_bellw, validation in CrossValidation.split(bellw, ways=5):
            train_local = list2dataframe(paths.data[:-1])
            test = list2dataframe(paths.data[-1])

            patched_local = xtree(train_local, test)
            patched_bellw = xtree(train_bellw, list2dataframe(paths.data))

            # How good are the patches from local lessons?
            pred_local = xgboost(validation, patched_local)[0]
            # How good are the patches from the bellwether lessons?
            pred_bellw = xgboost(validation, patched_bellw)[0]
            # How good are the predictions
            pred_qual, distr_qual = xgboost(validation, test)

            stats = pred_stats(before=test[test.columns[-1]],
                               after=pred_qual,
                               distr=distr_qual)

            board["pd"].append(stats[0])
            board["pf"].append(stats[1])
            board["local"].append(impact(test, pred_local))
            board["bellw"].append(impact(test, pred_bellw))

        yield res
def test_oracles(data=None):
    """Compare the random-forest and gradient-boosting oracles per project.

    Generator.  For each project outside its bellwether set, both ``rforest``
    and ``xgboost`` are trained on all but the last data set and scored on
    the last one.  The first two ``pred_stats`` values of each oracle are
    recorded.

    The debugger breakpoint that used to sit inside the loop has been
    removed so the generator can run unattended.

    :param data: project mapping; defaults to the "Apache" entry of
        ``DefectData.get_all_projects()``.
    :return: yields ``{proj[:6]: {"pd_rf", "pf_rf", "pd_xg", "pf_xg"}}`` once
        per project, each value a one-element list.
    """
    if data is None:
        data = DefectData.get_all_projects()["Apache"]

    for proj, paths in data.iteritems():
        # Make sure we don't test on the bellwether dataset
        if proj in paths.bellw:
            continue

        key = proj[:6]
        res = {
            key: {
                "pd_rf": [],
                "pf_rf": [],
                "pd_xg": [],
                "pf_xg": []
            }
        }

        validate = list2dataframe(paths.data[:-1])
        test = list2dataframe(paths.data[-1])

        # How good are the predictions
        pred1, distr1 = rforest(validate, test)
        pred2, distr2 = xgboost(validate, test)

        pred_rf = pred_stats(before=test[test.columns[-1]],
                             after=pred1,
                             distr=distr1)
        pred_xg = pred_stats(before=test[test.columns[-1]],
                             after=pred2,
                             distr=distr2)

        res[key]["pd_rf"].append(pred_rf[0])
        res[key]["pf_rf"].append(pred_rf[1])
        res[key]["pd_xg"].append(pred_xg[0])
        res[key]["pf_xg"].append(pred_xg[1])
        yield res
def transfer_lessons2(n_folds=1):
    """Print a per-project report for XTREE patches learned from a split.

    A header row is printed first.  For each project outside its bellwether
    set, the first four characters of its name are printed, XTREE is run
    with the training part of a ``train_test_split`` of the project's data,
    and ``pred_stats`` and ``impact`` are called with ``rforest``
    predictions.  Their return values are discarded.

    :param n_folds: accepted for compatibility; not used by the body.
    """
    data = DefectData.get_all_projects()["Apache"]
    print("Name\tPd\tPf\tImprovement")

    for proj, paths in data.iteritems():
        if proj in paths.bellw:
            continue

        print(proj[:4], end="\t")

        # If training data.dat doesn't exist, create it.
        train, _ignored = train_test_split(list2dataframe(paths.data),
                                           test_size=0.8)
        raw_test = paths.data[-1]
        validation = paths.data[:-1]

        # NOTE(review): ``xtree.execute`` presumes ``xtree`` is an object or
        # module exposing ``execute`` -- confirm against the imports.
        patched = xtree.execute(train, raw_test)
        test = list2dataframe(raw_test)

        patch_pred, _ = rforest(validation, patched)  # patch quality
        plain_pred, plain_distr = rforest(validation, test)  # predictions

        pred_stats(before=test[test.columns[-1]],
                   after=plain_pred,
                   distr=plain_distr)
        impact(test, patch_pred)
def rforest(train, target):
    """Fit a 100-tree random forest on ``train`` and score ``target``.

    ``train`` goes through ``list2dataframe`` (used as-is if that raises
    ``IOError``) and is then oversampled with ``SMOTE``.  Rows whose last
    column equals 0 have it rewritten to ``False`` before fitting.

    :param train: training data; last column is the class.
    :param target: dataframe to score; last column is excluded from the
        features.
    :return: ``(preds, distr)`` -- predicted labels and the second column of
        ``predict_proba``.
    """
    try:
        source = list2dataframe(train)
    except IOError:
        source = train
    source = SMOTE(source)

    class_col = source.columns[-1]
    source.loc[source[class_col] == 0, class_col] = False
    feature_cols = source.columns[:-1]
    labels = list(source[class_col])

    forest = RandomForestClassifier(n_estimators=100, random_state=1)
    forest.fit(source[feature_cols], labels)

    preds = forest.predict(target[target.columns[:-1]])
    distr = forest.predict_proba(target[target.columns[:-1]])[:, 1]
    return preds, distr
def xgboost(train, target):
    """Fit a gradient boosting classifier and threshold its probabilities.

    ``train`` goes through ``list2dataframe`` (used as-is if that raises
    ``IOError``).  Rows whose last column equals 0 have it rewritten to
    ``False`` before fitting.  A row of ``target`` is predicted as 1 when
    its probability in the second ``predict_proba`` column exceeds 0.77,
    otherwise 0.

    :param train: training data; last column is the class.
    :param target: dataframe to score; last column is excluded from the
        features.
    :return: ``(preds, distr)`` -- list of 0/1 labels and the probability
        column they were derived from.
    """
    try:
        source = list2dataframe(train)
    except IOError:
        source = train
    # source = SMOTE(source)

    # subsample is a fraction of samples; 1.0 is the value the former
    # ``subsample=True`` coerced to, now spelled with the proper type.
    clf = GradientBoostingClassifier(n_estimators=80,
                                     max_depth=6,
                                     min_samples_leaf=6,
                                     learning_rate=0.085,
                                     subsample=1.0,
                                     max_features=0.33)

    class_col = source.columns[-1]
    source.loc[source[class_col] == 0, class_col] = False
    features = source.columns[:-1]
    klass = list(source[class_col])
    clf.fit(source[features], klass)

    # Labels come only from the thresholded probabilities; the earlier
    # clf.predict() result was always overwritten, so it is not computed.
    distr = clf.predict_proba(target[target.columns[:-1]])[:, 1]
    preds = [1 if val > 0.77 else 0 for val in distr]
    return preds, distr
def run_experiment():
    """Print, per project, the median number of changes each planner makes.

    Metric names come from the columns of the last "ant" data set.  For each
    result yielded by ``changes`` the project key is printed, followed by one
    tab-separated row per metric: row number, metric name without its first
    character, and the integer-truncated medians for local XTREE, bellwether
    XTREE, Oliveira, Alves and Shatnawi.

    The trailing debugger breakpoint has been removed so the experiment can
    run unattended.
    """
    data = DefectData.get_all_projects()["Apache"]
    metrics = list2dataframe(data["ant"].data[-1]).columns

    for res in changes(data):
        for key, value in res.iteritems():
            print(key)
            # Median over the recorded runs, one value per metric.
            rows = zip(metrics,
                       np.median(value["xtree_local"], axis=0),
                       np.median(value["xtree_bellw"], axis=0),
                       np.median(value["olive"], axis=0),
                       np.median(value["alves"], axis=0),
                       np.median(value["shatw"], axis=0))
            for n, (attr, x_local, x_bellw, olive, alves_, shatw) in \
                    enumerate(rows):
                print(n, attr[1:], int(x_local), int(x_bellw), int(olive),
                      int(alves_), int(shatw), sep="\t")
def oliveira(train, test):
    """Threshold-based planner driven by a compliance-rate penalty.

    For every column of ``train`` a ``(p, k)`` pair is chosen that minimises
    ``penalty_1 + penalty_2`` over 100 evenly spaced thresholds ``k`` between
    the column's minimum and maximum.  The chosen pairs are handed to
    ``apply3`` to rewrite every test row whose last column is positive (or
    ``True``).  Other rows are copied through with probability
    ``rand() > 0.7``.

    :param train: training data; dataframe or a list accepted by
        ``list2dataframe``.
    :param test: test data; dataframe, list, or a single string.
    :return: ``(patched, changes)`` -- a dataframe of the kept rows with the
        columns of ``test``, and one ``Changes.log`` per patched row.
    """

    # Helper Functions
    def compliance_rate(k, train_columns):
        # float() keeps this a true ratio even where "/" is Python 2 integer
        # division (which would truncate it to 0 or 1).
        return \
            len([t for t in train_columns if t <= k]) / float(
                len(train_columns))

    def penalty_1(compliance, Min):
        comply = Min - compliance
        if comply >= 0:
            return (Min - comply) / Min
        return 0

    def penalty_2(k, Med):
        if k > Med:
            return (k - Med) / Med
        return 0

    # Compute Thresholds
    if isinstance(test, list):
        test = list2dataframe(test)

    if isinstance(test, basestring):
        test = list2dataframe([test])

    if isinstance(train, list):
        train = list2dataframe(train)

    lo, hi = train.min(), train.max()
    quantile_array = get_percentiles(train)
    changes = []

    pk_best = dict()
    for metric in train.columns:
        min_comply = 10e32
        med = quantile_array[90][metric]

        # The penalty depends only on k, so compute it once per threshold
        # rather than once for each of the ten p values.
        scored = []
        for k in np.linspace(lo[metric], hi[metric], 100):
            compliance = compliance_rate(k, train[metric])
            scored.append(
                (k, penalty_1(compliance, Min=0.9) + penalty_2(k, med)))

        for p in np.arange(0, 100, 10):
            for k, comply_rate_penalty in scored:
                # "<=" means the last (p, k) reaching the minimum wins.
                if comply_rate_penalty <= min_comply:
                    min_comply = comply_rate_penalty
                    pk_best[metric] = (p, k)

    # Apply Plans Sequentially
    modified = []
    for n in xrange(test.shape[0]):
        C = Changes()
        if test.iloc[n][-1] > 0 or test.iloc[n][-1] == True:
            # Fresh lists are taken for the planner input and for the "old"
            # values, in case apply3 modifies its argument.
            new_row = apply3(test.iloc[n].values.tolist(), test.columns,
                             pk_best)
            for name, new, old in zip(test.columns, new_row,
                                      test.iloc[n].values.tolist()):
                C.save(name, new=new, old=old)

            changes.append(C.log)
            modified.append(new_row)

        # Disable the next two line if you're measuring the number of changes.
        else:
            if rand() > 0.7:
                modified.append(test.iloc[n].tolist())

    return pd.DataFrame(modified, columns=test.columns), changes
featureTot = 0 information_gain = [] for i in range(0, len(nz[0])): if (i != 0 and nz[0][i] != pre): for notappear in range(pre + 1, nz[0][i]): information_gain.append(0) ig = _calIg() information_gain.append(ig) pre = nz[0][i] classCnt = {} featureTot = 0 featureTot = featureTot + 1 yclass = y[nz[1][i]] if yclass not in classCnt: classCnt[yclass] = 1 else: classCnt[yclass] = classCnt[yclass] + 1 ig = _calIg() information_gain.append(ig) return np.asarray(information_gain) if __name__ == "__main__": data = DefectData.get_all_projects()["Apache"] test_data = list2dataframe(data["ant"].data) indep_var = test_data[test_data.columns[:-1]] depen_var = test_data[test_data.columns[-1]] information_gain(indep_var.values, depen_var.values) set_trace()
deltas_magnitude(orig, patched_alves)) res[proj[:6]]["olive"].append( deltas_magnitude(orig, patched_olive)) res[proj[:6]]["shatw"].append( deltas_magnitude(orig, patched_shatw)) res[proj[:6]]["xtree_bellw"].append( deltas_magnitude(orig, patched_xtree)) res[proj[:6]]["xtree_local"].append( deltas_magnitude(orig, patched_xtree_local)) yield res if __name__ == "__main__": data = DefectData.get_all_projects()["Apache"] metrics = list2dataframe(data["ant"].data[-1]).columns for res in changes(data): for key, value in res.iteritems(): set_trace() print(key) for n, (attr, xtree_local, xtree_bellw, Olive, Alves, Shatw) in \ enumerate( zip(metrics, np.median(value["xtree_local"], axis=0), np.median(value["xtree_bellw"], axis=0), np.median(value["olive"], axis=0), np.median(value["alves"], axis=0), np.median(value["shatw"], axis=0))): print(n, attr[1:], xtree_local,
def transfer_lessons(data=None):
    """Score the patches of five planners with the ``xgboost`` oracle.

    Generator.  Only the projects "ant", "ivy", "poi" and "jedit" are
    processed.  The bellwether data is split five ways.  On each fold the
    Alves, Shatnawi, Oliveira and XTREE planners (XTREE with bellwether and
    with local training data) patch the project's last data set.  The fold's
    validation part trains ``xgboost`` to predict on each patched set, and
    ``impact`` of every prediction is recorded.

    :param data: project mapping; defaults to the "Apache" entry of
        ``DefectData.get_all_projects()``.
    :return: yields ``{proj[:6]: {planner_name: [impact, ...]}}`` once per
        processed project, after all folds.
    """
    if data is None:
        data = DefectData.get_all_projects()["Apache"]

    for proj, paths in data.iteritems():
        # Make sure we don't test on the bellwether dataset
        if proj not in ["ant", "ivy", "poi", "jedit"]:
            continue

        key = proj[:6]
        res = {
            key: {
                "xtree_local": [],
                "xtree_bellw": [],
                "alves": [],
                "olive": [],
                "shatw": []
            }
        }
        board = res[key]

        bellw = list2dataframe(data[paths.bellw].data)
        test = list2dataframe(paths.data)
        test_local = list2dataframe(paths.data[-1])
        train_local = list2dataframe(paths.data[:-1])

        for train_bellw, validation in CrossValidation.split(bellw, ways=5):
            # NOTE(review): each planner result is passed straight to
            # xgboost as the target frame -- confirm the planners imported
            # here return a bare dataframe.
            # The list literal runs the planners in the original order.
            patched = [
                ("alves", alves(train_bellw, test_local)),
                ("shatw", shatnawi(train_bellw, test_local)),
                ("olive", oliveira(train_bellw, test_local)),
                ("xtree_bellw", xtree(train_bellw, test_local)),
                ("xtree_local", xtree(train_local, test_local)),
            ]

            # How good are the patches from each planner?  All oracle calls
            # finish before any impact is computed, as before.
            predicted = []
            for planner, frame in patched:
                pred, _distr = xgboost(validation, frame)
                predicted.append((planner, pred))

            for planner, pred in predicted:
                board[planner].append(impact(test, pred))

            # Not yet...
            # # How good are the patches from the bellwether lessons?
            # pred_fontana, distr_fontana = xgboost(validation, patched_xtree)
            #
            #
            # res[proj[:6]]["fontana"].append(pred[1])

        yield res
def shatnawi(train, test):
    """Threshold-based planner using logistic regression and VARL.

    A logistic regression is fitted on all but the last column of ``train``
    against the last column.  For each feature whose ``f_classif`` p-value
    is below 0.05, the ``VARL`` value of its coefficient becomes that
    feature's threshold; other features keep the placeholder ``-1``.  The
    thresholds are given to ``apply2`` to rewrite every test row whose last
    column is positive (or ``True``).  Other rows are copied through with
    probability ``rand() > 0.7``.

    :param train: training data; dataframe or a list accepted by
        ``list2dataframe``.
    :param test: test data; dataframe, list, or a single string.
    :return: ``(patched, changed)`` -- a dataframe of the kept rows with the
        columns of ``test``, and one ``Changes.log`` per patched row.
    """
    # Compute Thresholds
    if isinstance(test, list):
        test = list2dataframe(test)

    if isinstance(test, basestring):
        test = list2dataframe([test])

    if isinstance(train, list):
        train = list2dataframe(train)

    changed = []
    # Column names without their first character.  (The loop variable used
    # to be called ``str``, which shadowed the builtin.)
    metrics = [col_name[1:] for col_name in train[train.columns[:-1]]]
    ubr = LogisticRegression()  # Init LogisticRegressor
    X = train[train.columns[:-1]]  # Independent Features (CK-Metrics)
    y = train[train.columns[-1]]  # Dependent Feature (Bugs)
    ubr.fit(X, y.values.tolist())  # Fit Logit curve
    inter = ubr.intercept_[0]  # Intercepts
    coef = ubr.coef_[0]  # Slopes
    pVal = f_classif(X, y)[1]  # P-Values
    changes = len(metrics) * [-1]

    # Find Thresholds using VARL
    for idx, (Coeff, P_Val) in enumerate(zip(coef, pVal)):
        thresh = VARL(Coeff, inter, p0=0.065)  # VARL evaluated at p0=0.065
        if P_Val < 0.05:
            changes[idx] = thresh

    # Apply Plans Sequentially
    modified = []
    for n in xrange(test.shape[0]):
        C = Changes()
        if test.iloc[n][-1] > 0 or test.iloc[n][-1] == True:
            # Fresh lists are taken for the planner input and for the "old"
            # values, in case apply2 modifies its argument.
            new_row = apply2(changes, test.iloc[n].values.tolist())
            for name, new, old in zip(test.columns, new_row,
                                      test.iloc[n].values.tolist()):
                C.save(name, new=new, old=old)
            changed.append(C.log)
            modified.append(new_row)

        # Disable the next two line if you're measuring the number of changes.
        else:
            if rand() > 0.7:
                modified.append(test.iloc[n].tolist())

    return pd.DataFrame(modified, columns=test.columns), changed
def alves(train, test):
    """Threshold-based planner using LOC-weighted metric distributions.

    Every feature column is weighted with ``_ent_weight`` (scaled by the
    total ``$loc``) and normalised by its column sum.  For each feature a
    cut point is taken where the running sum of the normalised weights first
    exceeds 0.95; the cutoff derived from it is given to ``apply2`` to
    rewrite every test row whose last column is positive (or ``True``).
    Other rows are copied through with probability ``rand() > 0.7``.

    :param train: training data with a ``$loc`` column; dataframe or a list
        accepted by ``list2dataframe``.
    :param test: test data; dataframe, list, or a single string.
    :return: ``(patched, changes)`` -- a dataframe of the kept rows with the
        columns of ``test``, and one ``Changes.log`` per patched row.
    """
    if isinstance(test, list):
        test = list2dataframe(test)

    if isinstance(test, basestring):
        test = list2dataframe([test])

    if isinstance(train, list):
        train = list2dataframe(train)

    # Column names without their first character.
    metrics = [met[1:] for met in train[train.columns[:-1]]]
    X = train[train.columns[:-1]]  # Independent Features (CK-Metrics)
    changes = []

    # As weight we will consider the source lines of code (LOC) of the
    # entity.
    tot_loc = train.sum()["$loc"]
    X = _ent_weight(X, scale=tot_loc)

    # Divide the entity weight by the sum of all weights of the same system.
    denom = pd.DataFrame(X).sum().values
    norm_sum = pd.DataFrame(pd.DataFrame(X).values / denom, columns=metrics)

    def point(values):
        """Index of the first entry whose preceding sum exceeds 0.95."""
        running = 0
        for idx, val in enumerate(values):
            # ``running`` is the sum of values[:idx], as in the original
            # prefix-sum list, but built in one pass.
            if running > 0.95:
                return idx
            running += val
        # No prefix exceeded 0.95: use the last entry rather than returning
        # None, which would be used as an array index below.
        return len(values) - 1

    # Find Thresholds
    cutoff = []
    loc = train["$loc"].values
    for idx in xrange(len(train.columns[:-1])):
        # Setup Cumulative Dist. Func.
        name = metrics[idx]
        vals = norm_sum[name].values
        sorted_ids = np.argsort(vals)
        # NOTE(review): the running sum walks ``vals`` in file order while
        # the cut index is applied to the sorted ids -- behaviour kept as
        # is; confirm this is intended.
        cutpoint = point(vals)
        pick = sorted_ids[cutpoint]
        cutoff.append(vals[pick] * tot_loc / loc[pick] * denom[idx])

    # Apply Plans Sequentially
    modified = []
    for n in xrange(test.shape[0]):
        C = Changes()
        # "== True" (not "is True") to match the sibling planners and to
        # work for non-builtin boolean values.
        if test.iloc[n][-1] > 0 or test.iloc[n][-1] == True:
            # Fresh lists are taken for the planner input and for the "old"
            # values, in case apply2 modifies its argument.
            new_row = apply2(cutoff, test.iloc[n].values.tolist())
            for name, new, old in zip(test.columns, new_row,
                                      test.iloc[n].values.tolist()):
                C.save(name, new=new, old=old)
            changes.append(C.log)
            modified.append(new_row)

        # Disable the next two line if you're measuring the number of changes.
        else:
            if rand() > 0.7:
                modified.append(test.iloc[n].tolist())

    return pd.DataFrame(modified, columns=test.columns), changes