def qualitative_looking(path_correctId, path_label):
    id_root = load_file(path_file=path_correctId)
    print len(id_root)
    data_label = load_file(path_file=path_label)
    id_label = [d.split("\t")[0] for d in data_label]
    print len(data_label)
    index_id_root = [id_label.index(i) for i in id_root]
    for id_ in id_root:
        path_id = "./qualitative_analysis/cosine_sim/" + id_ + ".txt"
        cosine_data = load_file(path_file=path_id)
        print len(cosine_data)
        cosine_data = map(float, cosine_data)
        order_cosine = sorted(cosine_data, key=float, reverse=True)
        write_data = dict()
        for jid in index_id_root:
            name_id = id_label[jid]
            cosine_score = cosine_data[jid]
            # rank of this commit's cosine score in the descending ordering
            position_ = order_cosine.index(cosine_score)
            write_data[name_id] = position_ + 1
        new_write_data = list()
        for w in sorted(write_data, key=write_data.get):
            print w, write_data[w]
            new_write_data.append(w + "\t" + str(write_data[w]))
        path_write = "./qualitative_analysis/cosine_sim_order/" + id_ + ".txt"
        write_file(path_file=path_write, data=new_write_data)

def loading_testing_data():
    data, paths = list(), list()
    paths.append("./data/test_data/features_markusinfo.txt")
    # paths.append("./data/test_data/features_nicholaskinfo.txt")
    # paths.append("./data/test_data/features_sashainfo.txt")
    for p in paths:
        data_ = load_file(path_file=p)
        data += data_
    ids_, X_ = load_data_ICSE_new(data=data)
    print len(ids_), X_.shape

    data_gt, path_gt = list(), list()
    # path_gt.append("./data/test_data/markus_translated.out")
    # path_gt.append("./data/test_data/nicholask_translated.out")
    path_gt.append("./data/test_data/sasha_translated.out")
    print path_gt
    for p in path_gt:
        p_data = load_file(path_file=p)
        data_gt += p_data
    commits = extract_commit_new(commits=data_gt)
    nfile, nhunk, nline, nleng = 1, 8, 10, 120
    commits_ = get_commits(commits=filtering_commit(commits=commits, num_file=nfile, num_hunk=nhunk,
                                                    num_loc=nline, size_line=nleng), ids=ids_)
    ids_index = [ids_.index(c["id"]) for c in commits_]
    ids_ = [ids_[i] for i in ids_index]
    X_ = X_[ids_index, :]
    y_ = [1 if c["stable"] == "true" else 0 for c in commits_]
    return commits_, ids_, X_, np.array(y_)

def collect_labels(path_data, path_label):
    valid_ids = get_ids([f for f in listdir(path_data) if isfile(join(path_data, f))])
    # load the label file once; each line is "<id>\t<label>"
    data_label = load_file(path_file=path_label)
    ids = [l.split('\t')[0] for l in data_label]
    labels = [l.split('\t')[1] for l in data_label]
    # keep only the ids that actually appear in the label file so both
    # returned lists stay aligned element-by-element
    valid_ids = [v_id for v_id in valid_ids if v_id in ids]
    labels_valid_ids = [labels[ids.index(v_id)] for v_id in valid_ids]
    return valid_ids, labels_valid_ids

def finding_id(path_label, path_root):
    data_label = load_file(path_file=path_label)
    id_label = [d.split("\t")[0] for d in data_label]
    gt_label = [1 if d.split("\t")[1] == "true" else 0 for d in data_label]
    data_pred = load_file(path_file=path_root)
    label_pred = [float(d) for d in data_pred]
    # collect the ids of correctly predicted negative (non-stable) commits
    id_correct = list()
    for i in xrange(len(id_label)):
        if gt_label[i] == label_pred[i] and gt_label[i] == 0:
            id_correct.append(id_label[i])
    return id_correct

def load_probability_score(model, threshold):
    path_file = "./statistical_test_prob_ver3/%s.txt" % model
    if model == "sasha_results":
        y_pred = load_sasha_results_ver2(path_file=path_file, threshold=threshold)
    elif model == "true_label":
        y_pred = load_file(path_file=path_file)
        y_pred = np.array([float(y) for y in y_pred])
    else:
        y_pred = load_file(path_file=path_file)
        y_pred = np.array([float(y) for y in y_pred])
        # binarize probability scores at 0.5
        y_pred[y_pred > 0.5] = 1
        y_pred[y_pred <= 0.5] = 0
    return y_pred

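# A minimal usage sketch of load_probability_score (assumes the score files under
# ./statistical_test_prob_ver3/ exist with one score per line; the threshold
# argument only matters for the "sasha_results" branch):
#
#   y_true = load_probability_score(model="true_label", threshold=0.5)
#   y_patchnet = load_probability_score(model="PatchNet", threshold=0.5)
#   print(accuracy_score(y_true=y_true, y_pred=y_patchnet))
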
def get_all_checkpoints(checkpoint_dir):
    files = load_file(checkpoint_dir + "/checkpoint")
    # skip the header line of the checkpoint file, which names the latest
    # checkpoint rather than listing one
    files = files[1:]
    dirs = []
    for f in files:
        dirs.append(get_checkpoint_directory(directory=f))
    return dirs

def get_commit_satisfy_condition(path_data_, nfile, nhunk, nline, nleng):
    commits_structure = extract_commit_july(path_file=path_data_)
    filter_commits = filtering_commit_union(commits=commits_structure, num_file=nfile, num_hunk=nhunk,
                                            num_loc=nline, size_line=nleng)
    print len(commits_structure), len(filter_commits)

    commits = load_file(path_data_)
    indexes = commits_index(commits=commits)
    new_commits = list()
    for i in xrange(0, len(indexes)):
        if i == len(indexes) - 1:
            id_ = commit_id(commit=commits[indexes[i]:])
            if id_ in filter_commits:
                new_commits += commits[indexes[i]:]
        else:
            id_ = commit_id(commit=commits[indexes[i]:indexes[i + 1]])
            if id_ in filter_commits:
                new_commits += commits[indexes[i]:indexes[i + 1]]
        print i, id_
    write_file(path_data_ + ".satisfy", new_commits)

def load_sasha_results(path_file, threshold):
    y_pred = load_file(path_file=path_file)
    y_pred = [float(y) for y in y_pred]
    # cut-off score: everything strictly above the value found at the top
    # (threshold - 0.05) fraction of the sorted scores is labeled 1
    max_value = sorted(y_pred, reverse=True)[int(len(y_pred) * (threshold - 0.05))]
    y_pred = [1 if y > max_value else 0 for y in y_pred]
    return np.array(y_pred)

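# Worked example of the cut-off logic above (hypothetical scores, threshold=0.25):
# sorted descending -> [0.9, 0.8, 0.6, 0.4, 0.1]; int(5 * (0.25 - 0.05)) = 1, so
# max_value = 0.8 and only scores strictly greater than 0.8 become 1:
#
#   scores = [0.4, 0.9, 0.1, 0.8, 0.6]
#   labels = [0, 1, 0, 0, 0]
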
def draw_prc_recall_curve(y_true, path_file, point):
    data = load_file(path_file=path_file)
    data = np.array([float(y) for y in data])
    prc, rc, threshold = metrics.precision_recall_curve(y_true, data)
    # sample the curve at roughly `point` evenly spaced positions;
    # guard against a zero step when the curve has fewer than `point` entries
    step = max(1, int(len(prc) / point))
    new_prc, new_rc = list(), list()
    for i in xrange(0, len(prc), step):
        new_prc.append(prc[i])
        new_rc.append(rc[i])
    return new_prc[:point], new_rc[:point]

def load_data_ICSE(path):
    data = load_file(path_file=path)
    ids, ftrs, labels = list(), list(), list()
    for d in data:
        # each line is: id, feature_1, ..., feature_n, label
        split_ = d.split(",")
        id_, ftr_ = split_[0], map(int, split_[1:-1])
        label_ = split_[-1]
        ids.append(id_)
        ftrs.append(np.array(ftr_))
        labels.append(label_)
    labels = [1 if v.strip() == "true" else 0 for v in labels]
    return ids, np.array(ftrs), np.array(labels)

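# Input format load_data_ICSE expects, shown on a made-up line (the id and
# feature values are illustrative only):
#
#   0a1b2c3d,0,1,3,0,2,true
#
# which parses to id_="0a1b2c3d", ftr_=[0, 1, 3, 0, 2], and label 1.
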
def balance_data_ICSE(path):
    data = load_file(path_file=path)
    # keep at most 11,166 "true" lines so the two classes are roughly balanced
    new_data, cnt = list(), 0
    for d in data:
        if "true" in d and cnt <= 11165:
            new_data.append(d.strip())
            cnt += 1
        elif "false" in d:
            new_data.append(d.strip())
    shuffle(new_data)
    write_file(path_file="./data/3_mar7/new_features_ver1.txt", data=new_data)
    exit()

def evaluation_metrics(path, labels):
    pred_score = load_file(path_file=path)
    pred_score = np.array([float(score) for score in pred_score])
    # truncate the labels in case fewer predictions than labels were written
    labels = labels[:pred_score.shape[0]]
    y_pred = convert_to_binary(pred_score)
    acc = accuracy_score(y_true=labels, y_pred=y_pred)
    prc = precision_score(y_true=labels, y_pred=y_pred)
    rc = recall_score(y_true=labels, y_pred=y_pred)
    f1 = f1_score(y_true=labels, y_pred=y_pred)
    auc = roc_auc_score(y_true=labels, y_score=pred_score)
    print('Accuracy: %f -- Precision: %f -- Recall: %f -- F1: %f -- AUC: %f' % (acc, prc, rc, f1, auc))

def load_commit_code(path_file):
    code = load_file(path_file=path_file)
    indexes = diff_file_index(code=code)
    diffs = list()
    for i in range(0, len(indexes)):
        entry = {}  # renamed from `dict` to avoid shadowing the builtin
        if i == len(indexes) - 1:
            file_, diff_ = diff_code(code[indexes[i]:])
        else:
            file_, diff_ = diff_code(code[indexes[i]:indexes[i + 1]])
        entry['file'] = file_
        entry['diff'] = diff_
        diffs.append(entry)
    return diffs

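# Shape of the value load_commit_code returns, illustrated with hypothetical
# file names (the real ones come from the diff headers in the input file):
#
#   [{'file': 'drivers/net/foo.c', 'diff': [...lines of that file's hunks...]},
#    {'file': 'include/linux/foo.h', 'diff': [...]}]
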
def cosine_similarity(path_root, id_commit, data):
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(data)
    data_root = load_file(path_file=path_root)
    for id_root in data_root:
        results = list()
        index_ = id_commit.index(id_root)
        X_root = X[index_, :].toarray().flatten()
        for i in xrange(len(id_commit)):
            # scipy returns the cosine *distance*; 1 - distance is the similarity
            results.append(1 - spatial.distance.cosine(X_root, X[i, :].toarray().flatten()))
        write_file(path_file="./qualitative_analysis/cosine_sim/" + id_root + ".txt", data=results)
    return None

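# A minimal self-contained sketch of the same similarity computation
# (the two documents are made up for illustration):
#
#   from sklearn.feature_extraction.text import CountVectorizer
#   from scipy import spatial
#   X = CountVectorizer().fit_transform(["fix null pointer", "fix memory leak"])
#   sim = 1 - spatial.distance.cosine(X[0].toarray().flatten(), X[1].toarray().flatten())
#   # only "fix" is shared between the two documents, so sim is ~0.33
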
def load_df_yasu_data(path_data, path_file):
    data = pd.read_csv(path_data)
    data = replace_value_dataframe(df=data)
    ids, labels, features = get_ids(data=data), get_label(data=data), get_features(data=data)
    indexes, cnt_noexits = list(), 0
    # keep only the commits whose diff file actually exists on disk
    for i in range(0, len(ids)):
        try:
            load_file(path_file=path_file + '/' + ids[i] + '.diff')
            indexes.append(i)
        except FileNotFoundError:
            print('File for commit id does not exist:', ids[i], cnt_noexits)
            cnt_noexits += 1
    ids = [ids[i] for i in indexes]
    labels = [labels[i] for i in indexes]
    features = features[indexes]
    return (ids, np.array(labels), features)

def load_results(id_gt, label_gt, single_file, threshold):
    lines = load_file(single_file)
    patches = dict()
    for l in lines:
        # each line is "<patch id> <score>"
        split_l = l.split()
        patches[split_l[0]] = float(split_l[1])
    # sort (id, score) pairs by score, highest first
    patches = sorted(patches.items(), key=lambda x: x[1], reverse=True)
    acc, prc, rc, f1, auc, prob, true_positive, false_negative = checking_performance_v3(
        id_gt=id_gt, label_gt=label_gt, patches=patches, threshold=threshold)
    return acc, prc, rc, f1, auc, prob, true_positive, false_negative

def restruct_root(roots, path, type):
    # truncate every root to the shortest list so the files align across runs
    min_files = min([len(r) for r in roots])
    new_roots = [r[:min_files] for r in roots]
    print len(new_roots), len(new_roots[0])
    for i in xrange(0, len(new_roots[0])):
        model = list()
        for j in xrange(0, len(new_roots)):
            print path + "/" + new_roots[j][i]
            model += load_file(path + "/" + new_roots[j][i])
        model_name = "model-" + new_roots[j][i].split("-")[-1].replace(".txt", "")
        print type, model_name
        path_write = "./patchNet_mergeResults/%s_%s.txt" % (type, model_name)
        write_file(path_file=path_write, data=model)
    return None

def get_commit_id_and_date(path_data_):
    commits = load_file(path_data_)
    indexes = commits_index(commits=commits)
    dicts = {}
    for i in xrange(0, len(indexes)):
        if i == len(indexes) - 1:
            date = commit_date_july(commit=commits[indexes[i]:])
        else:
            date = commit_date_july(commit=commits[indexes[i]:indexes[i + 1]])
        dicts[i] = int(date)
    # sort the commit positions by date, oldest first
    sort_dicts = sorted(dicts.items(), key=operator.itemgetter(1))
    new_commits = list()
    for d in sort_dicts:
        index, date = d[0], d[1]
        print index, date
        if index == len(sort_dicts) - 1:
            new_commits += commits[indexes[index]:]
        else:
            new_commits += commits[indexes[index]:indexes[index + 1]]
    write_file(path_data_ + ".sorted", new_commits)

def collect_labels_ver2(path_label):
    # load the label file once; each line is "<id>\t<label>"
    data_label = load_file(path_file=path_label)
    ids = [l.split('\t')[0] for l in data_label]
    labels = [l.split('\t')[1] for l in data_label]
    return ids, labels

def checking_performance(id_label, true_label, model_label, model_name):
    # sweep the decision threshold and print precision/recall at each step
    for i in range(5, 90, 1):
        if model_name == "patchNet":
            threshold = 1 - i / float(100)
        elif model_name == "sasha":
            threshold = 0
        threshold_label = [1 if m >= threshold else 0 for m in model_label]
        prc = precision_score(y_true=true_label, y_pred=threshold_label)
        rc = recall_score(y_true=true_label, y_pred=threshold_label)
        print threshold, prc, rc
    exit()


if __name__ == "__main__":
    path_data = "./satisfy_typediff_sorted.out"
    commits_ = extract_commit_july(path_file=path_data)
    print len(commits_), type(commits_)
    commits_id = [c["id"] for c in commits_]
    print len(commits_id)

    path_file = "./statistical_test_prob_ver2/true_label.txt"
    true_label = load_file(path_file=path_file)
    true_label = [float(t) for t in true_label]

    path_file = "./statistical_test_prob_ver2/PatchNet.txt"
    patchNet = load_file(path_file=path_file)
    patchNet = [float(t) for t in patchNet]
    checking_performance(id_label=commits_id, true_label=true_label, model_label=patchNet,
                         model_name="patchNet")

def load_commit_msg(path_file):
    msg = ' '.join(load_file(path_file=path_file))
    new_msg = dict()
    # pull the title and message bodies out of their surrounding tags
    new_msg['title'] = re.compile('<title>(.*?)</title>', re.DOTALL).findall(msg)[0]
    new_msg['desc'] = re.compile('<message>(.*?)</message>', re.DOTALL).findall(msg)[0]
    return new_msg

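# Example of the input format load_commit_msg expects (the commit text is made up):
#
#   <title>fix use-after-free in foo_release</title>
#   <message>The release path freed the buffer twice ...</message>
#
# yields {'title': 'fix use-after-free in foo_release',
#         'desc': 'The release path freed the buffer twice ...'}
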
if __name__ == "__main__":
    path_data = "./data/test_data/merging_markus_sasha.txt"
    commits_ = extract_commit(path_file=path_data)
    nfile, nhunk, nline, nleng = 1, 8, 10, 120
    filter_commits = filtering_commit(commits=commits_, num_file=nfile, num_hunk=nhunk,
                                      num_loc=nline, size_line=nleng)
    ids_ = [c["id"] for c in filter_commits]
    labels_ = [1 if c["stable"] == "true" else 0 for c in filter_commits]

    path_ftr = "./data/test_data/features_merging_markus_sasha.txt"
    ftr = load_file(path_file=path_ftr)
    new_ftr = clean_merging_data(ids=ids_, ftrs=ftr)
    commits_test, ids_test, X_ftr_test, y_test = loading_testing_data(ftr_data=new_ftr,
                                                                      commit_data=filter_commits)
    commits_train, ids_train, X_ftr_train, y_train = loading_training_data()

    # type = "msg"
    # type = "code"
    type = "msg_code"
    print type
    # if type == "msg":
    #     X_msg_train, X_msg_test = create_features_ICSE_new(commits_train=commits_train, ids_train=ids_train,
    #                                                        commits_test=commits_test, ids_test=ids_test, type=type)
    #     X_train = X_msg_train
    #     X_test = X_msg_test

print len(filter_commits), type(filter_commits)
commits_id = [c["id"] for c in commits_]
print len(commits_id)

msgs = extract_msg(commits=filter_commits)
labels = extract_label(commits=filter_commits)
codes = extract_code(commits=filter_commits)
all_lines = add_two_list(list1=msgs, list2=codes)
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(all_lines)
print X.shape

path_good_commits = "./statistical_test_prob_ver3/good_commits.txt"
good_commits = load_file(path_file=path_good_commits)
print "Length of good commits: %s" % (str(len(good_commits)))
write_data = []
for g in good_commits:
    write_data += similarity_good_commit(id=commits_id, root=g, all=X, top_k=50)
path_write = "./statistical_test_prob_ver2/good_commits_results.txt"
write_file(path_file=path_write, data=write_data)

####################################################################################
####################################################################################
path_bad_commits = "./statistical_test_prob_ver2/bad_commits.txt"
from ultis import load_file, extract_commit_new
from baselines import extract_msg, extract_code, add_two_list, extract_label
import numpy as np

if __name__ == "__main__":
    path_test = list()
    path_test.append("./data/test_data/markus_translated.out")
    path_test.append("./data/test_data/nicholask_translated.out")
    path_test.append("./data/test_data/sasha_translated.out")

    path_dict = "./data/3_mar7/newres.simplified.dict"
    dict_index = load_file(path_file=path_dict)
    new_dict = {}
    for d in dict_index:
        # each line is "<index>:<token>"
        split_d = d.strip().split(":")
        new_dict[int(split_d[0])] = split_d[1]

    data = list()
    for p in path_test:
        p_data = load_file(path_file=p)
        data += p_data
    commits_ = extract_commit_new(commits=data)
    msgs = extract_msg(commits=commits_)
    codes = extract_code(commits=commits_)
    all_lines = add_two_list(list1=msgs, list2=codes)
    labels = extract_label(commits=commits_)
    # total number of test commits and how many of them are positive (stable)
    print len(labels), np.count_nonzero(np.array(labels))

def draw_roc_curve(y_true, path_file):
    # y_true is now an explicit parameter instead of relying on a module-level global
    data = load_file(path_file=path_file)
    data = np.array([float(y) for y in data])
    fpr, tpr, threshold = metrics.roc_curve(y_true, data)
    roc_auc = metrics.auc(fpr, tpr)
    return fpr, tpr, roc_auc

def draw_prc_recall_curve(y_true, path_file, point):
    data = load_file(path_file=path_file)
    data = np.array([float(y) for y in data])
    prc, rc, threshold = metrics.precision_recall_curve(y_true, data)
    # sample the curve at roughly `point` evenly spaced positions;
    # guard against a zero step when the curve has fewer than `point` entries
    step = max(1, int(len(prc) / point))
    new_prc, new_rc = list(), list()
    for i in xrange(0, len(prc), step):
        new_prc.append(prc[i])
        new_rc.append(rc[i])
    return new_prc[:point], new_rc[:point]


if __name__ == "__main__":
    path_true = "./statistical_test_prob/true_label.txt"
    y_true = load_file(path_file=path_true)
    y_true = np.array([int(y) for y in y_true])

    path_sasha = "./statistical_test_prob_ver3/sasha_results.txt"
    fpr_sasha, tpr_sasha, roc_auc_sasha = draw_roc_curve(y_true=y_true, path_file=path_sasha)

    path_PatchNet = "./statistical_test_prob_ver3/PatchNet.txt"
    fpr_PatchNet, tpr_PatchNet, roc_auc_PatchNet = draw_roc_curve(y_true=y_true, path_file=path_PatchNet)

    # path_lstm = "./statistical_test_prob/lstm_cnn_all.txt"
    # fpr_lstm, tpr_lstm, roc_auc_lstm = draw_roc_curve(y_true=y_true, path_file=path_lstm)
    #
    # path_cnn = "./statistical_test_prob/cnn_all.txt"
    # fpr_cnn, tpr_cnn, roc_auc_cnn = draw_roc_curve(y_true=y_true, path_file=path_cnn)

def print_false_negative(id, y_pred, threshold, y_true):
    y_pred = [1 if float(y) > threshold else 0 for y in y_pred]
    # collect the ids predicted negative whose ground truth is positive
    false_negative = []
    for i, p, t in zip(id, y_pred, y_true):
        if p == 0 and t == 1:
            false_negative.append(i)
    print len(false_negative)
    path_write = "./sasha_results/false_neg_%s.txt" % (str(threshold))
    write_file(path_file=path_write, data=false_negative)


if __name__ == "__main__":
    path_data = "./newres_funcalls_jul28.out.sorted.satisfy"
    commits_structure = extract_commit_july(path_file=path_data)
    commits_id = [c["id"] for c in commits_structure]

    path_true = "./statistical_test_prob_ver3/true_label.txt"
    y_true = load_file(path_file=path_true)
    y_true = [int(y) for y in y_true]

    path_pred, threshold = "./statistical_test_prob_ver3/sasha_results.txt", 50
    y_pred = load_file(path_file=path_pred)
    print_true_positive(id=commits_id, y_pred=y_pred, threshold=threshold, y_true=y_true)
    print_false_negative(id=commits_id, y_pred=y_pred, threshold=threshold, y_true=y_true)

def mapping_dict(index_, dictionary):
    # function header reconstructed from the call site below; keep only the
    # dictionary entries whose keys appear in index_, rendered as "key: value"
    # lines sorted by key
    new_dict = {}
    for i in index_:
        new_dict[i] = dictionary[i]
    new_list = list()
    for key, value in sorted(new_dict.iteritems()):
        new_list.append(str(key) + ": " + value)
    return new_list


if __name__ == "__main__":
    path_data = "./data/3_mar7/typediff.out"
    commits_ = extract_commit(path_file=path_data)
    msgs = extract_msg(commits=commits_)
    codes = extract_code(commits=commits_)
    all_lines = add_two_list(list1=msgs, list2=codes)
    print len(all_lines), len(commits_), len(msgs), len(codes)

    index = create_dict(all_lines)
    print len(index)

    path_dict = "./data/3_mar7/newres.dict"
    dict_index = load_file(path_file=path_dict)
    new_dict = {}
    for d in dict_index:
        split_d = d.strip().split(":")
        new_dict[int(split_d[0])] = split_d[1]
    print len(new_dict)

    new_dict = mapping_dict(index_=index, dictionary=new_dict)
    path_write = "./data/3_mar7/newres.simplified.dict"
    write_file(path_file=path_write, data=new_dict)

def commit_msg_label(data):
    # function header reconstructed from the call site below; builds a binary
    # bag-of-words matrix with one row per message and one column per word in
    # the commit-message dictionary
    dict_msg = tokenize_commit_msg(data=data)
    labels_ = np.array([1 if w in d.split() else 0 for d in data for w in dict_msg])
    labels_ = np.reshape(labels_, (int(labels_.shape[0] / len(dict_msg)), len(dict_msg)))
    return labels_, dict_msg


if __name__ == '__main__':
    # create padding for commit code
    ##################################################################################
    ##################################################################################
    # path_train_diff = './data/2017_ASE_Jiang/train.26208.diff'
    # data_train_diff = load_Jiang_code_data(pfile=path_train_diff)
    # path_test_diff = './data/2017_ASE_Jiang/test.3000.diff'
    # data_test_diff = load_Jiang_code_data(pfile=path_test_diff)
    # data_diff = data_train_diff + data_test_diff
    # print(len(data_diff))
    # max_line, max_length = 15, 40
    # padding_commit_code(data=data_diff, max_line=max_line, max_length=max_length)

    # create label using the commit message
    ##################################################################################
    ##################################################################################
    path_train_msg = './data/2017_ASE_Jiang/train.26208.msg'
    data_train_msg = load_file(path_file=path_train_msg)
    path_test_msg = './data/2017_ASE_Jiang/test.3000.msg'
    data_test_msg = load_file(path_file=path_test_msg)
    print(len(data_train_msg + data_test_msg))

    data = data_train_msg + data_test_msg
    label, dict_msg = commit_msg_label(data=data)
    print(label.shape, len(dict_msg))

def finding_element(data, indexes):
    new_data = [data[i] for i in indexes]
    return new_data


def evaluation_metrics(root, target):
    print "Accuracy: %f" % (accuracy_score(y_true=root, y_pred=target))
    print "Precision: %f" % (precision_score(y_true=root, y_pred=target))
    print "Recall: %f" % (recall_score(y_true=root, y_pred=target))
    print "F1: %f" % (f1_score(y_true=root, y_pred=target))
    print "AUC: %f" % (auc_score(y_true=root, y_pred=target))


if __name__ == "__main__":
    path_gt = "./data/3_mar7/typediff_test_ver2.out"
    data_gt = load_file(path_gt)
    id_gt, lbl_gt = processing_gt(data=data_gt)

    path_bf = "./data/typediff_bug_and_fix.txt"
    data_bf = load_file(path_bf)
    id_bf, lbl_bf = processing_bug_fix(data=data_bf)

    # align the bug-and-fix labels with the ground-truth ids before scoring
    indexes_ = finding_index(ids_root=id_gt, ids_target=id_bf)
    print len(indexes_)
    id_bf, lbl_bf = finding_element(data=id_bf, indexes=indexes_), finding_element(data=lbl_bf, indexes=indexes_)
    evaluation_metrics(root=lbl_gt, target=lbl_bf)
    # path_write = "./data/typediff_bug_and_fix_ver2.txt"
    # write_file(path_file=path_write, data=lbl_bf)