def loading_data_all(FLAGS):
    """Load every commit from FLAGS.path and turn it into padded tensors.

    No train/test split happens here; the whole filtered dataset is
    returned as (pad_msg, pad_added_code, pad_removed_code, labels).
    """
    raw_commits = extract_commit(path_file=FLAGS.path)
    commits = filtering_commit(commits=raw_commits,
                               num_file=FLAGS.code_file,
                               num_hunk=FLAGS.code_hunk,
                               num_loc=FLAGS.code_line,
                               size_line=FLAGS.code_length)
    msgs = extract_msg(commits=commits)
    codes = extract_code(commits=commits)
    # Vocabularies are built from the raw message / code token streams.
    msg_vocab = dictionary(data=msgs)
    code_vocab = dictionary(data=codes)
    pad_msg = mapping_commit_msg(msgs=msgs,
                                 max_length=FLAGS.msg_length,
                                 dict_msg=msg_vocab)
    pad_added_code = mapping_commit_code(type="added", commits=commits,
                                         max_hunk=FLAGS.code_hunk,
                                         max_code_line=FLAGS.code_line,
                                         max_code_length=FLAGS.code_length,
                                         dict_code=code_vocab)
    pad_removed_code = mapping_commit_code(type="removed", commits=commits,
                                           max_hunk=FLAGS.code_hunk,
                                           max_code_line=FLAGS.code_line,
                                           max_code_length=FLAGS.code_length,
                                           dict_code=code_vocab)
    labels = load_label_commits(commits=commits)
    return pad_msg, pad_added_code, pad_removed_code, labels
def loading_data(path_file):
    """Load commits from *path_file*, filter them, and build merged text input.

    Returns (all_lines, labels) where each entry of all_lines is a commit's
    message list concatenated with its code-line list via add_two_list.
    """
    # BUG FIX: the original called extract_commit(path_file=path_data),
    # reading a module-level global instead of this parameter -- the
    # argument was silently ignored (NameError when run without that global).
    commits_ = extract_commit(path_file=path_file)
    # Hard-coded filter thresholds: max files, hunks, lines-per-hunk,
    # chars-per-line -- the same 1/8/10/120 used elsewhere in this file.
    nfile, nhunk, nline, nleng = 1, 8, 10, 120
    filter_commits = filtering_commit(commits=commits_, num_file=nfile,
                                      num_hunk=nhunk, num_loc=nline,
                                      size_line=nleng)
    msgs = extract_msg(commits=filter_commits)
    labels = extract_label(commits=filter_commits)
    codes = extract_code(commits=filter_commits)
    all_lines = add_two_list(list1=msgs, list2=codes)
    return all_lines, labels
def loading_testing_data(FLAGS, path_file, type):
    """Load and filter testing commits, selecting the text channel by *type*.

    type == "msg"  -> msgs_ holds commit messages only
    type == "all"  -> msgs_ holds messages merged with code (add_two_list)
    type == "code" -> msgs_ holds code lines only
    Any other value prints an error and terminates the process.

    Returns (msgs_, codes_, filter_commits).
    NOTE: the parameter named ``type`` shadows the builtin; it is kept
    unchanged for caller compatibility.
    """
    # Validate up front instead of duplicating the extract/filter pipeline
    # in every branch (the original repeated it three times verbatim).
    if type not in ("msg", "all", "code"):
        print("You need to type correct model")
        exit()
    commits_ = extract_commit(path_file=path_file)
    filter_commits = filtering_commit(commits=commits_,
                                      num_file=FLAGS.code_file,
                                      num_hunk=FLAGS.code_hunk,
                                      num_loc=FLAGS.code_line,
                                      size_line=FLAGS.code_length)
    msgs_ = extract_msg(commits=filter_commits)
    codes_ = extract_code(commits=filter_commits)
    if type == "all":
        # Merge messages and code element-wise into one token list per commit.
        msgs_ = add_two_list(list1=msgs_, list2=codes_)
    elif type == "code":
        msgs_ = codes_
    return msgs_, codes_, filter_commits
def loading_data_lstm(FLAGS):
    """Load commits for the LSTM models and return one fold's test split.

    FLAGS.model selects the text channel; the substring checks keep the
    original priority: "msg" wins over "all" wins over "code".  The data is
    split with KFold, and -- as in the original -- the return statement sits
    AFTER the loop, so only the LAST fold's test split reaches the caller.

    Returns (X_test_msg, X_test_added_code, X_test_removed_code, y_test).
    """
    print(FLAGS.model)
    # Fail fast on an unknown model name; the original duplicated the whole
    # extract/filter/pad pipeline inside each branch.
    if ("msg" not in FLAGS.model and "all" not in FLAGS.model
            and "code" not in FLAGS.model):
        print("You need to type correct model")
        exit()
    commits_ = extract_commit(path_file=FLAGS.path)
    filter_commits = filtering_commit(commits=commits_,
                                      num_file=FLAGS.code_file,
                                      num_hunk=FLAGS.code_hunk,
                                      num_loc=FLAGS.code_line,
                                      size_line=FLAGS.code_length)
    msgs_ = extract_msg(commits=filter_commits)
    codes_ = extract_code(commits=filter_commits)
    if "msg" in FLAGS.model:
        pass  # messages are used as-is
    elif "all" in FLAGS.model:
        msgs_ = add_two_list(list1=msgs_, list2=codes_)
    elif "code" in FLAGS.model:
        msgs_ = codes_
    # Vocabularies are built AFTER msgs_ is (possibly) replaced, matching
    # the per-branch ordering of the original code.
    dict_msg_, dict_code_ = dictionary(data=msgs_), dictionary(data=codes_)
    pad_msg = mapping_commit_msg(msgs=msgs_, max_length=FLAGS.msg_length,
                                 dict_msg=dict_msg_)
    pad_added_code = mapping_commit_code(type="added", commits=filter_commits,
                                         max_hunk=FLAGS.code_hunk,
                                         max_code_line=FLAGS.code_line,
                                         max_code_length=FLAGS.code_length,
                                         dict_code=dict_code_)
    pad_removed_code = mapping_commit_code(type="removed",
                                           commits=filter_commits,
                                           max_hunk=FLAGS.code_hunk,
                                           max_code_line=FLAGS.code_line,
                                           max_code_length=FLAGS.code_length,
                                           dict_code=dict_code_)
    labels = load_label_commits(commits=filter_commits)

    # NOTE(review): recent scikit-learn raises when random_state is given
    # while shuffle is False -- this call assumes an older sklearn; confirm.
    kf = KFold(n_splits=FLAGS.folds, random_state=FLAGS.seed)
    for train_index, test_index in kf.split(filter_commits):
        X_train_msg = np.array(get_items(items=pad_msg, indexes=train_index))
        X_test_msg = np.array(get_items(items=pad_msg, indexes=test_index))
        X_train_added_code = np.array(
            get_items(items=pad_added_code, indexes=train_index))
        X_test_added_code = np.array(
            get_items(items=pad_added_code, indexes=test_index))
        X_train_removed_code = np.array(
            get_items(items=pad_removed_code, indexes=train_index))
        X_test_removed_code = np.array(
            get_items(items=pad_removed_code, indexes=test_index))
        y_train = np.array(get_items(items=labels, indexes=train_index))
        y_test = np.array(get_items(items=labels, indexes=test_index))
    return X_test_msg, X_test_added_code, X_test_removed_code, y_test
def filter_number_code_hunk(commits): commit_id = list() for c in commits: files = c["code"] cnt_hunk = list() for hunk in files: added_hunk, removed_hunk = hunk["added"].keys(), hunk["removed"].keys() cnt_hunk += added_hunk + removed_hunk # if max(cnt_hunk) <= num_hunk: # commit_id.append(c["id"]) print c["id"], max(cnt_hunk) if __name__ == "__main__": path_data = "./data/test_data/merging_markus_sasha.txt" commits_ = extract_commit(path_file=path_data) nfile, nhunk, nline, nleng = 1, 8, 10, 120 filter_commits = filtering_commit(commits=commits_, num_file=nfile, num_hunk=nhunk, num_loc=nline, size_line=nleng) ids_ = [c["id"] for c in filter_commits] labels_ = [c["stable"] for c in filter_commits] path_nonoverlap = "./qualitative_analysis_ver3/nonOverlap_PatchNet_all_LPU_SVM_all" id_overlap = load_file(path_file=path_nonoverlap) new_commits = list() for i in id_overlap: index_i = ids_.index(i) new_commits.append(filter_commits[index_i]) filter_number_code_hunk(commits=new_commits)
path_write = "./data_test_data_pred_results/cnn_" + type + ".txt" write_file(path_file=path_write, data=y_pred) print "Accuracy: ", accuracy_score(y_true=y_test, y_pred=y_pred) print "Precision: ", precision_score(y_true=y_test, y_pred=y_pred) print "Recall: ", recall_score(y_true=y_test, y_pred=y_pred) print "F1: ", f1_score(y_true=y_test, y_pred=y_pred) print "AUC: ", auc_score(y_true=y_test, y_pred=y_pred) if __name__ == "__main__": nfile, nhunk, nline, nleng = 1, 8, 10, 120 path_data = "./data/3_mar7/typediff.out" commits_train = extract_commit(path_file=path_data) filter_commits_train = filtering_commit(commits=commits_train, num_file=nfile, num_hunk=nhunk, num_loc=nline, size_line=nleng) msgs_train = extract_msg(commits=filter_commits_train) labels_train = extract_label(commits=filter_commits_train) codes_train = extract_code(commits=filter_commits_train) all_lines_train = add_two_list(list1=msgs_train, list2=codes_train) # path_test = "./data/test_data/sasha_translated.out" path_test = "./data/test_data/merging_markus_sasha.txt" type = "all" # type = "msg" # type = "code" commits_test = extract_commit(path_file=path_test) filter_commits_test = filtering_commit(commits=commits_test, num_file=nfile, num_hunk=nhunk, num_loc=nline, size_line=nleng)
code_length = 120 # "Max length of code in one line in commits") code_line = 10 # "Max line of code in one hunk in commits") code_hunk = 8 # "Max hunk of code in one file in commits") code_file = 1 # "Max file of code in one in commits") path_train = "./data/3_mar7/typediff.out" # data_train = load_file(path_file=path_train) # train_pad_msg, train_pad_added_code, train_pad_removed_code, train_labels, dict_msg_, dict_code_ = \ # load_commit_train_data(commits=data_train, msg_length_=msg_length, code_length_=code_length, # code_line_=code_line, code_hunk_=code_hunk, code_file_=code_file) # print train_pad_msg.shape, train_pad_added_code.shape, train_pad_removed_code.shape, train_labels.shape commits_train = extract_commit(path_file=path_train) filter_commits_train = filtering_commit(commits=commits_train, num_file=code_file, num_hunk=code_hunk, num_loc=code_line, size_line=code_length) msgs_train, codes_train = extract_msg( commits=filter_commits_train), extract_code( commits=filter_commits_train) dict_msg_train, dict_code_train = dictionary(data=msgs_train), dictionary( data=codes_train) path_test = "./data/test_data/markus_translated.out" # path_test = "./data/test_data/sasha_translated.out" commits_test = extract_commit(path_file=path_test) filter_commits_test = filtering_commit(commits=commits_test, num_file=code_file, num_hunk=code_hunk, num_loc=code_line,
if __name__ == "__main__": tf = model_parameter_evaluation_keras() FLAGS = tf.flags.FLAGS print_params(tf) path_file_model = "./keras_model/" model_name = FLAGS.model # model_name = "lstm_code" model_name = "lstm_all" model = load_model(path_file_model + model_name + ".h5") if "msg" in FLAGS.model: commits_ = extract_commit(path_file=FLAGS.path) filter_commits = filtering_commit(commits=commits_, num_file=FLAGS.code_file, num_hunk=FLAGS.code_hunk, num_loc=FLAGS.code_line, size_line=FLAGS.code_length) msgs_, codes_ = extract_msg(commits=filter_commits), extract_code( commits=filter_commits) elif "all" in FLAGS.model: commits_ = extract_commit(path_file=FLAGS.path) filter_commits = filtering_commit(commits=commits_, num_file=FLAGS.code_file, num_hunk=FLAGS.code_hunk, num_loc=FLAGS.code_line, size_line=FLAGS.code_length) msgs_, codes_ = extract_msg(commits=filter_commits), extract_code( commits=filter_commits) all_lines = add_two_list(list1=msgs_, list2=codes_) msgs_ = all_lines