def create_features_ICSE_new(commits_train, ids_train, commits_test, ids_test, type):
    """Build dense bag-of-words count matrices for a train/test commit split.

    The commits named by ``ids_train`` / ``ids_test`` are picked out of
    ``commits_train`` / ``commits_test`` in id order.  Ids with no matching
    commit are silently skipped, and when several commits share an id the
    first one wins.  A single ``CountVectorizer`` is fitted on the training
    text only and then applied to the test text, so both matrices share the
    training vocabulary.

    :param commits_train: commits to draw training samples from; every item
        is indexed with ``["id"]``.
    :param ids_train: ids selecting (and ordering) the training commits.
    :param commits_test: commits to draw test samples from; every item is
        indexed with ``["id"]``.
    :param ids_test: ids selecting (and ordering) the test commits.
    :param type: text source to vectorize: ``"msg"`` (``extract_msg``),
        ``"code"`` (``extract_code``) or ``"all"`` (both, merged with
        ``add_two_list``).  Any other value prints a message and exits.
        The name shadows the builtin but is part of the public signature.
    :return: ``(X_train, X_test)``, the ``.toarray()`` form of the
        vectorizer output.
    """
    # Fail fast: reject an unknown mode before doing any selection work.
    if type not in ("msg", "code", "all"):
        print("Your type is uncorrect")
        exit()

    def _select(commits, ids):
        # One pass builds an id -> commit index (first occurrence wins), so
        # each lookup is O(1) instead of a scan over all commits per id.
        # Assumes commit ids are hashable (e.g. strings) -- TODO confirm.
        first_by_id = {}
        for commit in commits:
            first_by_id.setdefault(commit["id"], commit)
        return [first_by_id[id_] for id_ in ids if id_ in first_by_id]

    new_commits_train = _select(commits_train, ids_train)
    new_commits_test = _select(commits_test, ids_test)

    if type == "msg":
        text_train = extract_msg(commits=new_commits_train)
        text_test = extract_msg(commits=new_commits_test)
    elif type == "code":
        text_train = extract_code(commits=new_commits_train)
        text_test = extract_code(commits=new_commits_test)
    else:  # "all": message text combined with code text
        msg_train = extract_msg(commits=new_commits_train)
        msg_test = extract_msg(commits=new_commits_test)
        codes_train = extract_code(commits=new_commits_train)
        codes_test = extract_code(commits=new_commits_test)
        text_train = add_two_list(list1=msg_train, list2=codes_train)
        text_test = add_two_list(list1=msg_test, list2=codes_test)

    # Fit on the training text only; the test text reuses that vocabulary.
    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(text_train)
    X_test = vectorizer.transform(text_test)
    return X_train.toarray(), X_test.toarray()
def loading_data_all(FLAGS):
    """Load every commit found at ``FLAGS.path`` and encode it as padded inputs.

    The commits returned by ``extract_commit`` are narrowed with
    ``filtering_commit`` using the ``FLAGS.code_file`` / ``code_hunk`` /
    ``code_line`` / ``code_length`` limits.  Vocabularies are built with
    ``dictionary`` from the extracted messages and code, and the remaining
    commits are mapped through ``mapping_commit_msg`` and
    ``mapping_commit_code``.  No train/test split is performed here.

    :param FLAGS: settings object; reads ``path``, ``code_file``,
        ``code_hunk``, ``code_line``, ``code_length`` and ``msg_length``.
    :return: ``(pad_msg, pad_added_code, pad_removed_code, labels)``.
    """
    raw_commits = extract_commit(path_file=FLAGS.path)
    kept_commits = filtering_commit(
        commits=raw_commits,
        num_file=FLAGS.code_file,
        num_hunk=FLAGS.code_hunk,
        num_loc=FLAGS.code_line,
        size_line=FLAGS.code_length)

    messages = extract_msg(commits=kept_commits)
    code_lines = extract_code(commits=kept_commits)
    msg_vocab = dictionary(data=messages)
    code_vocab = dictionary(data=code_lines)

    pad_msg = mapping_commit_msg(
        msgs=messages, max_length=FLAGS.msg_length, dict_msg=msg_vocab)

    # The "added" and "removed" encodings differ only in the ``type`` argument.
    code_kwargs = dict(
        commits=kept_commits,
        max_hunk=FLAGS.code_hunk,
        max_code_line=FLAGS.code_line,
        max_code_length=FLAGS.code_length,
        dict_code=code_vocab)
    pad_added_code = mapping_commit_code(type="added", **code_kwargs)
    pad_removed_code = mapping_commit_code(type="removed", **code_kwargs)

    labels = load_label_commits(commits=kept_commits)
    return pad_msg, pad_added_code, pad_removed_code, labels
def create_features_ICSE(commits, ids, type):
    """Build a dense bag-of-words count matrix for the selected commits.

    The commits named by ``ids`` are picked out of ``commits`` in id order.
    Ids with no matching commit are silently skipped, and when several
    commits share an id the first one wins.  A fresh ``CountVectorizer`` is
    fitted on the chosen text and the resulting matrix is returned densely.

    :param commits: commits to select from; every item is indexed with
        ``["id"]``.
    :param ids: ids selecting (and ordering) the commits to vectorize.
    :param type: text source to vectorize: ``"msg"`` (``extract_msg``),
        ``"code"`` (``extract_code``) or ``"all"`` (both, merged with
        ``add_two_list``).  Any other value prints a message and exits.
        The name shadows the builtin but is part of the public signature.
    :return: ``X.toarray()`` for the fitted count matrix.
    """
    # Fail fast: reject an unknown mode before doing any selection work.
    if type not in ("msg", "code", "all"):
        print("Your type is uncorrect")
        exit()

    # One pass builds an id -> commit index (first occurrence wins), so each
    # lookup is O(1) instead of a scan over all commits per id.
    # Assumes commit ids are hashable (e.g. strings) -- TODO confirm.
    first_by_id = {}
    for commit in commits:
        first_by_id.setdefault(commit["id"], commit)
    new_commits = [first_by_id[id_] for id_ in ids if id_ in first_by_id]

    if type == "msg":
        text = extract_msg(commits=new_commits)
    elif type == "code":
        text = extract_code(commits=new_commits)
    else:  # "all": message text combined with code text
        msgs = extract_msg(commits=new_commits)
        codes = extract_code(commits=new_commits)
        text = add_two_list(list1=msgs, list2=codes)

    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(text)
    return X.toarray()
def loading_testing_data(FLAGS, path_file, type):
    """Read and filter the commits in ``path_file`` and pick the model text.

    :param FLAGS: settings object; reads ``code_file``, ``code_hunk``,
        ``code_line`` and ``code_length`` as the ``filtering_commit`` limits.
    :param path_file: path handed to ``extract_commit``.
    :param type: ``"msg"`` keeps the extracted messages, ``"all"`` replaces
        them with ``add_two_list(messages, code)``, ``"code"`` replaces them
        with the extracted code.  Any other value prints a message and exits
        before anything is loaded.
    :return: ``(msgs_, codes_, filter_commits)`` where ``msgs_`` is the text
        chosen by ``type`` and ``codes_`` is always the extracted code.
    """
    if type not in ("msg", "all", "code"):
        print("You need to type correct model")
        exit()

    # The loading pipeline is the same for every accepted mode.
    loaded = extract_commit(path_file=path_file)
    filter_commits = filtering_commit(
        commits=loaded,
        num_file=FLAGS.code_file,
        num_hunk=FLAGS.code_hunk,
        num_loc=FLAGS.code_line,
        size_line=FLAGS.code_length)
    msgs_ = extract_msg(commits=filter_commits)
    codes_ = extract_code(commits=filter_commits)

    # Only the first returned element depends on the mode.
    if type == "all":
        msgs_ = add_two_list(list1=msgs_, list2=codes_)
    elif type == "code":
        msgs_ = codes_
    return msgs_, codes_, filter_commits
def load_data_type(path, FLAGS):
    """Load the commits at ``path`` and encode them without any filtering.

    Commits come from ``extract_commit_july``; vocabularies are built with
    ``dictionary`` from the extracted messages and code.  The commit count
    and both vocabulary sizes are printed on one line.

    :param path: path handed to ``extract_commit_july``.
    :param FLAGS: settings object; reads ``msg_length``, ``code_hunk``,
        ``code_line`` and ``code_length``.
    :return: ``(pad_msg, pad_added_code, pad_removed_code, labels,
        dict_msg_, dict_code_)``.
    """
    commits_ = extract_commit_july(path_file=path)
    messages = extract_msg(commits=commits_)
    code_lines = extract_code(commits=commits_)
    dict_msg_ = dictionary(data=messages)
    dict_code_ = dictionary(data=code_lines)

    # Single-argument form: prints the three counts separated by spaces.
    print("%s %s %s" % (len(commits_), len(dict_msg_), len(dict_code_)))

    pad_msg = mapping_commit_msg(
        msgs=messages, max_length=FLAGS.msg_length, dict_msg=dict_msg_)

    # The "added" and "removed" encodings differ only in the ``type`` argument.
    code_kwargs = dict(
        commits=commits_,
        max_hunk=FLAGS.code_hunk,
        max_code_line=FLAGS.code_line,
        max_code_length=FLAGS.code_length,
        dict_code=dict_code_)
    pad_added_code = mapping_commit_code(type="added", **code_kwargs)
    pad_removed_code = mapping_commit_code(type="removed", **code_kwargs)

    labels = load_label_commits(commits=commits_)
    return (pad_msg, pad_added_code, pad_removed_code, labels,
            dict_msg_, dict_code_)
def loading_data_lstm(FLAGS):
    """Load, filter and encode the commits, then return one fold's test split.

    ``FLAGS.model`` is printed and then matched by substring, in this order:
    ``"msg"`` keeps the commit messages as the message text, ``"all"``
    replaces it with ``add_two_list(messages, code)``, and ``"code"``
    replaces it with the code text.  Any other model name prints a message
    and exits before data is loaded.

    :param FLAGS: settings object; reads ``model``, ``path``, ``code_file``,
        ``code_hunk``, ``code_line``, ``code_length``, ``msg_length`` and
        ``folds``.
    :return: ``(X_test_msg, X_test_added_code, X_test_removed_code, y_test)``
        as numpy arrays for the last fold produced by ``KFold``.
    """
    print(FLAGS.model)
    if "msg" in FLAGS.model:
        mode = "msg"
    elif "all" in FLAGS.model:
        mode = "all"
    elif "code" in FLAGS.model:
        mode = "code"
    else:
        print("You need to type correct model")
        exit()

    # The loading pipeline is identical for every mode; only the text used
    # as "messages" differs.
    commits_ = extract_commit(path_file=FLAGS.path)
    filter_commits = filtering_commit(
        commits=commits_,
        num_file=FLAGS.code_file,
        num_hunk=FLAGS.code_hunk,
        num_loc=FLAGS.code_line,
        size_line=FLAGS.code_length)
    msgs_ = extract_msg(commits=filter_commits)
    codes_ = extract_code(commits=filter_commits)
    if mode == "all":
        msgs_ = add_two_list(list1=msgs_, list2=codes_)
    elif mode == "code":
        msgs_ = codes_

    dict_msg_, dict_code_ = dictionary(data=msgs_), dictionary(data=codes_)
    pad_msg = mapping_commit_msg(
        msgs=msgs_, max_length=FLAGS.msg_length, dict_msg=dict_msg_)
    code_kwargs = dict(
        commits=filter_commits,
        max_hunk=FLAGS.code_hunk,
        max_code_line=FLAGS.code_line,
        max_code_length=FLAGS.code_length,
        dict_code=dict_code_)
    pad_added_code = mapping_commit_code(type="added", **code_kwargs)
    pad_removed_code = mapping_commit_code(type="removed", **code_kwargs)
    labels = load_label_commits(commits=filter_commits)

    # No random_state is passed: KFold is used without shuffling, where a
    # seed has no effect on the splits, and recent scikit-learn releases
    # raise ValueError when a seed is combined with shuffle=False.
    kf = KFold(n_splits=FLAGS.folds)

    # Only the test part of a single fold is returned, so the train arrays
    # and the other folds are not materialized.
    # NOTE(review): this keeps the last fold, i.e. it assumes the return
    # statement ran after the fold loop finished -- confirm that the first
    # fold was not the intended one.
    _, test_index = list(kf.split(filter_commits))[-1]

    X_test_msg = np.array(get_items(items=pad_msg, indexes=test_index))
    X_test_added_code = np.array(
        get_items(items=pad_added_code, indexes=test_index))
    X_test_removed_code = np.array(
        get_items(items=pad_removed_code, indexes=test_index))
    y_test = np.array(get_items(items=labels, indexes=test_index))
    return X_test_msg, X_test_added_code, X_test_removed_code, y_test
path_test.append("./data/test_data/nicholask_translated.out") path_test.append("./data/test_data/sasha_translated.out") path_dict = "./data/3_mar7/newres.simplified.dict" dict_index = load_file(path_file=path_dict) new_dict = {} for d in dict_index: split_d = d.strip().split(":") new_dict[int(split_d[0])] = split_d[1] data = list() for p in path_test: p_data = load_file(path_file=p) data += p_data commits_ = extract_commit_new(commits=data) msgs = extract_msg(commits=commits_) codes = extract_code(commits=commits_) all_lines = add_two_list(list1=msgs, list2=codes) labels = extract_label(commits=commits_) # pos_label = len([1 for l in labels if l == 1]) # neg_label = len([0 for l in labels if l == 0]) print len(labels), np.count_nonzero(np.array(labels)) cnt = 1 for i in all_lines: split_i = i.split() for j in split_i: if int(j) == 0: print i break
cnt += 1 return write_data # exit() if __name__ == "__main__": # path_data = "./satisfy_typediff_sorted.out" path_data = "./newres_funcalls_jul28.out.sorted.satisfy" commits_ = extract_commit_july(path_file=path_data) filter_commits = commits_ print len(filter_commits), type(filter_commits) commits_id = [c["id"] for c in commits_] print len(commits_id) # load_model_labels(id=commits_id) msgs = extract_msg(commits=filter_commits) labels = extract_label(commits=filter_commits) codes = extract_code(commits=filter_commits) all_lines = add_two_list(list1=msgs, list2=codes) vectorizer = CountVectorizer() X = vectorizer.fit_transform(all_lines) print X.shape # exit() path_good_commits = "./statistical_test_prob_ver3/good_commits.txt" good_commits = load_file(path_file=path_good_commits) print "Leng of good commits: %s" % (str(len(good_commits))) write_data = [] for g in good_commits: write_data += similarity_good_commit(id=commits_id,
print "Accuracy: ", accuracy_score(y_true=y_test, y_pred=y_pred) print "Precision: ", precision_score(y_true=y_test, y_pred=y_pred) print "Recall: ", recall_score(y_true=y_test, y_pred=y_pred) print "F1: ", f1_score(y_true=y_test, y_pred=y_pred) print "AUC: ", auc_score(y_true=y_test, y_pred=y_pred) if __name__ == "__main__": nfile, nhunk, nline, nleng = 1, 8, 10, 120 path_data = "./data/3_mar7/typediff.out" commits_train = extract_commit(path_file=path_data) filter_commits_train = filtering_commit(commits=commits_train, num_file=nfile, num_hunk=nhunk, num_loc=nline, size_line=nleng) msgs_train = extract_msg(commits=filter_commits_train) labels_train = extract_label(commits=filter_commits_train) codes_train = extract_code(commits=filter_commits_train) all_lines_train = add_two_list(list1=msgs_train, list2=codes_train) # path_test = "./data/test_data/sasha_translated.out" path_test = "./data/test_data/merging_markus_sasha.txt" type = "all" # type = "msg" # type = "code" commits_test = extract_commit(path_file=path_test) filter_commits_test = filtering_commit(commits=commits_test, num_file=nfile, num_hunk=nhunk, num_loc=nline, size_line=nleng) if type == "all": msgs_test = extract_msg(commits=filter_commits_test)
mini_batches.append(mini_batch) return mini_batches if __name__ == "__main__": # path_data = "./data/1_oct5/sample_eq100_line_oct5.out" path_data = "./data/1_oct5/eq100_line_oct5.out" # path_data = "./data/3_mar7/typediff.out" commits_ = extract_commit(path_file=path_data) nfile, nhunk, nline, nleng = 1, 8, 10, 120 filter_commits = filtering_commit(commits=commits_, num_file=nfile, num_hunk=nhunk, num_loc=nline, size_line=nleng) msgs_, codes_ = extract_msg(commits=filter_commits), extract_code( commits=filter_commits) dict_msg_, dict_code_ = dictionary(data=msgs_), dictionary(data=codes_) print "Max length of commit msg: %i" % max( [len(m.split(" ")) for m in msgs_]) print "Size of message and code dictionary: %i, %i" % (len(dict_msg_), len(dict_code_)) pad_msg = mapping_commit_msg(msgs=msgs_, max_length=128, dict_msg=dict_msg_) pad_removed_code = mapping_commit_code(type="removed", commits=filter_commits, max_hunk=nhunk, max_code_line=nline, max_code_length=nleng, dict_code=dict_code_)
def loading_baseline_july(tf, folds, random_state):
    """Score every fold's test split with a saved model and write the scores.

    Commits are read with ``extract_commit_july`` from ``FLAGS.path`` and
    used unfiltered.  The text fed to the model is chosen by substring match
    on ``FLAGS.model`` (``"msg"``, then ``"all"``, then ``"code"``).  For
    each ``KFold`` test split a model is loaded from ``./keras_model/`` and
    its flattened predictions are accumulated; the combined list is written
    to ``./statistical_test_prob/<model>_checking.txt``.

    :param tf: module-like object; only ``tf.flags.FLAGS`` is read (fields
        ``path``, ``model``, ``msg_length``, ``batch_size``).
    :param folds: number of ``KFold`` splits.
    :param random_state: kept for interface compatibility; not forwarded to
        ``KFold`` (see the comment at the ``KFold`` call).
    :return: ``None``; the result is the file written via ``write_file``.
    """
    FLAGS = tf.flags.FLAGS
    commits_ = extract_commit_july(path_file=FLAGS.path)
    filter_commits = commits_
    print(len(commits_))

    # No random_state is passed: KFold is used without shuffling, where a
    # seed has no effect on the splits, and recent scikit-learn releases
    # raise ValueError when a seed is combined with shuffle=False.
    kf = KFold(n_splits=folds)
    idx_folds = [{"train": train_index, "test": test_index}
                 for train_index, test_index in kf.split(filter_commits)]

    if "msg" in FLAGS.model:
        mode = "msg"
    elif "all" in FLAGS.model:
        mode = "all"
    elif "code" in FLAGS.model:
        mode = "code"
    else:
        print("You need to type correct model")
        exit()

    msgs_ = extract_msg(commits=filter_commits)
    codes_ = extract_code(commits=filter_commits)
    if mode == "all":
        msgs_ = add_two_list(list1=msgs_, list2=codes_)
    elif mode == "code":
        msgs_ = codes_

    dict_msg_, dict_code_ = dictionary(data=msgs_), dictionary(data=codes_)
    pad_msg = mapping_commit_msg(
        msgs=msgs_, max_length=FLAGS.msg_length, dict_msg=dict_msg_)
    labels = convert_to_binary(load_label_commits(commits=filter_commits))
    # Single-argument form: prints the three values separated by spaces.
    print("%s %s %s" % (pad_msg.shape, labels.shape, len(dict_msg_)))

    # The exact model name does not change between folds, so check it once.
    known_models = ("lstm_cnn_all", "lstm_cnn_msg", "lstm_cnn_code",
                    "cnn_all", "cnn_msg", "cnn_code")
    if FLAGS.model not in known_models:
        print("You need to give correct model name")
        exit()

    cntfold = 0
    pred_dict = dict()
    pred_dict_list = list()
    for idx in idx_folds:
        test_index = idx["test"]
        X_test_msg = np.array(get_items(items=pad_msg, indexes=test_index))

        # NOTE(review): cntfold is never advanced in this function, so the
        # fold-0 model file is loaded for every fold -- confirm whether a
        # per-fold model was intended.
        path_model = "./keras_model/test_%s_%s.h5" % (FLAGS.model, str(cntfold))
        model = load_model(path_model)

        y_pred = np.ravel(
            model.predict(X_test_msg, batch_size=FLAGS.batch_size))
        pred_dict.update(make_dictionary(y_pred=y_pred, y_index=test_index))
        pred_dict_list += y_pred.tolist()

    # NOTE(review): written once after all folds; confirm the write was not
    # meant to happen per fold.
    path_file = "./statistical_test_prob/" + FLAGS.model + "_checking.txt"
    write_file(path_file=path_file, data=pred_dict_list)
code_file = 1 # "Max file of code in one in commits") path_train = "./data/3_mar7/typediff.out" # data_train = load_file(path_file=path_train) # train_pad_msg, train_pad_added_code, train_pad_removed_code, train_labels, dict_msg_, dict_code_ = \ # load_commit_train_data(commits=data_train, msg_length_=msg_length, code_length_=code_length, # code_line_=code_line, code_hunk_=code_hunk, code_file_=code_file) # print train_pad_msg.shape, train_pad_added_code.shape, train_pad_removed_code.shape, train_labels.shape commits_train = extract_commit(path_file=path_train) filter_commits_train = filtering_commit(commits=commits_train, num_file=code_file, num_hunk=code_hunk, num_loc=code_line, size_line=code_length) msgs_train, codes_train = extract_msg( commits=filter_commits_train), extract_code( commits=filter_commits_train) dict_msg_train, dict_code_train = dictionary(data=msgs_train), dictionary( data=codes_train) path_test = "./data/test_data/markus_translated.out" # path_test = "./data/test_data/sasha_translated.out" commits_test = extract_commit(path_file=path_test) filter_commits_test = filtering_commit(commits=commits_test, num_file=code_file, num_hunk=code_hunk, num_loc=code_line, size_line=code_length) msgs_test, codes_test = extract_msg( commits=filter_commits_test), extract_code(commits=filter_commits_test) all_lines_test = add_two_list(list1=msgs_test, list2=codes_test)
def running_baseline_july(tf, folds, random_state):
    """Train a baseline model on the first fold, save it and report metrics.

    Commits are read with ``extract_commit_july`` from ``FLAGS.path`` and
    used unfiltered.  The text fed to the model is chosen by substring match
    on ``FLAGS.model`` (``"msg"``, then ``"all"``, then ``"code"``).  The
    exact model name then selects ``lstm_cnn`` or ``cnn_model``.  The trained
    model is saved under ``./keras_model/``, the raw per-sample predictions
    are written under ``./statistical_test/``, and the predictions are
    thresholded at 0.5 before accuracy, precision, recall and F1 are printed.
    The loop breaks after its first iteration, so only one fold is processed.

    :param tf: module-like object; only ``tf.flags.FLAGS`` is read (fields
        ``path``, ``model``, ``msg_length``, ``batch_size``; the whole
        object is also passed on to the model builders).
    :param folds: number of ``KFold`` splits.
    :param random_state: kept for interface compatibility; not forwarded to
        ``KFold`` (see the comment at the ``KFold`` call).
    """
    FLAGS = tf.flags.FLAGS
    commits_ = extract_commit_july(path_file=FLAGS.path)
    filter_commits = commits_
    print(len(commits_))

    # No random_state is passed: KFold is used without shuffling, where a
    # seed has no effect on the splits, and recent scikit-learn releases
    # raise ValueError when a seed is combined with shuffle=False.
    kf = KFold(n_splits=folds)
    idx_folds = list()
    for train_index, test_index in kf.split(filter_commits):
        idx_folds.append({"train": train_index, "test": test_index})

    if "msg" in FLAGS.model:
        mode = "msg"
    elif "all" in FLAGS.model:
        mode = "all"
    elif "code" in FLAGS.model:
        mode = "code"
    else:
        print("You need to type correct model")
        exit()

    msgs_ = extract_msg(commits=filter_commits)
    codes_ = extract_code(commits=filter_commits)
    if mode == "all":
        msgs_ = add_two_list(list1=msgs_, list2=codes_)
    elif mode == "code":
        msgs_ = codes_

    dict_msg_, dict_code_ = dictionary(data=msgs_), dictionary(data=codes_)
    pad_msg = mapping_commit_msg(
        msgs=msgs_, max_length=FLAGS.msg_length, dict_msg=dict_msg_)
    labels = load_label_commits(commits=filter_commits)
    labels = convert_to_binary(labels)
    # Single-argument form: prints the three values separated by spaces.
    print("%s %s %s" % (pad_msg.shape, labels.shape, len(dict_msg_)))

    # timestamp, pred_dict_prob and y_pred_tolist are not used in the code
    # visible here; they are kept unchanged.
    timestamp = str(int(time.time()))
    accuracy, precision, recall, f1, auc = list(), list(), list(), list(), list()
    cntfold = 0
    pred_dict, pred_dict_prob = dict(), dict()
    for idx in idx_folds:
        train_index, test_index = idx["train"], idx["test"]
        X_train_msg = np.array(get_items(items=pad_msg, indexes=train_index))
        X_test_msg = np.array(get_items(items=pad_msg, indexes=test_index))
        Y_train = np.array(get_items(items=labels, indexes=train_index))
        Y_test = np.array(get_items(items=labels, indexes=test_index))

        if FLAGS.model in ("lstm_cnn_msg", "lstm_cnn_code", "lstm_cnn_all"):
            model = lstm_cnn(x_train=X_train_msg, y_train=Y_train,
                             x_test=X_test_msg, y_test=Y_test,
                             dictionary_size=len(dict_msg_), FLAGS=FLAGS)
        elif FLAGS.model in ("cnn_msg", "cnn_code", "cnn_all"):
            model = cnn_model(x_train=X_train_msg, y_train=Y_train,
                              x_test=X_test_msg, y_test=Y_test,
                              dictionary_size=len(dict_msg_), FLAGS=FLAGS)
        else:
            print("You need to give correct model name")
            exit()

        model.save("./keras_model/newres_funcalls_" + FLAGS.model + "_"
                   + str(cntfold) + ".h5")

        y_pred = model.predict(X_test_msg, batch_size=FLAGS.batch_size)
        y_pred = np.ravel(y_pred)
        y_pred_tolist = y_pred.tolist()

        # Write the raw scores (index <TAB> score) before thresholding.
        data_fold = [str(sample_index) + "\t" + str(score)
                     for sample_index, score in zip(test_index, y_pred)]
        path_file = "./statistical_test/newres_funcalls_%s_fold_%s.txt" % (
            FLAGS.model, str(cntfold))
        write_file(path_file=path_file, data=data_fold)

        # Binarize in place: scores above 0.5 become 1, the rest 0.
        y_pred[y_pred > 0.5] = 1
        y_pred[y_pred <= 0.5] = 0
        pred_dict.update(make_dictionary(y_pred=y_pred, y_index=test_index))

        # Compute each metric once and reuse it for the record and the print.
        fold_accuracy = accuracy_score(y_true=Y_test, y_pred=y_pred)
        fold_precision = precision_score(y_true=Y_test, y_pred=y_pred)
        fold_recall = recall_score(y_true=Y_test, y_pred=y_pred)
        fold_f1 = f1_score(y_true=Y_test, y_pred=y_pred)
        accuracy.append(fold_accuracy)
        precision.append(fold_precision)
        recall.append(fold_recall)
        f1.append(fold_f1)
        auc.append(auc_score(y_true=Y_test, y_pred=y_pred))
        print("accuracy %s" % (fold_accuracy,))
        print("precision %s" % (fold_precision,))
        print("recall %s" % (fold_recall,))
        print("f1 %s" % (fold_f1,))
        cntfold += 1
        # Stop after the first fold.
        break