def baseline_testing(X_train, y_train, X_test, y_test, algorithm, type):
    """Train a bag-of-words baseline and report its metrics on the test split.

    The documents are vectorized with a ``CountVectorizer`` fitted on the
    training split only.  The chosen classifier is trained on the dense
    training matrix, the test predictions are written to
    ``./data_test_data_pred_results/cnn_<type>.txt`` and accuracy, precision,
    recall, F1 and AUC are printed.

    Args:
        X_train: training documents accepted by ``CountVectorizer``.
        y_train: training labels.
        X_test: test documents.
        y_test: test labels.
        algorithm: ``"svm"``, ``"lr"``, ``"dt"`` or ``"nb"``.  Any other value
            prints an error message and terminates the process via ``exit()``.
        type: suffix used in the name of the prediction file.
    """
    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)
    print("%s %s" % (X_train.shape, X_test.shape))

    if algorithm == "svm":
        clf = LinearSVC(random_state=0)
    elif algorithm == "lr":
        clf = LogisticRegression()
    elif algorithm == "dt":
        clf = DecisionTreeClassifier()
    elif algorithm == "nb":
        clf = GaussianNB()
    else:
        print("Wrong algorithm name -- please retype again")
        exit()

    clf.fit(X=X_train.toarray(), y=y_train)
    # Predict on a dense matrix as well: the model was fitted on dense data
    # and GaussianNB does not accept a sparse matrix at prediction time.
    y_pred = clf.predict(X_test.toarray())

    path_write = "./data_test_data_pred_results/cnn_" + type + ".txt"
    write_file(path_file=path_write, data=y_pred)

    # Each label keeps its trailing blank so the printed text is unchanged.
    reports = (
        ("Accuracy: ", accuracy_score),
        ("Precision: ", precision_score),
        ("Recall: ", recall_score),
        ("F1: ", f1_score),
        ("AUC: ", auc_score),
    )
    for label, metric in reports:
        print("%s %s" % (label, metric(y_true=y_test, y_pred=y_pred)))
def qualitative_looking(path_correctId, path_label):
    """Rank the root commits against each other by stored cosine similarity.

    For every id listed in ``path_correctId`` the similarity scores stored in
    ``./qualitative_analysis/cosine_sim/<id>.txt`` are loaded.  Each root id is
    then assigned the 1-based position of its score in the descending ordering
    of all scores (ties share the first position of that score).  The
    ``id<TAB>position`` lines, ordered by position, are printed and written to
    ``./qualitative_analysis/cosine_sim_order/<id>.txt``.

    Args:
        path_correctId: file with one root commit id per line.
        path_label: file whose lines start with a commit id followed by a tab.

    Raises:
        ValueError: if a root id does not occur in the label file.
    """
    id_root = load_file(path_file=path_correctId)
    print(len(id_root))
    data_label = load_file(path_file=path_label)
    id_label = [d.split("\t")[0] for d in data_label]
    print(len(data_label))
    index_id_root = [id_label.index(i) for i in id_root]

    for id_ in id_root:
        path_id = "./qualitative_analysis/cosine_sim/" + id_ + ".txt"
        cosine_data = load_file(path_file=path_id)
        print(len(cosine_data))
        # A real list is required: the scores are both sorted and indexed.
        cosine_data = [float(c) for c in cosine_data]
        order_cosine = sorted(cosine_data, reverse=True)

        # First (best) 1-based position of every distinct score, built once
        # instead of scanning the ordered list for every root id.
        first_rank = {}
        for position_, score in enumerate(order_cosine, 1):
            first_rank.setdefault(score, position_)

        write_data = dict()
        for jid in index_id_root:
            write_data[id_label[jid]] = first_rank[cosine_data[jid]]

        new_write_data = list()
        for w in sorted(write_data, key=write_data.get):
            print("%s %s" % (w, write_data[w]))
            new_write_data.append(w + "\t" + str(write_data[w]))
        path_write = "./qualitative_analysis/cosine_sim_order/" + id_ + ".txt"
        write_file(path_file=path_write, data=new_write_data)
def get_commit_satisfy_condition(path_data_, nfile, nhunk, nline, nleng):
    """Write the commits of ``path_data_`` that pass the size filter.

    The structured commits are filtered with ``filtering_commit_union`` using
    the given limits.  The raw file is then cut into per-commit line ranges
    with ``commits_index``, and the lines of every commit whose id is in the
    filter result are written, in file order, to ``path_data_ + ".satisfy"``.

    Args:
        path_data_: path of the commit file.
        nfile: forwarded to ``filtering_commit_union`` as ``num_file``.
        nhunk: forwarded as ``num_hunk``.
        nline: forwarded as ``num_loc``.
        nleng: forwarded as ``size_line``.
    """
    structured = extract_commit_july(path_file=path_data_)
    filter_commits = filtering_commit_union(commits=structured,
                                            num_file=nfile,
                                            num_hunk=nhunk,
                                            num_loc=nline,
                                            size_line=nleng)
    print("%s %s" % (len(structured), len(filter_commits)))

    commits = load_file(path_data_)
    indexes = commits_index(commits=commits)
    last = len(indexes) - 1
    kept_lines = []
    for pos, start in enumerate(indexes):
        # The final commit runs to the end of the file.
        if pos == last:
            chunk = commits[start:]
        else:
            chunk = commits[start:indexes[pos + 1]]
        cid = commit_id(commit=chunk)
        if cid in filter_commits:
            kept_lines.extend(chunk)
        print("%s %s" % (pos, cid))

    write_file(path_data_ + ".satisfy", kept_lines)
def predict_model(commits, params):
    """Score commits with the latest checkpoint stored under ``params.model``.

    The message/code dictionaries are read from the model directory and the
    commits are padded with ``padding_pred_commit``.  The most recent
    checkpoint of that directory is restored into a fresh graph.  For every
    mini batch, column 1 of the softmax over ``output/scores`` is collected.
    The collected scores are written to ``prediction.txt`` in the current
    working directory.

    Args:
        commits: commits to score, in the format ``padding_pred_commit`` takes.
        params: settings object; this function reads ``model``,
            ``allow_soft_placement``, ``log_device_placement`` and
            ``batch_size`` from it.

    Raises:
        IOError: if the model directory contains no checkpoint.
    """
    path_dict = os.path.abspath(os.path.join(os.path.curdir, params.model))
    dict_msg = load_dict_file(path_file=path_dict + '/dict_msg.txt')
    dict_code = load_dict_file(path_file=path_dict + '/dict_code.txt')

    pad_msg, pad_added_code, pad_removed_code, labels = padding_pred_commit(
        commits=commits, params=params, dict_msg=dict_msg, dict_code=dict_code)

    checkpoint_dir = path_dict
    checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
    if checkpoint_file is None:
        # latest_checkpoint signals "nothing found" with None; fail here
        # instead of later on a meaningless "None.meta" path.
        raise IOError("No checkpoint found in %s" % checkpoint_dir)

    commits_scores = list()
    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=params.allow_soft_placement,
            log_device_placement=params.log_device_placement)
        # Using the session as a context manager makes it the default session
        # and releases its resources when the block is left.
        with tf.Session(config=session_conf) as sess:
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph(
                "{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_msg = graph.get_operation_by_name("input_msg").outputs[0]
            input_addedcode = graph.get_operation_by_name(
                "input_addedcode").outputs[0]
            input_removedcode = graph.get_operation_by_name(
                "input_removedcode").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name(
                "dropout_keep_prob").outputs[0]

            # Tensor we want to evaluate
            scores = graph.get_operation_by_name("output/scores").outputs[0]

            # Generate batches for one epoch
            batches = mini_batches(X_msg=pad_msg,
                                   X_added_code=pad_added_code,
                                   X_removed_code=pad_removed_code,
                                   Y=labels,
                                   mini_batch_size=params.batch_size)

            for batch in batches:
                (batch_input_msg, batch_input_added_code,
                 batch_input_removed_code, batch_input_labels) = batch
                batch_scores = sess.run(
                    scores, {
                        input_msg: batch_input_msg,
                        input_addedcode: batch_input_added_code,
                        input_removedcode: batch_input_removed_code,
                        # dropout is disabled for inference
                        dropout_keep_prob: 1.0
                    })
                batch_scores = np.ravel(softmax(batch_scores)[:, [1]])
                commits_scores = np.concatenate(
                    [commits_scores, batch_scores])

    write_file(
        path_file=os.path.abspath(os.path.join(os.path.curdir)) +
        '/prediction.txt',
        data=commits_scores)
def print_false_negative(id, y_pred, threshold, y_true):
    """Write the ids predicted negative although their true label is 1.

    A score is a positive prediction only when ``float(score) > threshold``.
    The number of false negatives is printed and their ids are written to
    ``./sasha_results/false_neg_<threshold>.txt``.

    Args:
        id: ids aligned with ``y_pred`` and ``y_true``.
        y_pred: prediction scores convertible with ``float``.
        threshold: decision threshold; also part of the output file name.
        y_true: true labels.
    """
    decisions = [int(float(score) > threshold) for score in y_pred]
    missed = [
        item for item, guess, truth in zip(id, decisions, y_true)
        if guess == 0 and truth == 1
    ]
    print(len(missed))
    target = "./sasha_results/false_neg_" + str(threshold) + ".txt"
    write_file(path_file=target, data=missed)
def print_true_positive(id, y_pred, threshold, y_true):
    """Write the ids predicted positive whose true label is 1.

    A score is a positive prediction only when ``float(score) > threshold``.
    The number of true positives is printed and their ids are written to
    ``./sasha_results/true_pos_<threshold>.txt``.

    Args:
        id: ids aligned with ``y_pred`` and ``y_true``.
        y_pred: prediction scores convertible with ``float``.
        threshold: decision threshold; also part of the output file name.
        y_true: true labels.
    """
    decisions = [int(float(score) > threshold) for score in y_pred]
    hits = [
        item for item, guess, truth in zip(id, decisions, y_true)
        if guess == truth and truth == 1
    ]
    print(len(hits))
    target = "./sasha_results/true_pos_" + str(threshold) + ".txt"
    write_file(path_file=target, data=hits)
def getting_overlap(commits, ids_ftr, data_ftr, path_name):
    """Append the commit label to the feature line of every known commit.

    For each commit whose ``"id"`` occurs in ``ids_ftr``, the feature line at
    the first matching position of ``data_ftr`` is extended with ``","`` and
    the commit's ``"stable"`` value.  The resulting lines, in commit order,
    are written to ``path_name`` and returned.

    Args:
        commits: iterable of mappings with ``"id"`` and ``"stable"`` entries.
        ids_ftr: ids aligned with ``data_ftr`` (assumed hashable, e.g. str).
        data_ftr: feature lines (strings).
        path_name: output path handed to ``write_file``.

    Returns:
        The list of labelled feature lines.
    """
    # Position of the first occurrence of every id, built once so each commit
    # costs a dictionary lookup instead of two scans of ``ids_ftr``.
    first_position = {}
    for position, ftr_id in enumerate(ids_ftr):
        first_position.setdefault(ftr_id, position)

    new_ftr = list()
    for c in commits:
        index_ftr = first_position.get(c["id"])
        if index_ftr is None:
            continue
        new_ftr.append(data_ftr[index_ftr] + "," + c["stable"])
    write_file(path_name, new_ftr)
    return new_ftr
def balance_data_ICSE(path):
    """Cap the number of "true" lines, shuffle, write the result and exit.

    Lines containing ``"true"`` are kept while the running count is at most
    11165, i.e. the first 11166 of them.  Any other line is kept only if it
    contains ``"false"``; this includes surplus "true" lines that also contain
    "false".  Kept lines are stripped, shuffled in place and written to
    ``./data/3_mar7/new_features_ver1.txt``.  The process is then terminated
    with ``exit()``.

    Args:
        path: path of the feature file to balance.
    """
    kept_true = 0
    balanced = []
    for line in load_file(path_file=path):
        take_true = "true" in line and kept_true <= 11165
        if take_true:
            kept_true += 1
        if take_true or "false" in line:
            balanced.append(line.strip())
    shuffle(balanced)
    write_file(path_file="./data/3_mar7/new_features_ver1.txt", data=balanced)
    exit()
def cosine_similarity(path_root, id_commit, data):
    """Store the cosine similarity of every commit to each root commit.

    ``data`` is turned into a count matrix.  For each id read from
    ``path_root``, the similarity (``1 - cosine distance``) between that
    commit's row and every row of the matrix is computed in ``id_commit``
    order and written to ``./qualitative_analysis/cosine_sim/<id>.txt``.

    Args:
        path_root: file with one root commit id per line.
        id_commit: commit ids aligned with the rows of ``data``.
        data: documents accepted by ``CountVectorizer``.

    Returns:
        None.

    Raises:
        ValueError: if a root id is not contained in ``id_commit``.
    """
    counts = CountVectorizer().fit_transform(data)
    for id_root in load_file(path_file=path_root):
        anchor_row = id_commit.index(id_root)
        anchor = counts[anchor_row, :].toarray().flatten()
        similarities = [
            1 - spatial.distance.cosine(anchor,
                                        counts[row, :].toarray().flatten())
            for row in range(len(id_commit))
        ]
        write_file(path_file="./qualitative_analysis/cosine_sim/" + id_root +
                   ".txt",
                   data=similarities)
    return None
def restruct_root(roots, path, type):
    """Merge the i-th result file of every root into one file per model.

    All file lists are truncated to the length of the shortest one.  For each
    position the files found at that position in every root are loaded from
    ``path`` and concatenated.  The merged lines are written to
    ``./patchNet_mergeResults/<type>_<model>.txt``.  ``<model>`` is
    ``"model-"`` plus the part after the last ``-`` of the last root's file
    name, with ``".txt"`` removed.

    Args:
        roots: list of file-name lists, one list per root.
        path: directory that contains the files.
        type: prefix of the output file name.

    Returns:
        None.
    """
    shortest = min(len(names) for names in roots)
    trimmed = [names[:shortest] for names in roots]
    print("%s %s" % (len(trimmed), len(trimmed[0])))

    for col in range(len(trimmed[0])):
        merged = []
        for names in trimmed:
            source = path + "/" + names[col]
            print(source)
            merged.extend(load_file(source))
        # The model name is derived from the last root's file name.
        tail = trimmed[-1][col].split("-")[-1]
        model_name = "model-" + tail.replace(".txt", "")
        print("%s %s" % (type, model_name))
        path_write = "./patchNet_mergeResults/%s_%s.txt" % (type, model_name)
        write_file(path_file=path_write, data=merged)
    return None
def get_predict(name, X, y, algorithm, folds):
    """Cross-validate a bag-of-words baseline and store its predictions.

    For every fold a ``CountVectorizer`` is fitted on the training documents
    and the chosen classifier is trained and evaluated.  The predictions of
    all folds are merged by sample index and written, via ``sorted_dict``, to
    ``./statistical_test_ver2/3_mar7/<name>_<algorithm>.txt``.  Mean and
    standard deviation of accuracy, precision, recall, F1 and AUC over the
    folds are printed.

    Args:
        name: prefix of the prediction file name.
        X: documents.
        y: labels aligned with ``X``.
        algorithm: ``"svm"``, ``"lr"`` or ``"dt"``.  Any other value prints an
            error message and terminates the process via ``exit()``.
        folds: number of cross-validation folds.
    """
    # The folds are contiguous and unshuffled, so a random_state would have no
    # effect; newer scikit-learn releases reject one given without shuffle.
    kf = KFold(n_splits=folds)

    # (printed label, metric function, per-fold values), in reporting order.
    metrics = [
        ("Accuracy and std", accuracy_score, []),
        ("Precision", precision_score, []),
        ("Recall", recall_score, []),
        ("F1", f1_score, []),
        ("AUC", auc_score, []),
    ]
    pred_dict = dict()
    for train_index, test_index in kf.split(X):
        X_train = get_items(items=X, indexes=train_index)
        y_train = get_items(items=y, indexes=train_index)
        X_test = get_items(items=X, indexes=test_index)
        y_test = get_items(items=y, indexes=test_index)

        # The vocabulary is learnt from the training fold only.
        vectorizer = CountVectorizer()
        X_train = vectorizer.fit_transform(X_train)
        X_test = vectorizer.transform(X_test)

        if algorithm == "svm":
            clf = LinearSVC(random_state=0)
        elif algorithm == "lr":
            clf = LogisticRegression()
        elif algorithm == "dt":
            clf = DecisionTreeClassifier()
        else:
            print("Wrong algorithm name -- please retype again")
            exit()

        clf.fit(X=X_train, y=y_train)
        y_pred = clf.predict(X_test)
        pred_dict.update(make_dictionary(y_pred=y_pred, y_index=test_index))
        for _, metric, values in metrics:
            values.append(metric(y_true=y_test, y_pred=y_pred))

    path_file = ("./statistical_test_ver2/3_mar7/" + name + "_" + algorithm +
                 ".txt")
    write_file(path_file=path_file, data=sorted_dict(dict=pred_dict))

    for label, _, values in metrics:
        print("%s of %s: %f %f" % (label, algorithm,
                                   np.mean(np.array(values)),
                                   np.std(np.array(values))))
def get_predict_ICSE_writePred(X_train, y_train, X_test, y_test, algorithm, path_write):
    """Fit a classifier, store its test predictions and print its metrics.

    Args:
        X_train: training feature matrix.
        y_train: training labels.
        X_test: test feature matrix.
        y_test: test labels.
        algorithm: ``"svm"``, ``"lr"`` or ``"dt"``.  Any other value prints an
            error message and terminates the process via ``exit()``.
        path_write: file that receives the predicted labels.
    """
    factories = {
        "svm": LinearSVC,
        "lr": LogisticRegression,
        "dt": DecisionTreeClassifier,
    }
    factory = factories.get(algorithm)
    if factory is None:
        print("Wrong algorithm name -- please retype again")
        exit()
    clf = factory()
    clf.fit(X=X_train, y=y_train)
    y_pred = clf.predict(X_test)
    write_file(path_file=path_write, data=y_pred)

    reports = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1", f1_score),
        ("AUC", auc_score),
    )
    for label, metric in reports:
        print("%s of %s: %f" % (label, algorithm,
                                metric(y_true=y_test, y_pred=y_pred)))
def get_commit_id_and_date(path_data_):
    """Rewrite a commit file with its commits ordered by date.

    The file is cut into per-commit line ranges with ``commits_index`` and
    each commit's date is read with ``commit_date_july`` and converted to
    ``int``.  The commits are then emitted in ascending date order to
    ``path_data_ + ".sorted"``.  Every ``position date`` pair is printed as
    its commit is appended.

    Args:
        path_data_: path of the commit file.
    """
    commits = load_file(path_data_)
    indexes = commits_index(commits=commits)
    total = len(indexes)

    def commit_lines(pos):
        # The final commit runs to the end of the file.
        if pos == total - 1:
            return commits[indexes[pos]:]
        return commits[indexes[pos]:indexes[pos + 1]]

    dates = {}
    for pos in range(total):
        dates[pos] = int(commit_date_july(commit=commit_lines(pos)))

    ordered_lines = []
    for pos, date in sorted(dates.items(), key=lambda pair: pair[1]):
        print("%s %s" % (pos, date))
        ordered_lines.extend(commit_lines(pos))

    write_file(path_data_ + ".sorted", ordered_lines)
def creating_sasha_data(path_data_, folds, random_state):
    """Write per-fold train/test id-label files under ``./sasha_data``.

    Every commit becomes an ``id<TAB>label`` line.  The label is ``"stable"``
    when the commit's ``"stable"`` entry equals ``"true"`` and ``"nonstable"``
    otherwise.  For fold ``k`` (counted from 1) the train and test lines are
    written to ``./sasha_data/fold<k>/train.txt`` and
    ``./sasha_data/fold<k>/test.txt``.

    Args:
        path_data_: path of the commit file.
        folds: number of folds.
        random_state: forwarded to ``KFold``.
    """
    commits_structure = extract_commit_july(path_file=path_data_)
    ids = [commit["id"] for commit in commits_structure]
    tags = []
    for commit in commits_structure:
        tags.append("stable" if commit["stable"] == "true" else "nonstable")
    lines = [one_id + "\t" + one_tag for one_id, one_tag in zip(ids, tags)]

    splitter = KFold(n_splits=folds, random_state=random_state)
    numbered = enumerate(splitter.split(commits_structure), 1)
    for fold_no, (train_index, test_index) in numbered:
        train_id = get_elements(commits=ids, indexes=train_index)
        train_label = get_elements(commits=tags, indexes=train_index)
        test_id = get_elements(commits=ids, indexes=test_index)
        test_label = get_elements(commits=tags, indexes=test_index)
        train_file = get_elements(commits=lines, indexes=train_index)
        test_file = get_elements(commits=lines, indexes=test_index)

        print("%s %s" % (len(train_id), len(train_label)))
        print("%s %s" % (len(test_id), len(test_label)))
        print("%s %s" % (len(train_file), len(test_file)))

        fold_dir = "./sasha_data/fold" + str(fold_no) + "/"
        write_file(path_file=fold_dir + "train.txt", data=train_file)
        write_file(path_file=fold_dir + "test.txt", data=test_file)
def baseline_ver3(id, train, label, algorithm):
    """Fit a bag-of-words baseline and evaluate it on its own training data.

    The same documents are used for fitting and for evaluation.  Accuracy,
    precision, recall, F1 and AUC of the predicted labels are printed and a
    per-sample positive-class score is written to
    ``./statistical_test_ver2/<algorithm>.txt``.

    Args:
        id: commit ids aligned with ``train``; not used by this function.
        train: documents accepted by ``CountVectorizer``.
        label: labels aligned with ``train``.
        algorithm: ``"svm"``, ``"lr"`` or ``"dt"``.  Any other value prints an
            error message and terminates the process via ``exit()``.
    """
    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(train)
    # Evaluation deliberately re-uses the training documents.
    X_test = vectorizer.transform(train)
    y_train, y_test = label, label

    if algorithm == "svm":
        clf = LinearSVC(random_state=0)
    elif algorithm == "lr":
        clf = LogisticRegression()
    elif algorithm == "dt":
        clf = DecisionTreeClassifier()
    else:
        print("Wrong algorithm name -- please retype again")
        exit()

    clf.fit(X=X_train, y=y_train)
    # One prediction pass is shared by all metrics.
    y_pred = clf.predict(X_test)

    reports = (
        ("Accuracy:", accuracy_score),
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("F1:", f1_score),
        ("AUC:", auc_score),
    )
    results = [(text, metric(y_true=y_test, y_pred=y_pred))
               for text, metric in reports]
    for text, value in results:
        print("%s %s" % (text, value))

    if hasattr(clf, "predict_proba"):
        probs = clf.predict_proba(X_test)[:, 1]
    else:
        # LinearSVC offers no predict_proba; its signed distance to the
        # decision boundary is stored instead (a score, not a probability).
        probs = clf.decision_function(X_test)
    path_write = "./statistical_test_ver2/%s.txt" % (algorithm)
    write_file(path_file=path_write, data=probs)
model = lstm_cnn(x_train=pad_msg_train, y_train=Y_train, x_test=pad_msg_test, y_test=Y_test, dictionary_size=len(dict_train), FLAGS=FLAGS) elif name == "cnn_msg" or name == "cnn_code" or name == "cnn_all": model = cnn_model(x_train=pad_msg_train, y_train=Y_train, x_test=pad_msg_test, y_test=Y_test, dictionary_size=len(dict_train), FLAGS=FLAGS) model.save("./lstm_model_ver3/" + name + ".h5") y_pred = model.predict(pad_msg_test, batch_size=FLAGS.batch_size) y_pred = np.ravel(y_pred) y_pred[y_pred > 0.5] = 1 y_pred[y_pred <= 0.5] = 0 path_file = "./data/test_data_pred_results/" + name + ".txt" write_file(path_file, y_pred) print "Accuracy of %s: %f" % (name, accuracy_score(y_true=Y_test, y_pred=y_pred)) print "Precision of %s: %f" % ( name, precision_score(y_true=Y_test, y_pred=y_pred)) print "Recall of %s: %f" % (name, recall_score(y_true=Y_test, y_pred=y_pred)) print "F1 of %s: %f" % (name, f1_score(y_true=Y_test, y_pred=y_pred)) print "AUC of %s: %f" % (name, f1_score(y_true=Y_test, y_pred=y_pred))
def eval_patchNet_train_test(tf, checkpoint_dir, test):
    """Evaluate every checkpoint of ``checkpoint_dir`` on the test data.

    For each checkpoint returned by ``get_all_checkpoints`` the saved graph is
    restored and the test data is fed through it in mini batches.  Column 1
    of the softmax over ``output/scores`` is written to
    ``./patchNet_results/<dir>_<checkpoint>.txt``, where both name parts are
    taken from the checkpoint path.  Accuracy, precision, recall, F1 and AUC
    of ``output/predictions`` are printed.

    Args:
        tf: the TensorFlow module; ``tf.flags.FLAGS.batch_size`` is read.
        checkpoint_dir: directory searched by ``get_all_checkpoints``.
        test: sequence holding message, added code, removed code and labels
            at positions 0 to 3.
    """
    FLAGS = tf.flags.FLAGS
    allow_soft_placement = True  # "Allow device soft device placement"
    log_device_placement = False  # "Log placement of ops on devices"

    dirs = get_all_checkpoints(checkpoint_dir=checkpoint_dir)
    X_test_msg, X_test_added_code, X_test_removed_code, y_test = (
        test[0], test[1], test[2], test[3])

    for checkpoint_file in dirs:
        # A fresh graph per checkpoint: importing several meta graphs into one
        # graph would make the by-name lookups below ambiguous.
        graph = tf.Graph()
        with graph.as_default():
            session_conf = tf.ConfigProto(
                allow_soft_placement=allow_soft_placement,
                log_device_placement=log_device_placement)
            # The context manager closes the session after each checkpoint.
            with tf.Session(config=session_conf) as sess:
                # Load the saved meta graph and restore variables
                saver = tf.train.import_meta_graph(
                    "{}.meta".format(checkpoint_file))
                saver.restore(sess, checkpoint_file)

                # Get the placeholders from the graph by name
                input_msg = graph.get_operation_by_name(
                    "input_msg").outputs[0]
                input_addedcode = graph.get_operation_by_name(
                    "input_addedcode").outputs[0]
                input_removedcode = graph.get_operation_by_name(
                    "input_removedcode").outputs[0]
                dropout_keep_prob = graph.get_operation_by_name(
                    "dropout_keep_prob").outputs[0]

                # Tensors we want to evaluate
                predictions = graph.get_operation_by_name(
                    "output/predictions").outputs[0]
                scores = graph.get_operation_by_name(
                    "output/scores").outputs[0]

                # Generate batches for one epoch
                batches = mini_batches(X_msg=X_test_msg,
                                       X_added_code=X_test_added_code,
                                       X_removed_code=X_test_removed_code,
                                       Y=y_test,
                                       mini_batch_size=FLAGS.batch_size)

                # Collect the predictions here
                all_predictions, all_scores = [], []
                for batch in batches:
                    (batch_input_msg, batch_input_added_code,
                     batch_input_removed_code, batch_input_labels) = batch
                    feed = {
                        input_msg: batch_input_msg,
                        input_addedcode: batch_input_added_code,
                        input_removedcode: batch_input_removed_code,
                        # dropout is disabled for evaluation
                        dropout_keep_prob: 1.0
                    }
                    # Both tensors are fetched in one forward pass.
                    batch_predictions, batch_scores = sess.run(
                        [predictions, scores], feed)
                    all_predictions = np.concatenate(
                        [all_predictions, batch_predictions])
                    batch_scores = np.ravel(softmax(batch_scores)[:, [1]])
                    all_scores = np.concatenate([all_scores, batch_scores])

        split_checkpoint_file = checkpoint_file.split("/")
        path_write = "./patchNet_results/%s_%s.txt" % (
            split_checkpoint_file[-3], split_checkpoint_file[-1])
        write_file(path_file=path_write, data=all_scores)

        y_true = convert_to_binary(y_test)
        reports = (
            ("Accuracy:", accuracy_score),
            ("Precision:", precision_score),
            ("Recall:", recall_score),
            ("F1:", f1_score),
            ("AUC:", auc_score),
        )
        for text, metric in reports:
            print("%s %s %s" % (checkpoint_file, text,
                                metric(y_true=y_true,
                                       y_pred=all_predictions)))
        print("\n")
print X.shape # exit() path_good_commits = "./statistical_test_prob_ver3/good_commits.txt" good_commits = load_file(path_file=path_good_commits) print "Leng of good commits: %s" % (str(len(good_commits))) write_data = [] for g in good_commits: write_data += similarity_good_commit(id=commits_id, root=g, all=X, top_k=50) # break path_write = "./statistical_test_prob_ver2/good_commits_results.txt" write_file(path_file=path_write, data=write_data) # exit() #################################################################################### #################################################################################### path_bad_commits = "./statistical_test_prob_ver2/bad_commits.txt" bad_commits = load_file(path_file=path_bad_commits) print "Leng of bad commits: %s" % (str(len(bad_commits))) write_data = [] for g in bad_commits: write_data += similarity_bad_commit(id=commits_id, root=g, all=X, top_k=75) # break path_write = "./statistical_test_prob_ver2/bad_commits_results.txt"
new_dict = {} for i in index_: new_dict[i] = dictionary[i] new_list = list() for key, value in sorted(new_dict.iteritems()): new_list.append(str(key) + ": " + value) return new_list if __name__ == "__main__": path_data = "./data/3_mar7/typediff.out" commits_ = extract_commit(path_file=path_data) msgs = extract_msg(commits=commits_) codes = extract_code(commits=commits_) all_lines = add_two_list(list1=msgs, list2=codes) print len(all_lines), len(commits_), len(msgs), len(codes) index = create_dict(all_lines) print len(index) path_dict = "./data/3_mar7/newres.dict" dict_index = load_file(path_file=path_dict) new_dict = {} for d in dict_index: split_d = d.strip().split(":") new_dict[int(split_d[0])] = split_d[1] print len(new_dict) new_dict = mapping_dict(index_=index, dictionary=new_dict) path_write = "./data/3_mar7/newres.simplified.dict" write_file(path_file=path_write, data=new_dict)
def loading_baseline_july(tf, folds, random_state):
    """Re-run stored Keras baselines on the test folds and dump the scores.

    The commits of ``FLAGS.path`` are split with ``KFold``.  The model input
    is chosen from ``FLAGS.model``: messages for names containing ``"msg"``,
    messages combined with code for ``"all"``, code for ``"code"``.  For each
    fold the stored model is loaded and run on the fold's test rows.  The
    flattened predictions of all folds are written, in fold order, to
    ``./statistical_test_prob/<model>_checking.txt``.

    Args:
        tf: the TensorFlow module; ``path``, ``model``, ``msg_length`` and
            ``batch_size`` are read from ``tf.flags.FLAGS``.
        folds: number of folds.
        random_state: forwarded to ``KFold``.
    """
    FLAGS = tf.flags.FLAGS
    commits_ = extract_commit_july(path_file=FLAGS.path)
    filter_commits = commits_
    print(len(commits_))

    splitter = KFold(n_splits=folds, random_state=random_state)
    idx_folds = [{"train": train_part, "test": test_part}
                 for train_part, test_part in splitter.split(filter_commits)]

    model_name = FLAGS.model
    recognised = ("msg" in model_name or "all" in model_name
                  or "code" in model_name)
    if not recognised:
        print("You need to type correct model")
        exit()
    msgs_ = extract_msg(commits=filter_commits)
    codes_ = extract_code(commits=filter_commits)
    if "msg" not in model_name:
        if "all" in model_name:
            msgs_ = add_two_list(list1=msgs_, list2=codes_)
        else:
            msgs_ = codes_

    dict_msg_ = dictionary(data=msgs_)
    dict_code_ = dictionary(data=codes_)
    pad_msg = mapping_commit_msg(msgs=msgs_, max_length=FLAGS.msg_length,
                                 dict_msg=dict_msg_)
    labels = convert_to_binary(load_label_commits(commits=filter_commits))
    print("%s %s %s" % (pad_msg.shape, labels.shape, len(dict_msg_)))

    known_models = ("lstm_cnn_all", "lstm_cnn_msg", "lstm_cnn_code",
                    "cnn_all", "cnn_msg", "cnn_code")
    # NOTE(review): cntfold is never advanced, so every fold loads the file
    # numbered 0 -- confirm that this is intended.
    cntfold = 0
    pred_dict = dict()
    pred_dict_list = list()
    for fold in idx_folds:
        train_index, test_index = fold["train"], fold["test"]
        X_train_msg = np.array(get_items(items=pad_msg, indexes=train_index))
        X_test_msg = np.array(get_items(items=pad_msg, indexes=test_index))
        Y_train = np.array(get_items(items=labels, indexes=train_index))
        Y_test = np.array(get_items(items=labels, indexes=test_index))

        if model_name not in known_models:
            print("You need to give correct model name")
            exit()
        path_model = "./keras_model/test_%s_%s.h5" % (model_name,
                                                      str(cntfold))
        model = load_model(path_model)

        y_pred = np.ravel(model.predict(X_test_msg,
                                        batch_size=FLAGS.batch_size))
        pred_dict.update(make_dictionary(y_pred=y_pred, y_index=test_index))
        pred_dict_list.extend(y_pred.tolist())

    path_file = "./statistical_test_prob/" + model_name + "_checking.txt"
    write_file(path_file=path_file, data=pred_dict_list)
def train_model_mini_batches_update(train, test, dictionary, params):
    """Train ``DefectNet`` and log its per-epoch test metrics.

    Training batches are rebuilt every epoch with ``mini_batches_update``.
    The test batches are built once with ``mini_batches``.  After each epoch
    the model is evaluated on the training and test batches.  A snapshot is
    saved every fifth epoch and the collected test metrics are written to
    ``<save_dir>/log.txt``.

    Args:
        train: ``(ids, labels, messages, code)`` of the training split.
        test: ``(ids, labels, messages, code)`` of the test split.
        dictionary: ``(dict_msg, dict_code)``.
        params: settings object.  It is modified in place: ``no_cuda`` is
            removed, ``filter_sizes`` is parsed into a list of ints,
            ``save_dir`` gets a timestamp sub-directory, and ``cuda``,
            ``vocab_msg``, ``vocab_code``, ``class_num`` and ``device`` are
            set.
    """
    _, labels_train, msg_train, code_train = train
    _, labels_test, msg_test, code_test = test
    dict_msg, dict_code = dictionary
    print('Dictionary message: %i -- Dictionary code: %i' %
          (len(dict_msg), len(dict_code)))

    print('Training data')
    info_label(labels_train)
    pad_msg_train = padding_data(data=msg_train, dictionary=dict_msg,
                                 params=params, type='msg')
    pad_code_train = padding_data(data=code_train, dictionary=dict_code,
                                  params=params, type='code')

    print('Testing data')
    info_label(labels_test)
    pad_msg_test = padding_data(data=msg_test, dictionary=dict_msg,
                                params=params, type='msg')
    pad_code_test = padding_data(data=code_test, dictionary=dict_code,
                                 params=params, type='code')

    # set up parameters
    params.cuda = (not params.no_cuda) and torch.cuda.is_available()
    del params.no_cuda
    params.filter_sizes = [int(k) for k in params.filter_sizes.split(',')]
    stamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    params.save_dir = os.path.join(params.save_dir, stamp)
    params.vocab_msg, params.vocab_code = len(dict_msg), len(dict_code)
    one_dimensional = len(labels_train.shape) == 1
    params.class_num = 1 if one_dimensional else labels_train.shape[1]
    params.device = torch.device(
        'cuda' if torch.cuda.is_available() else 'cpu')

    # create and train the defect model
    model = DefectNet(args=params)
    if torch.cuda.is_available():
        model = model.cuda()
    # NOTE(review): the learning rate is read from params.l2_reg_lambda.
    optimizer = torch.optim.Adam(model.parameters(), lr=params.l2_reg_lambda)
    criterion = nn.BCELoss()
    summary = ('Accuracy: %f -- Precision: %f -- Recall: %f -- F1: %f -- '
               'AUC: %f')

    steps = 0
    batches_test = mini_batches(X_msg=pad_msg_test, X_code=pad_code_test,
                                Y=labels_test)
    write_log = []
    for epoch in range(1, params.num_epochs + 1):
        # building batches for training model
        batches_train = mini_batches_update(X_msg=pad_msg_train,
                                            X_code=pad_code_train,
                                            Y=labels_train)
        for batch in batches_train:
            pad_msg, pad_code, labels = batch
            if torch.cuda.is_available():
                pad_msg = torch.tensor(pad_msg).cuda()
                pad_code = torch.tensor(pad_code).cuda()
                labels = torch.cuda.FloatTensor(labels)
            else:
                pad_msg = torch.tensor(pad_msg).long()
                pad_code = torch.tensor(pad_code).long()
                labels = torch.tensor(labels).float()

            optimizer.zero_grad()
            _, predict = model.forward(pad_msg, pad_code)
            loss = criterion(predict, labels)
            loss.backward()
            optimizer.step()

            steps += 1
            if steps % params.log_interval == 0:
                print('\rEpoch: {} step: {} - loss: {:.6f}'.format(
                    epoch, steps, loss.item()))

        print('Epoch: %i ---Training data' % (epoch))
        acc, prc, rc, f1, auc_ = eval(data=batches_train, model=model)
        print(summary % (acc, prc, rc, f1, auc_))
        print('Epoch: %i ---Testing data' % (epoch))
        acc, prc, rc, f1, auc_ = eval(data=batches_test, model=model)
        print(summary % (acc, prc, rc, f1, auc_))
        write_log.append(
            'Epoch - testing: %i --- Accuracy: %f -- Precision: %f -- Recall: %f -- F1: %f -- AUC: %f'
            % (epoch, acc, prc, rc, f1, auc_))
        if epoch % 5 == 0:
            save(model, params.save_dir, 'epoch', epoch)
    write_file(params.save_dir + '/log.txt', write_log)
def train_confidnetnet_model(train, test, dictionary, params, options):
    """Train the classifier part or the confidence part of ``DefectNet``.

    With ``options == 'clf'`` a new model is trained with binary cross
    entropy after ``freeze_layers(..., freeze_uncertainty_layers=True)``, and
    classification metrics are logged.  With ``options == 'confidnet'`` a
    snapshot chosen by ``params.project`` is loaded first, ``freeze_layers``
    is called with ``freeze_uncertainty_layers=False``, the model is trained
    with ``confid_mse_loss``, and only the AUC of the uncertainty evaluation
    is logged.  In both modes a snapshot is saved every fifth epoch and the
    log is written to ``<save_dir>/log.txt``.  Any other ``options`` value
    only prepares the data and the parameters.

    Args:
        train: ``(ids, labels, messages, code)`` of the training split.
        test: ``(ids, labels, messages, code)`` of the test split.
        dictionary: ``(dict_msg, dict_code)``.
        params: settings object.  It is modified in place: ``no_cuda`` is
            removed, ``filter_sizes`` is parsed into a list of ints,
            ``save_dir`` gets a timestamp sub-directory, and ``cuda``,
            ``vocab_msg``, ``vocab_code``, ``class_num`` and ``device`` are
            set.
        options: ``'clf'`` or ``'confidnet'``.
    """
    _, labels_train, msg_train, code_train = train
    _, labels_test, msg_test, code_test = test
    dict_msg, dict_code = dictionary
    print('Dictionary message: %i -- Dictionary code: %i' %
          (len(dict_msg), len(dict_code)))

    print('Training data')
    info_label(labels_train)
    pad_msg_train = padding_data(data=msg_train, dictionary=dict_msg,
                                 params=params, type='msg')
    pad_code_train = padding_data(data=code_train, dictionary=dict_code,
                                  params=params, type='code')
    print(pad_msg_train.shape, pad_code_train.shape)

    print('Testing data')
    info_label(labels_test)
    pad_msg_test = padding_data(data=msg_test, dictionary=dict_msg,
                                params=params, type='msg')
    pad_code_test = padding_data(data=code_test, dictionary=dict_code,
                                 params=params, type='code')
    print(pad_msg_test.shape, pad_code_test.shape)

    # set up parameters
    params.cuda = (not params.no_cuda) and torch.cuda.is_available()
    del params.no_cuda
    params.filter_sizes = [int(k) for k in params.filter_sizes.split(',')]
    stamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    params.save_dir = os.path.join(params.save_dir, stamp)
    params.vocab_msg, params.vocab_code = len(dict_msg), len(dict_code)
    one_dimensional = len(labels_train.shape) == 1
    params.class_num = 1 if one_dimensional else labels_train.shape[1]
    params.device = torch.device(
        'cuda' if torch.cuda.is_available() else 'cpu')

    def as_tensors(batch):
        # Move one (msg, code, labels) batch to the device used for training.
        msg, code, target = batch
        if torch.cuda.is_available():
            return (torch.tensor(msg).cuda(), torch.tensor(code).cuda(),
                    torch.cuda.FloatTensor(target))
        return (torch.tensor(msg).long(), torch.tensor(code).long(),
                torch.tensor(target).float())

    summary = ('Accuracy: %f -- Precision: %f -- Recall: %f -- F1: %f -- '
               'AUC: %f')

    if options == 'clf':
        # create and train the defect model
        model = DefectNet(args=params)
        if torch.cuda.is_available():
            model = model.cuda()
        model = freeze_layers(model=model, freeze_uncertainty_layers=True)

        # NOTE(review): the learning rate is read from params.l2_reg_lambda.
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=params.l2_reg_lambda)
        criterion = nn.BCELoss()
        steps = 0
        batches_test = mini_batches(X_msg=pad_msg_test, X_code=pad_code_test,
                                    Y=labels_test)
        write_log = []
        for epoch in range(1, params.num_epochs + 1):
            # building batches for training model
            batches_train = mini_batches_update(X_msg=pad_msg_train,
                                                X_code=pad_code_train,
                                                Y=labels_train)
            for batch in batches_train:
                pad_msg, pad_code, labels = as_tensors(batch)
                optimizer.zero_grad()
                predict, uncertainty = model.forward(pad_msg, pad_code)
                loss = criterion(predict, labels)
                loss.backward()
                optimizer.step()

                steps += 1
                if steps % params.log_interval == 0:
                    print('\rEpoch: {} step: {} - loss: {:.6f}'.format(
                        epoch, steps, loss.item()))

            print('Epoch: %i ---Training data' % (epoch))
            acc, prc, rc, f1, auc_ = evaluation_confidnet(data=batches_train,
                                                          model=model)
            print(summary % (acc, prc, rc, f1, auc_))
            print('Epoch: %i ---Testing data' % (epoch))
            acc, prc, rc, f1, auc_ = evaluation_confidnet(data=batches_test,
                                                          model=model)
            print(summary % (acc, prc, rc, f1, auc_))
            write_log.append(
                'Epoch - testing: %i --- Accuracy: %f -- Precision: %f -- Recall: %f -- F1: %f -- AUC: %f'
                % (epoch, acc, prc, rc, f1, auc_))
            if epoch % 5 == 0:
                save(model, params.save_dir, 'epoch', epoch)
        write_file(params.save_dir + '/log.txt', write_log)

    if options == 'confidnet':
        # create and train the defect model
        model = DefectNet(args=params)
        if torch.cuda.is_available():
            model = model.cuda()

        # Start from the stored snapshot of the selected project.
        if params.project == 'openstack':
            model.load_state_dict(
                torch.load('./snapshot/2020-05-17_09-37-57/epoch_55.pt'),
                strict=True)
        if params.project == 'qt':
            model.load_state_dict(
                torch.load('./snapshot/2020-05-17_12-50-56/epoch_15.pt'),
                strict=True)

        model = freeze_layers(model=model, freeze_uncertainty_layers=False)
        print('Training model with options', options)
        for param in model.named_parameters():
            print(param[0], param[1].requires_grad)

        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=params.l2_reg_lambda)
        steps = 0
        batches_test = mini_batches(X_msg=pad_msg_test, X_code=pad_code_test,
                                    Y=labels_test)
        write_log = []
        for epoch in range(1, params.num_epochs + 1):
            # building batches for training model
            batches_train = mini_batches_update(X_msg=pad_msg_train,
                                                X_code=pad_code_train,
                                                Y=labels_train)
            for batch in batches_train:
                pad_msg, pad_code, labels = as_tensors(batch)
                optimizer.zero_grad()
                predict, uncertainty = model.forward(pad_msg, pad_code)
                loss = confid_mse_loss((predict, uncertainty), labels,
                                       args=params)
                loss.backward()
                optimizer.step()

                steps += 1
                if steps % params.log_interval == 0:
                    print('\rEpoch: {} step: {} - loss: {:.6f}'.format(
                        epoch, steps, loss.item()))

            print('Epoch: %i ---Training data' % (epoch))
            auc_ = evaluation_uncertainty(data=batches_train, model=model)
            print('AUC: %f' % (auc_))
            print('Epoch: %i ---Testing data' % (epoch))
            auc_ = evaluation_uncertainty(data=batches_test, model=model)
            print('AUC: %f' % (auc_))
            write_log.append('Epoch - testing: %i --- AUC: %f' %
                             (epoch, auc_))
            if epoch % 5 == 0:
                save(model, params.save_dir, 'epoch', epoch)
        write_file(params.save_dir + '/log.txt', write_log)
def running_baseline_july(tf, folds, random_state): FLAGS = tf.flags.FLAGS commits_ = extract_commit_july(path_file=FLAGS.path) filter_commits = commits_ print len(commits_) kf = KFold(n_splits=folds, random_state=random_state) idx_folds = list() for train_index, test_index in kf.split(filter_commits): idx = dict() idx["train"], idx["test"] = train_index, test_index idx_folds.append(idx) if "msg" in FLAGS.model: msgs_, codes_ = extract_msg(commits=filter_commits), extract_code(commits=filter_commits) elif "all" in FLAGS.model: msgs_, codes_ = extract_msg(commits=filter_commits), extract_code(commits=filter_commits) all_lines = add_two_list(list1=msgs_, list2=codes_) msgs_ = all_lines elif "code" in FLAGS.model: msgs_, codes_ = extract_msg(commits=filter_commits), extract_code(commits=filter_commits) msgs_ = codes_ else: print "You need to type correct model" exit() dict_msg_, dict_code_ = dictionary(data=msgs_), dictionary(data=codes_) pad_msg = mapping_commit_msg(msgs=msgs_, max_length=FLAGS.msg_length, dict_msg=dict_msg_) labels = load_label_commits(commits=filter_commits) labels = convert_to_binary(labels) print pad_msg.shape, labels.shape, len(dict_msg_) # exit() timestamp = str(int(time.time())) accuracy, precision, recall, f1, auc = list(), list(), list(), list(), list() cntfold = 0 pred_dict, pred_dict_prob = dict(), dict() for i in xrange(cntfold, len(idx_folds)): idx = idx_folds[i] train_index, test_index = idx["train"], idx["test"] X_train_msg, X_test_msg = np.array(get_items(items=pad_msg, indexes=train_index)), \ np.array(get_items(items=pad_msg, indexes=test_index)) Y_train, Y_test = np.array(get_items(items=labels, indexes=train_index)), \ np.array(get_items(items=labels, indexes=test_index)) if FLAGS.model == "lstm_cnn_msg" or FLAGS.model == "lstm_cnn_code" or FLAGS.model == "lstm_cnn_all": model = lstm_cnn(x_train=X_train_msg, y_train=Y_train, x_test=X_test_msg, y_test=Y_test, dictionary_size=len(dict_msg_), FLAGS=FLAGS) elif FLAGS.model == 
"cnn_msg" or FLAGS.model == "cnn_code" or FLAGS.model == "cnn_all": model = cnn_model(x_train=X_train_msg, y_train=Y_train, x_test=X_test_msg, y_test=Y_test, dictionary_size=len(dict_msg_), FLAGS=FLAGS) else: print "You need to give correct model name" exit() # model.save("./keras_model/" + FLAGS.model + "_" + str(cntfold) + ".h5") # model.save("./keras_model/" + FLAGS.model + "_" + str(cntfold) + "_testing.h5") # model.save("./keras_model/test_" + FLAGS.model + "_" + str(cntfold) + ".h5") model.save("./keras_model/newres_funcalls_" + FLAGS.model + "_" + str(cntfold) + ".h5") y_pred = model.predict(X_test_msg, batch_size=FLAGS.batch_size) y_pred = np.ravel(y_pred) y_pred_tolist = y_pred.tolist() data_fold = [str(i) + "\t" + str(l) for i, l in zip(test_index, y_pred)] path_file = "./statistical_test/newres_funcalls_%s_fold_%s.txt" % (FLAGS.model, str(cntfold)) write_file(path_file=path_file, data=data_fold) y_pred[y_pred > 0.5] = 1 y_pred[y_pred <= 0.5] = 0 pred_dict.update(make_dictionary(y_pred=y_pred, y_index=test_index)) accuracy.append(accuracy_score(y_true=Y_test, y_pred=y_pred)) precision.append(precision_score(y_true=Y_test, y_pred=y_pred)) recall.append(recall_score(y_true=Y_test, y_pred=y_pred)) f1.append(f1_score(y_true=Y_test, y_pred=y_pred)) auc.append(auc_score(y_true=Y_test, y_pred=y_pred)) print "accuracy", accuracy_score(y_true=Y_test, y_pred=y_pred) print "precision", precision_score(y_true=Y_test, y_pred=y_pred) print "recall", recall_score(y_true=Y_test, y_pred=y_pred) print "f1", f1_score(y_true=Y_test, y_pred=y_pred) cntfold += 1 break
def cross_validation_ver2(id, X, y, algorithm, folds): kf = KFold(n_splits=folds, random_state=None) kf.get_n_splits(X=X) accuracy, precision, recall, f1 = list(), list(), list(), list() probs = list() for train_index, test_index in kf.split(X): X_train, y_train = get_items(items=X, indexes=train_index), get_items( items=y, indexes=train_index) X_test, y_test = get_items(items=X, indexes=test_index), get_items( items=y, indexes=test_index) id_train, id_test = get_items( items=id, indexes=train_index), get_items(items=id, indexes=test_index) vectorizer = CountVectorizer() X_train = vectorizer.fit_transform(X_train) X_test = vectorizer.transform(X_test) # X = vectorizer.transform(X) # eval_train, eval_labels = loading_data("./data/3_mar7/typeaddres.out") # eval_train = vectorizer.transform(eval_train) if algorithm == "svm": clf = LinearSVC(random_state=0) elif algorithm == "lr": clf = LogisticRegression() elif algorithm == "dt": clf = DecisionTreeClassifier() else: print "Wrong algorithm name -- please retype again" exit() clf.fit(X=X_train, y=y_train) accuracy.append( accuracy_score(y_true=y_test, y_pred=clf.predict(X_test))) precision.append( precision_score(y_true=y_test, y_pred=clf.predict(X_test))) recall.append(recall_score(y_true=y_test, y_pred=clf.predict(X_test))) f1.append(f1_score(y_true=y_test, y_pred=clf.predict(X_test))) # print accuracy, precision, recall, f1 # print X_test.shape # y_pred = clf.predict(X_test) # y_pred_proba = clf.predict_proba(X_test)[:, 1] # y_pred_log_proba = clf.predict_log_proba(X_test) # print clf.predict_proba(X_test).shape # print clf.predict_log_proba(X_test).shape # exit() # probs += clf.predict_proba(X_test)[:, 1] probs = np.concatenate((probs, clf.predict_proba(X_test)[:, 1]), axis=0) # accuracy.append(accuracy_score(y_true=eval_labels, y_pred=clf.predict(eval_train))) # precision.append(precision_score(y_true=eval_labels, y_pred=clf.predict(eval_train))) # recall.append(recall_score(y_true=eval_labels, 
y_pred=clf.predict(eval_train))) # f1.append(f1_score(y_true=eval_labels, y_pred=clf.predict(eval_train))) # break print accuracy, "Accuracy of %s: %f" % (algorithm, avg_list(accuracy)) print precision, "Precision of %s: %f" % (algorithm, avg_list(precision)) print recall, "Recall of %s: %f" % (algorithm, avg_list(recall)) print f1, "F1 of %s: %f" % (algorithm, avg_list(f1)) path_write = "./statistical_test_prob/%s.txt" % (algorithm) write_file(path_file=path_write, data=probs) print len(probs)
auc.append(auc_score(y_true=Y_test, y_pred=y_pred)) # print "Accuracy of %s: %f" % (FLAGS.model, avg_list(accuracy)) # print "Precision of %s: %f" % (FLAGS.model, avg_list(precision)) # print "Recall of %s: %f" % (FLAGS.model, avg_list(recall)) # print "F1 of %s: %f" % (FLAGS.model, avg_list(f1)) # print "AUC of %s: %f" % (FLAGS.model, avg_list(auc)) # path_file = "./statistical_test/3_mar7/" + FLAGS.model + ".txt" # write_file(path_file, y_pred) # print "Accuracy of %s: %f" % (FLAGS.model, avg_list(accuracy)) # print "Precision of %s: %f" % (FLAGS.model, avg_list(precision)) # print "Recall of %s: %f" % (FLAGS.model, avg_list(recall)) # print "F1 of %s: %f" % (FLAGS.model, avg_list(f1)) # cntfold += 1 # exit() path_file = "./statistical_test_ver2/3_mar7/rerun_" + FLAGS.model + ".txt" write_file(path_file=path_file, data=sorted_dict(dict=pred_dict)) print accuracy, "Accuracy and std of %s: %f %f" % ( FLAGS.model, np.mean(np.array(accuracy)), np.std(np.array(accuracy))) print precision, "Precision of %s: %f %f" % ( FLAGS.model, np.mean(np.array(precision)), np.std(np.array(precision))) print recall, "Recall of %s: %f %f" % ( FLAGS.model, np.mean(np.array(recall)), np.std(np.array(recall))) print f1, "F1 of %s: %f %f" % (FLAGS.model, np.mean( np.array(f1)), np.std(np.array(f1))) print auc, "AUC of %s: %f %f" % (FLAGS.model, np.mean( np.array(auc)), np.std(np.array(auc))) print_params(tf) exit()
def print_label_data(path, name, commits):
    """Write one binary label per commit to the file ``path + "/" + name``.

    A commit is labelled 1 when its "stable" entry equals the string
    "true" and 0 otherwise.  The labels are handed to ``write_file`` in the
    order of ``commits``.  Always returns None.
    """
    binary_labels = []
    for commit in commits:
        if commit["stable"] == "true":
            binary_labels.append(1)
        else:
            binary_labels.append(0)
    target = path + "/" + name
    write_file(path_file=target, data=binary_labels)
    return None
patchNet = load_probability_score(model="PatchNet", threshold=None) lstm_cnn = load_probability_score(model="LS-CNN", threshold=None) lpu_svm = load_probability_score(model="LPU-SVM", threshold=None) sasha = load_probability_score(model="sasha_results", threshold=50) print len(true_label), len(patchNet), len(lstm_cnn), len(lpu_svm), len( sasha) # # good commits can detect using patchNet # good_commit_id = [] # for i in xrange(0, len(true_label)): # # if true_label[i] == 1 and patchNet[i] == 1 and lstm_cnn[i] == 0 and lpu_svm[i] == 0 and sasha[i] == 0: # # good_commit_id.append(commits_id[i]) # if true_label[i] == 1 and patchNet[i] == 1 and sasha[i] == 0 and lstm_cnn[i] == 0 and lpu_svm[i] == 0: # good_commit_id.append(commits_id[i]) # # print len(good_commit_id) # path_write = "./statistical_test_prob_ver3/good_commits.txt" # write_file(path_file=path_write, data=good_commit_id) # # exit() bad_commit_id = [] for i in xrange(0, len(true_label)): if true_label[i] == 1 and patchNet[i] == 0 and (lpu_svm[i] == 1 or sasha[i] == 1): bad_commit_id.append(commits_id[i]) print len(bad_commit_id) # exit() path_write = "./statistical_test_prob_ver3/bad_commits.txt" write_file(path_file=path_write, data=bad_commit_id)
else: predict = model.forward(pad_msg, pad_code, pad_ftr).detach().numpy().tolist() all_predict += predict all_label += labels.tolist() acc, prc, rc, f1, auc_ = evaluation_metrics(y_pred=all_predict, y_true=all_label) print('Accuracy: %f -- Precision: %f -- Recall: %f -- F1: %f -- AUC: %f' % (acc, prc, rc, f1, auc_)) return acc, prc, rc, f1, auc_ if __name__ == '__main__': project = 'openstack' # project = 'qt' training, testing, dictionary = loading_data(project=project) input_option = read_args().parse_args() input_help = read_args().print_help() input_option.filter_sizes = [int(k) for k in input_option.filter_sizes.split(',')] model, data_test, data_train = construct_model(data=(training, testing, dictionary), params=input_option) # input_option.start_epoch, input_option.end_epoch, input_option.step = 5, 50, 5 # input_option.datetime = '2019-01-17_17-15-05' results = list() for epoch in range(input_option.start_epoch, input_option.end_epoch, input_option.step): dir = './snapshot/' + input_option.datetime + '/epoch_' + str(epoch) + '.pt' print('--Epoch: %i' % epoch) acc, prc, rc, f1, auc_ = eval_dir(dir=dir, data=data_test, model=model) results.append('Epoch: %i -- Accuracy: %f -- Precision: %f -- Recall: %f -- F1: %f -- AUC: %f' % (epoch, acc, prc, rc, f1, auc_)) path_save = './snapshot/' + input_option.datetime + '.txt' write_file(path_file=path_save, data=results)
return root_commit + '\t' + str(msg_length(c)) + '\t' + str(num_code_hunk(c)) \ + '\t' + str(num_code_line(c)) if __name__ == '__main__': path_data = './newres_funcalls_jul28.out.sorted.satisfy' commits_ = extract_commit_july(path_file=path_data) print len(commits_) # path_good_commits = './statistical_test_prob_ver3/good_commits.txt' # good_commits = load_file(path_file=path_good_commits) # print len(good_commits) # # patch_good_commits = [] # for c in good_commits: # print finding_patch_info(root_commit=c, commits=commits_) # patch_good_commits.append(finding_patch_info(root_commit=c, commits=commits_)) # write_file(path_file="./statistical_test_prob_ver3/good_commits_patchInfo.txt", data=patch_good_commits) path_good_commits = './statistical_test_prob_ver3/bad_commits.txt' good_commits = load_file(path_file=path_good_commits) print len(good_commits) patch_good_commits = [] for c in good_commits: print finding_patch_info(root_commit=c, commits=commits_) patch_good_commits.append( finding_patch_info(root_commit=c, commits=commits_)) write_file( path_file="./statistical_test_prob_ver3/ba_commits_patchInfo.txt", data=patch_good_commits)