Example #1
def baseline_testing(X_train, y_train, X_test, y_test, algorithm, type):
    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)
    print X_train.shape, X_test.shape

    if algorithm == "svm":
        clf = LinearSVC(random_state=0)
    elif algorithm == "lr":
        clf = LogisticRegression()
    elif algorithm == "dt":
        clf = DecisionTreeClassifier()
    elif algorithm == "nb":
        clf = GaussianNB()
    else:
        print "Wrong algorithm name -- please retype again"
        exit()

    # GaussianNB requires dense input, so densify for both fit and predict.
    clf.fit(X=X_train.toarray(), y=y_train)
    y_pred = clf.predict(X_test.toarray())
    path_write = "./data_test_data_pred_results/cnn_" + type + ".txt"
    write_file(path_file=path_write, data=y_pred)
    print "Accuracy: ", accuracy_score(y_true=y_test, y_pred=y_pred)
    print "Precision: ", precision_score(y_true=y_test, y_pred=y_pred)
    print "Recall: ", recall_score(y_true=y_test, y_pred=y_pred)
    print "F1: ", f1_score(y_true=y_test, y_pred=y_pred)
    print "AUC: ", auc_score(y_true=y_test, y_pred=y_pred)
Example #2
def qualitative_looking(path_correctId, path_label):
    id_root = load_file(path_file=path_correctId)
    print len(id_root)

    data_label = load_file(path_file=path_label)
    id_label = [d.split("\t")[0] for d in data_label]
    print len(data_label)

    index_id_root = [id_label.index(i) for i in id_root]

    for id_ in id_root:
        path_id = "./qualitative_analysis/cosine_sim/" + id_ + ".txt"
        cosine_data = load_file(path_file=path_id)
        print len(cosine_data)
        cosine_data = map(float, cosine_data)
        order_cosine = sorted(cosine_data, reverse=True)
        write_data = dict()
        for jid in index_id_root:
            name_id = id_label[jid]
            cosine_score = cosine_data[jid]
            position_ = order_cosine.index(cosine_score)
            # print name_id + "\t" + str(cosine_score) + "\t" + str(position_ + 1)
            # write_data.append(name_id + "\t" + str(cosine_score) + "\t" + str(position_ + 1))
            write_data[name_id] = position_ + 1

        new_write_data = list()
        for w in sorted(write_data, key=write_data.get):
            print w, write_data[w]
            new_write_data.append(w + "\t" + str(write_data[w]))

        path_write = "./qualitative_analysis/cosine_sim_order/" + id_ + ".txt"
        write_file(path_file=path_write, data=new_write_data)
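# The ranking above (sort descending, then list.index per score) is quadratic
# and gives tied scores their smallest rank. A vectorised equivalent, offered
# as an alternative sketch rather than the original code, using scipy's
# rankdata with method="min":
from scipy.stats import rankdata


def rank_descending(scores):
    # Rank 1 = highest cosine score; tied scores share the lowest rank,
    # matching sorted(scores, reverse=True).index(s) + 1.
    return rankdata([-float(s) for s in scores], method="min").astype(int)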
Example #3
def get_commit_satisfy_condition(path_data_, nfile, nhunk, nline, nleng):
    commits_structure = extract_commit_july(path_file=path_data_)
    # nfile, nhunk, nline, nleng = 1, 8, 10, 120
    filter_commits = filtering_commit_union(commits=commits_structure,
                                            num_file=nfile,
                                            num_hunk=nhunk,
                                            num_loc=nline,
                                            size_line=nleng)
    print len(commits_structure), len(filter_commits)

    commits = load_file(path_data_)
    indexes = commits_index(commits=commits)
    new_commits = list()
    for i in xrange(0, len(indexes)):
        if i == len(indexes) - 1:
            id = commit_id(commit=commits[indexes[i]:])
            if id in filter_commits:
                new_commits += commits[indexes[i]:]
        else:
            id = commit_id(commit=commits[indexes[i]:indexes[i + 1]])
            if id in filter_commits:
                new_commits += commits[indexes[i]:indexes[i + 1]]
        print i, id
    # write_file("./satisfy_typediff_sorted.out", new_commits)
    write_file(path_data_ + ".satisfy", new_commits)
Example #4
def predict_model(commits, params):
    path_dict = os.path.abspath(os.path.join(os.path.curdir, params.model))
    dict_msg = load_dict_file(path_file=path_dict + '/dict_msg.txt')
    dict_code = load_dict_file(path_file=path_dict + '/dict_code.txt')

    pad_msg, pad_added_code, pad_removed_code, labels = padding_pred_commit(
        commits=commits, params=params, dict_msg=dict_msg, dict_code=dict_code)
    # print pad_msg.shape, pad_added_code.shape, pad_removed_code.shape, labels.shape
    checkpoint_dir = path_dict
    checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=params.allow_soft_placement,
            log_device_placement=params.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph(
                "{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_msg = graph.get_operation_by_name("input_msg").outputs[0]
            input_addedcode = graph.get_operation_by_name(
                "input_addedcode").outputs[0]
            input_removedcode = graph.get_operation_by_name(
                "input_removedcode").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name(
                "dropout_keep_prob").outputs[0]

            # Tensors we want to evaluate
            scores = graph.get_operation_by_name("output/scores").outputs[0]

            # Generate batches for one epoch
            batches = mini_batches(X_msg=pad_msg,
                                   X_added_code=pad_added_code,
                                   X_removed_code=pad_removed_code,
                                   Y=labels,
                                   mini_batch_size=params.batch_size)
            # Collect the predictions here
            commits_scores = list()

            for batch in batches:
                batch_input_msg, batch_input_added_code, batch_input_removed_code, batch_input_labels = batch
                batch_scores = sess.run(
                    scores, {
                        input_msg: batch_input_msg,
                        input_addedcode: batch_input_added_code,
                        input_removedcode: batch_input_removed_code,
                        dropout_keep_prob: 1.0
                    })
                batch_scores = np.ravel(softmax(batch_scores)[:, [1]])
                commits_scores = np.concatenate([commits_scores, batch_scores])
            write_file(
                path_file=os.path.abspath(os.path.join(os.path.curdir)) +
                '/prediction.txt',
                data=commits_scores)
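# softmax is applied to the raw scores before taking the probability of
# class 1; a standard numerically stable, row-wise version consistent with
# that use (the original helper is not shown):
import numpy as np


def softmax(x):
    # Row-wise softmax over a 2-D array of logits.
    x = np.asarray(x, dtype=float)
    x = x - x.max(axis=1, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=1, keepdims=True)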
Example #5
def print_false_negative(id, y_pred, threshold, y_true):
    y_pred = [1 if float(y) > threshold else 0 for y in y_pred]
    false_negative = []
    for i, p, t in zip(id, y_pred, y_true):
        if p == 0 and t == 1:
            false_negative.append(i)
    print len(false_negative)
    path_write = "./sasha_results/false_neg_%s.txt" % (str(threshold))
    write_file(path_file=path_write, data=false_negative)
Example #6
def print_true_positive(id, y_pred, threshold, y_true):
    y_pred = [1 if float(y) > threshold else 0 for y in y_pred]
    true_positive = []
    for i, p, t in zip(id, y_pred, y_true):
        if p == t and t == 1:
            true_positive.append(i)
    print len(true_positive)
    path_write = "./sasha_results/true_pos_%s.txt" % (str(threshold))
    write_file(path_file=path_write, data=true_positive)
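Example #7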
def getting_overlap(commits, ids_ftr, data_ftr, path_name):
    new_ftr = list()
    for c in commits:
        if c["id"] in ids_ftr:
            index_ftr = ids_ftr.index(c["id"])
            label_ftr = c["stable"]
            line_ftr = data_ftr[index_ftr] + "," + label_ftr
            new_ftr.append(line_ftr)
    write_file(path_name, new_ftr)
    return new_ftr
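Example #8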
def balance_data_ICSE(path):
    data = load_file(path_file=path)
    new_data, cnt = list(), 0
    for d in data:
        if "true" in d and cnt <= 11165:
            new_data.append(d.strip())
            cnt += 1
        elif "false" in d:
            new_data.append(d.strip())
    shuffle(new_data)
    write_file(path_file="./data/3_mar7/new_features_ver1.txt", data=new_data)
    exit()
Example #9
def cosine_similarity(path_root, id_commit, data):
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(data)
    data_root = load_file(path_file=path_root)
    for id_root in data_root:
        results = list()
        index_ = id_commit.index(id_root)
        X_root = X[index_, :].toarray().flatten()
        for i in xrange(len(id_commit)):
            results.append(
                1 -
                spatial.distance.cosine(X_root, X[i, :].toarray().flatten()))
        write_file(path_file="./qualitative_analysis/cosine_sim/" + id_root +
                   ".txt",
                   data=results)
    return None
Example #10
def restruct_root(roots, path, type):
    min_files = min([len(r) for r in roots])
    new_roots = [r[:min_files] for r in roots]
    print len(new_roots), len(new_roots[0])

    for i in xrange(0, len(new_roots[0])):
        model = list()
        for j in xrange(0, len(new_roots)):
            print path + "/" + new_roots[j][i]
            model += load_file(path + "/" + new_roots[j][i])
        model_name = "model-" + new_roots[j][i].split("-")[-1].replace(
            ".txt", "")
        print type, model_name
        # exit()
        path_write = "./patchNet_mergeResults/%s_%s.txt" % (type, model_name)
        write_file(path_file=path_write, data=model)
    return None
Example #11
def get_predict(name, X, y, algorithm, folds):
    # Note: KFold only applies random_state when shuffle=True; recent
    # scikit-learn versions raise an error for this combination.
    kf = KFold(n_splits=folds, random_state=0)
    kf.get_n_splits(X=X)
    auc, accuracy, precision, recall, f1 = list(), list(), list(), list(), list()
    pred_dict = dict()
    for train_index, test_index in kf.split(X):
        X_train, y_train = get_items(items=X, indexes=train_index), get_items(items=y, indexes=train_index)
        X_test, y_test = get_items(items=X, indexes=test_index), get_items(items=y, indexes=test_index)

        vectorizer = CountVectorizer()
        X_train = vectorizer.fit_transform(X_train)
        X_test = vectorizer.transform(X_test)

        if algorithm == "svm":
            clf = LinearSVC(random_state=0)
        elif algorithm == "lr":
            clf = LogisticRegression()
        elif algorithm == "dt":
            clf = DecisionTreeClassifier()
        else:
            print "Wrong algorithm name -- please retype again"
            exit()

        clf.fit(X=X_train, y=y_train)
        y_pred = clf.predict(X_test)
        pred_dict.update(make_dictionary(y_pred=y_pred, y_index=test_index))
        # path_file = "./statistical_test/" + name + "_" + algorithm + ".txt"
        # path_file = "./statistical_test/3_mar7/" + name + "_" + algorithm + ".txt"
        # path_file = "./statistical_test_ver2/3_mar7" + name + "_" + algorithm + ".txt"
        # write_file(path_file, y_pred)
        accuracy.append(accuracy_score(y_true=y_test, y_pred=y_pred))
        precision.append(precision_score(y_true=y_test, y_pred=y_pred))
        recall.append(recall_score(y_true=y_test, y_pred=y_pred))
        f1.append(f1_score(y_true=y_test, y_pred=y_pred))
        auc.append(auc_score(y_true=y_test, y_pred=y_pred))

    path_file = "./statistical_test_ver2/3_mar7/" + name + "_" + algorithm + ".txt"
    write_file(path_file=path_file, data=sorted_dict(dict=pred_dict))
    print "Accuracy and std of %s: %f %f" % (algorithm, np.mean(np.array(accuracy)), np.std(np.array(accuracy)))
    print "Precision of %s: %f %f" % (algorithm, np.mean(np.array(precision)), np.std(np.array(precision)))
    print "Recall of %s: %f %f" % (algorithm, np.mean(np.array(recall)), np.std(np.array(recall)))
    print "F1 of %s: %f %f" % (algorithm, np.mean(np.array(f1)), np.std(np.array(f1)))
    print "AUC of %s: %f %f" % (algorithm, np.mean(np.array(auc)), np.std(np.array(auc)))
def get_predict_ICSE_writePred(X_train, y_train, X_test, y_test,
                               algorithm, path_write):
    if algorithm == "svm":
        clf = LinearSVC()
    elif algorithm == "lr":
        clf = LogisticRegression()
    elif algorithm == "dt":
        clf = DecisionTreeClassifier()
    else:
        print "Wrong algorithm name -- please retype again"
        exit()

    clf.fit(X=X_train, y=y_train)
    y_pred = clf.predict(X_test)
    write_file(path_file=path_write, data=y_pred)
    print "Accuracy of %s: %f" % (algorithm, accuracy_score(y_true=y_test, y_pred=y_pred))
    print "Precision of %s: %f" % (algorithm, precision_score(y_true=y_test, y_pred=y_pred))
    print "Recall of %s: %f" % (algorithm, recall_score(y_true=y_test, y_pred=y_pred))
    print "F1 of %s: %f" % (algorithm, f1_score(y_true=y_test, y_pred=y_pred))
    print "AUC of %s: %f" % (algorithm, auc_score(y_true=y_test, y_pred=y_pred))
Example #13
def get_commit_id_and_date(path_data_):
    commits = load_file(path_data_)
    indexes = commits_index(commits=commits)
    dicts = {}
    for i in xrange(0, len(indexes)):
        if i == len(indexes) - 1:
            date = commit_date_july(commit=commits[indexes[i]:])
        else:
            date = commit_date_july(commit=commits[indexes[i]:indexes[i + 1]])
        dicts[i] = int(date)
    sort_dicts = sorted(dicts.items(), key=operator.itemgetter(1))
    new_commits = list()
    for d in sort_dicts:
        index, date = d[0], d[1]
        print index, date
        if index == len(sort_dicts) - 1:
            new_commits += commits[indexes[index]:]
        else:
            new_commits += commits[indexes[index]:indexes[index + 1]]
    # write_file("./typediff_sorted.out", new_commits)
    write_file(path_data_ + ".sorted", new_commits)
Example #14
def creating_sasha_data(path_data_, folds, random_state):
    commits_structure = extract_commit_july(path_file=path_data_)
    commits_id = [c["id"] for c in commits_structure]
    commits_label = ["stable" if c["stable"] == "true" else "nonstable" for c in commits_structure]
    commits_id_label = [id_ + "\t" + label_ for id_, label_ in zip(commits_id, commits_label)]

    kf = KFold(n_splits=folds, random_state=random_state)
    cnt_fold = 1
    for train_index, test_index in kf.split(commits_structure):
        train_id, train_label = get_elements(commits=commits_id, indexes=train_index), get_elements(
            commits=commits_label, indexes=train_index)
        test_id, test_label = get_elements(commits=commits_id, indexes=test_index), get_elements(
            commits=commits_label, indexes=test_index)
        train_file, test_file = get_elements(commits=commits_id_label, indexes=train_index), get_elements(
            commits=commits_id_label, indexes=test_index)
        print len(train_id), len(train_label)
        print len(test_id), len(test_label)
        print len(train_file), len(test_file)

        write_file(path_file="./sasha_data/fold" + str(cnt_fold) + "/" + "train.txt", data=train_file)
        write_file(path_file="./sasha_data/fold" + str(cnt_fold) + "/" + "test.txt", data=test_file)
        cnt_fold += 1
Example #15
def baseline_ver3(id, train, label, algorithm):
    X_train, y_train = train, label
    X_test, y_test = train, label
    id_train, id_test = id, id

    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)
    # X = vectorizer.transform(X)

    # eval_train, eval_labels = loading_data("./data/3_mar7/typeaddres.out")
    # eval_train = vectorizer.transform(eval_train)

    if algorithm == "svm":
        clf = LinearSVC(random_state=0)
    elif algorithm == "lr":
        clf = LogisticRegression()
    elif algorithm == "dt":
        clf = DecisionTreeClassifier()
    else:
        print "Wrong algorithm name -- please retype again"
        exit()

    clf.fit(X=X_train, y=y_train)
    accuracy = accuracy_score(y_true=y_test, y_pred=clf.predict(X_test))
    precision = precision_score(y_true=y_test, y_pred=clf.predict(X_test))
    recall = recall_score(y_true=y_test, y_pred=clf.predict(X_test))
    f1 = f1_score(y_true=y_test, y_pred=clf.predict(X_test))
    auc = auc_score(y_true=y_test, y_pred=clf.predict(X_test))

    print "Accuracy:", accuracy
    print "Precision:", precision
    print "Recall:", recall
    print "F1:", f1
    print "AUC:", auc

    # Note: LinearSVC has no predict_proba, so this line only works for the
    # "lr" and "dt" branches.
    probs = clf.predict_proba(X_test)[:, 1]
    path_write = "./statistical_test_ver2/%s.txt" % (algorithm)
    write_file(path_file=path_write, data=probs)
Example #16
    # (opening branch reconstructed to match the parallel code in Example #23)
    if name == "lstm_cnn_msg" or name == "lstm_cnn_code" or name == "lstm_cnn_all":
        model = lstm_cnn(x_train=pad_msg_train,
                         y_train=Y_train,
                         x_test=pad_msg_test,
                         y_test=Y_test,
                         dictionary_size=len(dict_train),
                         FLAGS=FLAGS)
    elif name == "cnn_msg" or name == "cnn_code" or name == "cnn_all":
        model = cnn_model(x_train=pad_msg_train,
                          y_train=Y_train,
                          x_test=pad_msg_test,
                          y_test=Y_test,
                          dictionary_size=len(dict_train),
                          FLAGS=FLAGS)

    model.save("./lstm_model_ver3/" + name + ".h5")
    y_pred = model.predict(pad_msg_test, batch_size=FLAGS.batch_size)
    y_pred = np.ravel(y_pred)
    y_pred[y_pred > 0.5] = 1
    y_pred[y_pred <= 0.5] = 0

    path_file = "./data/test_data_pred_results/" + name + ".txt"
    write_file(path_file, y_pred)
    print "Accuracy of %s: %f" % (name,
                                  accuracy_score(y_true=Y_test, y_pred=y_pred))
    print "Precision of %s: %f" % (
        name, precision_score(y_true=Y_test, y_pred=y_pred))
    print "Recall of %s: %f" % (name, recall_score(y_true=Y_test,
                                                   y_pred=y_pred))
    print "F1 of %s: %f" % (name, f1_score(y_true=Y_test, y_pred=y_pred))
    print "AUC of %s: %f" % (name, f1_score(y_true=Y_test, y_pred=y_pred))
Example #17
def eval_patchNet_train_test(tf, checkpoint_dir, test):
    FLAGS = tf.flags.FLAGS
    allow_soft_placement = True  # "Allow device soft device placement"
    log_device_placement = False  # "Log placement of ops on devices"
    dirs = get_all_checkpoints(checkpoint_dir=checkpoint_dir)
    graph = tf.Graph()

    X_test_msg, X_test_added_code, X_test_removed_code, y_test = test

    for checkpoint_file in dirs:
        with graph.as_default():
            session_conf = tf.ConfigProto(
                allow_soft_placement=allow_soft_placement,
                log_device_placement=log_device_placement)
            sess = tf.Session(config=session_conf)

            with sess.as_default():
                # Load the saved meta graph and restore variables
                saver = tf.train.import_meta_graph(
                    "{}.meta".format(checkpoint_file))
                saver.restore(sess, checkpoint_file)

                # Get the placeholders from the graph by name
                input_msg = graph.get_operation_by_name("input_msg").outputs[0]
                input_addedcode = graph.get_operation_by_name(
                    "input_addedcode").outputs[0]
                input_removedcode = graph.get_operation_by_name(
                    "input_removedcode").outputs[0]
                dropout_keep_prob = graph.get_operation_by_name(
                    "dropout_keep_prob").outputs[0]

                # Tensors we want to evaluate
                predictions = graph.get_operation_by_name(
                    "output/predictions").outputs[0]
                scores = graph.get_operation_by_name(
                    "output/scores").outputs[0]

                # Generate batches for one epoch
                batches = mini_batches(X_msg=X_test_msg,
                                       X_added_code=X_test_added_code,
                                       X_removed_code=X_test_removed_code,
                                       Y=y_test,
                                       mini_batch_size=FLAGS.batch_size)

                # Collect the predictions here
                all_predictions, all_scores = [], []

                for batch in batches:
                    batch_input_msg, batch_input_added_code, batch_input_removed_code, batch_input_labels = batch
                    batch_predictions = sess.run(
                        predictions, {
                            input_msg: batch_input_msg,
                            input_addedcode: batch_input_added_code,
                            input_removedcode: batch_input_removed_code,
                            dropout_keep_prob: 1.0
                        })
                    # print batch_predictions.shape
                    all_predictions = np.concatenate(
                        [all_predictions, batch_predictions])

                    batch_scores = sess.run(
                        scores, {
                            input_msg: batch_input_msg,
                            input_addedcode: batch_input_added_code,
                            input_removedcode: batch_input_removed_code,
                            dropout_keep_prob: 1.0
                        })
                    batch_scores = np.ravel(softmax(batch_scores)[:, [1]])
                    # print batch_scores.shape
                    all_scores = np.concatenate([all_scores, batch_scores])
        split_checkpoint_file = checkpoint_file.split("/")
        path_write = "./patchNet_results/%s_%s.txt" % (
            split_checkpoint_file[-3], split_checkpoint_file[-1])
        write_file(path_file=path_write, data=all_scores)
        print checkpoint_file, "Accuracy:", accuracy_score(
            y_true=convert_to_binary(y_test), y_pred=all_predictions)
        print checkpoint_file, "Precision:", precision_score(
            y_true=convert_to_binary(y_test), y_pred=all_predictions)
        print checkpoint_file, "Recall:", recall_score(
            y_true=convert_to_binary(y_test), y_pred=all_predictions)
        print checkpoint_file, "F1:", f1_score(
            y_true=convert_to_binary(y_test), y_pred=all_predictions)
        print checkpoint_file, "AUC:", auc_score(
            y_true=convert_to_binary(y_test), y_pred=all_predictions)
        print "\n"
Example #18
    print X.shape
    # exit()

    path_good_commits = "./statistical_test_prob_ver3/good_commits.txt"
    good_commits = load_file(path_file=path_good_commits)
    print "Leng of good commits: %s" % (str(len(good_commits)))

    write_data = []
    for g in good_commits:
        write_data += similarity_good_commit(id=commits_id,
                                             root=g,
                                             all=X,
                                             top_k=50)
        # break
    path_write = "./statistical_test_prob_ver2/good_commits_results.txt"
    write_file(path_file=path_write, data=write_data)
    # exit()
    ####################################################################################
    ####################################################################################
    path_bad_commits = "./statistical_test_prob_ver2/bad_commits.txt"
    bad_commits = load_file(path_file=path_bad_commits)
    print "Leng of bad commits: %s" % (str(len(bad_commits)))

    write_data = []
    for g in bad_commits:
        write_data += similarity_bad_commit(id=commits_id,
                                            root=g,
                                            all=X,
                                            top_k=75)
        # break
    path_write = "./statistical_test_prob_ver2/bad_commits_results.txt"
Example #19
def mapping_dict(index_, dictionary):
    # (signature reconstructed from the call in __main__ below)
    new_dict = {}
    for i in index_:
        new_dict[i] = dictionary[i]
    new_list = list()
    for key, value in sorted(new_dict.iteritems()):
        new_list.append(str(key) + ": " + value)
    return new_list


if __name__ == "__main__":
    path_data = "./data/3_mar7/typediff.out"
    commits_ = extract_commit(path_file=path_data)
    msgs = extract_msg(commits=commits_)
    codes = extract_code(commits=commits_)
    all_lines = add_two_list(list1=msgs, list2=codes)
    print len(all_lines), len(commits_), len(msgs), len(codes)
    index = create_dict(all_lines)
    print len(index)

    path_dict = "./data/3_mar7/newres.dict"
    dict_index = load_file(path_file=path_dict)
    new_dict = {}
    for d in dict_index:
        split_d = d.strip().split(":")
        new_dict[int(split_d[0])] = split_d[1]
    print len(new_dict)
    new_dict = mapping_dict(index_=index, dictionary=new_dict)
    path_write = "./data/3_mar7/newres.simplified.dict"
    write_file(path_file=path_write, data=new_dict)

Example #20
def loading_baseline_july(tf, folds, random_state):
    FLAGS = tf.flags.FLAGS
    commits_ = extract_commit_july(path_file=FLAGS.path)
    filter_commits = commits_
    print len(commits_)

    kf = KFold(n_splits=folds, random_state=random_state)
    idx_folds = list()
    for train_index, test_index in kf.split(filter_commits):
        idx = dict()
        idx["train"], idx["test"] = train_index, test_index
        idx_folds.append(idx)

    if "msg" in FLAGS.model:
        msgs_, codes_ = extract_msg(commits=filter_commits), extract_code(
            commits=filter_commits)
    elif "all" in FLAGS.model:
        msgs_, codes_ = extract_msg(commits=filter_commits), extract_code(
            commits=filter_commits)
        all_lines = add_two_list(list1=msgs_, list2=codes_)
        msgs_ = all_lines
    elif "code" in FLAGS.model:
        msgs_, codes_ = extract_msg(commits=filter_commits), extract_code(
            commits=filter_commits)
        msgs_ = codes_
    else:
        print "You need to type correct model"
        exit()
    dict_msg_, dict_code_ = dictionary(data=msgs_), dictionary(data=codes_)
    pad_msg = mapping_commit_msg(msgs=msgs_,
                                 max_length=FLAGS.msg_length,
                                 dict_msg=dict_msg_)
    labels = load_label_commits(commits=filter_commits)
    labels = convert_to_binary(labels)

    # path_file = "./statistical_test_prob/true_label.txt"
    # write_file(path_file=path_file, data=labels)
    # exit()

    print pad_msg.shape, labels.shape, len(dict_msg_)
    cntfold = 0
    pred_dict = dict()
    pred_dict_list = list()
    for i in xrange(cntfold, len(idx_folds)):
        idx = idx_folds[i]
        train_index, test_index = idx["train"], idx["test"]
        X_train_msg, X_test_msg = np.array(get_items(items=pad_msg, indexes=train_index)), \
                                  np.array(get_items(items=pad_msg, indexes=test_index))
        Y_train, Y_test = np.array(get_items(items=labels, indexes=train_index)), \
                          np.array(get_items(items=labels, indexes=test_index))
        if FLAGS.model == "lstm_cnn_all" or FLAGS.model == "lstm_cnn_msg" \
                or FLAGS.model == "lstm_cnn_code" or FLAGS.model == "cnn_all" \
                or FLAGS.model == "cnn_msg" or FLAGS.model == "cnn_code":
            # path_model = "./keras_model/%s_%s.h5" % (FLAGS.model, str(cntfold))
            path_model = "./keras_model/test_%s_%s.h5" % (FLAGS.model,
                                                          str(cntfold))
            # path_model = "./keras_model/%s_%s_testing.h5" % (FLAGS.model, str(cntfold))
            model = load_model(path_model)
        else:
            print "You need to give correct model name"
            exit()
        y_pred = model.predict(X_test_msg, batch_size=FLAGS.batch_size)
        y_pred = np.ravel(y_pred)

        pred_dict.update(make_dictionary(y_pred=y_pred, y_index=test_index))

        y_pred = y_pred.tolist()
        pred_dict_list += y_pred
    # print len(pred_dict_list)
    # exit()
    # path_file = "./statistical_test_prob/" + FLAGS.model + ".txt"
    # write_file(path_file=path_file, data=sorted_dict(dict=pred_dict))
    path_file = "./statistical_test_prob/" + FLAGS.model + "_checking.txt"
    write_file(path_file=path_file, data=pred_dict_list)
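Example #21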
def train_model_mini_batches_update(train, test, dictionary, params):
    #####################################################################################################
    # training model using 50% of positive and 50% of negative data in mini batch
    #####################################################################################################
    ids_train, labels_train, msg_train, code_train = train
    ids_test, labels_test, msg_test, code_test = test
    dict_msg, dict_code = dictionary
    print('Dictionary message: %i -- Dictionary code: %i' %
          (len(dict_msg), len(dict_code)))
    print('Training data')
    info_label(labels_train)

    pad_msg_train = padding_data(data=msg_train,
                                 dictionary=dict_msg,
                                 params=params,
                                 type='msg')
    pad_code_train = padding_data(data=code_train,
                                  dictionary=dict_code,
                                  params=params,
                                  type='code')
    print('Testing data')
    info_label(labels_test)
    pad_msg_test = padding_data(data=msg_test,
                                dictionary=dict_msg,
                                params=params,
                                type='msg')
    pad_code_test = padding_data(data=code_test,
                                 dictionary=dict_code,
                                 params=params,
                                 type='code')

    # set up parameters
    params.cuda = (not params.no_cuda) and torch.cuda.is_available()
    del params.no_cuda
    params.filter_sizes = [int(k) for k in params.filter_sizes.split(',')]

    params.save_dir = os.path.join(
        params.save_dir,
        datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))

    params.vocab_msg, params.vocab_code = len(dict_msg), len(dict_code)
    if len(labels_train.shape) == 1:
        params.class_num = 1
    else:
        params.class_num = labels_train.shape[1]
    params.device = torch.device(
        'cuda' if torch.cuda.is_available() else 'cpu')

    # create and train the defect model
    model = DefectNet(args=params)
    if torch.cuda.is_available():
        model = model.cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=params.l2_reg_lambda)
    steps = 0

    batches_test = mini_batches(X_msg=pad_msg_test,
                                X_code=pad_code_test,
                                Y=labels_test)
    write_log = list()
    for epoch in range(1, params.num_epochs + 1):
        # building batches for training model
        batches_train = mini_batches_update(X_msg=pad_msg_train,
                                            X_code=pad_code_train,
                                            Y=labels_train)
        for batch in batches_train:
            pad_msg, pad_code, labels = batch
            if torch.cuda.is_available():
                pad_msg, pad_code, labels = torch.tensor(
                    pad_msg).cuda(), torch.tensor(
                        pad_code).cuda(), torch.cuda.FloatTensor(labels)
            else:
                pad_msg, pad_code, labels = torch.tensor(pad_msg).long(
                ), torch.tensor(pad_code).long(), torch.tensor(labels).float()

            optimizer.zero_grad()
            ftr, predict = model.forward(pad_msg, pad_code)
            criterion = nn.BCELoss()
            loss = criterion(predict, labels)
            loss.backward()
            optimizer.step()

            steps += 1
            if steps % params.log_interval == 0:
                print('\rEpoch: {} step: {} - loss: {:.6f}'.format(
                    epoch, steps, loss.item()))

        print('Epoch: %i ---Training data' % (epoch))
        acc, prc, rc, f1, auc_ = eval(data=batches_train, model=model)
        print(
            'Accuracy: %f -- Precision: %f -- Recall: %f -- F1: %f -- AUC: %f'
            % (acc, prc, rc, f1, auc_))
        print('Epoch: %i ---Testing data' % (epoch))
        acc, prc, rc, f1, auc_ = eval(data=batches_test, model=model)
        print(
            'Accuracy: %f -- Precision: %f -- Recall: %f -- F1: %f -- AUC: %f'
            % (acc, prc, rc, f1, auc_))
        write_log.append(
            'Epoch - testing: %i --- Accuracy: %f -- Precision: %f -- Recall: %f -- F1: %f -- AUC: %f'
            % (epoch, acc, prc, rc, f1, auc_))
        if epoch % 5 == 0:
            save(model, params.save_dir, 'epoch', epoch)
    write_file(params.save_dir + '/log.txt', write_log)
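# mini_batches_update is not shown; the banner comment above says it draws
# 50% positive and 50% negative examples per mini batch. A simplified sketch
# under that assumption (not the original code), assuming 1-D 0/1 labels:
import numpy as np


def mini_batches_update(X_msg, X_code, Y, mini_batch_size=64):
    pos, neg = np.where(Y == 1)[0], np.where(Y == 0)[0]
    half = mini_batch_size // 2
    num_batches = int(np.ceil(len(Y) / float(mini_batch_size)))
    batches = []
    for _ in range(num_batches):
        # Sample with replacement so the minority class can be reused.
        idx = np.concatenate([np.random.choice(pos, half),
                              np.random.choice(neg, half)])
        batches.append((X_msg[idx], X_code[idx], Y[idx]))
    return batches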
Example #22
def train_confidnetnet_model(train, test, dictionary, params, options):
    #####################################################################################################
    # training model using 50% of positive and 50% of negative data in mini batch
    #####################################################################################################
    ids_train, labels_train, msg_train, code_train = train
    ids_test, labels_test, msg_test, code_test = test
    dict_msg, dict_code = dictionary
    print('Dictionary message: %i -- Dictionary code: %i' % (len(dict_msg), len(dict_code)))
    print('Training data')
    info_label(labels_train)

    pad_msg_train = padding_data(data=msg_train, dictionary=dict_msg, params=params, type='msg')
    pad_code_train = padding_data(data=code_train, dictionary=dict_code, params=params, type='code')
    print(pad_msg_train.shape, pad_code_train.shape)

    print('Testing data')
    info_label(labels_test)
    pad_msg_test = padding_data(data=msg_test, dictionary=dict_msg, params=params, type='msg')
    pad_code_test = padding_data(data=code_test, dictionary=dict_code, params=params, type='code')
    print(pad_msg_test.shape, pad_code_test.shape)

    # set up parameters
    params.cuda = (not params.no_cuda) and torch.cuda.is_available()
    del params.no_cuda
    params.filter_sizes = [int(k) for k in params.filter_sizes.split(',')]
    params.save_dir = os.path.join(params.save_dir, datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
    params.vocab_msg, params.vocab_code = len(dict_msg), len(dict_code)
    if len(labels_train.shape) == 1:
        params.class_num = 1
    else:
        params.class_num = labels_train.shape[1]
    params.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    if options == 'clf':
        # create and train the defect model
        model = DefectNet(args=params)
        if torch.cuda.is_available():
            model = model.cuda()

        model = freeze_layers(model=model, freeze_uncertainty_layers=True)

        # print('Training model with options', options)
        # for param in model.named_parameters():
        #     print(param[0], param[1].requires_grad)        

        optimizer = torch.optim.Adam(model.parameters(), lr=params.l2_reg_lambda)
        steps = 0

        batches_test = mini_batches(X_msg=pad_msg_test, X_code=pad_code_test, Y=labels_test)
        write_log = list()
        for epoch in range(1, params.num_epochs + 1):
            # building batches for training model
            batches_train = mini_batches_update(X_msg=pad_msg_train, X_code=pad_code_train, Y=labels_train)
            for batch in batches_train:
                pad_msg, pad_code, labels = batch
                if torch.cuda.is_available():
                    pad_msg, pad_code, labels = torch.tensor(pad_msg).cuda(), torch.tensor(
                        pad_code).cuda(), torch.cuda.FloatTensor(labels)
                else:
                    pad_msg, pad_code, labels = torch.tensor(pad_msg).long(), torch.tensor(pad_code).long(), torch.tensor(
                        labels).float()

                optimizer.zero_grad()
                predict, uncertainty = model.forward(pad_msg, pad_code)
                criterion = nn.BCELoss()
                loss = criterion(predict, labels)
                loss.backward()
                optimizer.step()

                steps += 1
                if steps % params.log_interval == 0:
                    print('\rEpoch: {} step: {} - loss: {:.6f}'.format(epoch, steps, loss.item()))

            print('Epoch: %i ---Training data' % (epoch))
            acc, prc, rc, f1, auc_ = evaluation_confidnet(data=batches_train, model=model)
            print('Accuracy: %f -- Precision: %f -- Recall: %f -- F1: %f -- AUC: %f' % (acc, prc, rc, f1, auc_))
            print('Epoch: %i ---Testing data' % (epoch))
            acc, prc, rc, f1, auc_ = evaluation_confidnet(data=batches_test, model=model)
            print('Accuracy: %f -- Precision: %f -- Recall: %f -- F1: %f -- AUC: %f' % (acc, prc, rc, f1, auc_))
            write_log.append('Epoch - testing: %i --- Accuracy: %f -- Precision: %f -- Recall: %f -- F1: %f -- AUC: %f' % (epoch, acc, prc, rc, f1, auc_))
            if epoch % 5 == 0:
                save(model, params.save_dir, 'epoch', epoch)
        write_file(params.save_dir + '/log.txt', write_log)

    if options == 'confidnet':
        # create and train the defect model
        model = DefectNet(args=params)
        if torch.cuda.is_available():
            model = model.cuda()

        if params.project == 'openstack':
            model.load_state_dict(torch.load('./snapshot/2020-05-17_09-37-57/epoch_55.pt'), strict=True)
        if params.project == 'qt':
            model.load_state_dict(torch.load('./snapshot/2020-05-17_12-50-56/epoch_15.pt'), strict=True)

        model = freeze_layers(model=model, freeze_uncertainty_layers=False)
        
        print('Training model with options', options)
        for param in model.named_parameters():
            print(param[0], param[1].requires_grad)

        optimizer = torch.optim.Adam(model.parameters(), lr=params.l2_reg_lambda)
        steps = 0

        batches_test = mini_batches(X_msg=pad_msg_test, X_code=pad_code_test, Y=labels_test)
        write_log = list()
        for epoch in range(1, params.num_epochs + 1):
            # building batches for training model
            batches_train = mini_batches_update(X_msg=pad_msg_train, X_code=pad_code_train, Y=labels_train)
            for batch in batches_train:
                pad_msg, pad_code, labels = batch
                if torch.cuda.is_available():
                    pad_msg, pad_code, labels = torch.tensor(pad_msg).cuda(), torch.tensor(
                        pad_code).cuda(), torch.cuda.FloatTensor(labels)
                else:
                    pad_msg, pad_code, labels = torch.tensor(pad_msg).long(), torch.tensor(pad_code).long(), torch.tensor(
                        labels).float()

                optimizer.zero_grad()
                predict, uncertainty = model.forward(pad_msg, pad_code)
                loss = confid_mse_loss((predict, uncertainty), labels, args=params)
                loss.backward()
                optimizer.step()

                steps += 1
                if steps % params.log_interval == 0:
                    print('\rEpoch: {} step: {} - loss: {:.6f}'.format(epoch, steps, loss.item()))

            print('Epoch: %i ---Training data' % (epoch))
            auc_ = evaluation_uncertainty(data=batches_train, model=model)
            print('AUC: %f' % (auc_))
            print('Epoch: %i ---Testing data' % (epoch))
            auc_ = evaluation_uncertainty(data=batches_test, model=model)
            print('AUC: %f' % (auc_))
            write_log.append('Epoch - testing: %i --- AUC: %f' % (epoch, auc_))

            if epoch % 5 == 0:
                save(model, params.save_dir, 'epoch', epoch)
        write_file(params.save_dir + '/log.txt', write_log)
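# freeze_layers is never defined on this page; given the two training stages
# above ('clf' freezes the uncertainty head, 'confidnet' freezes everything
# else), a sketch assuming the head's parameter names contain "uncertainty":
def freeze_layers(model, freeze_uncertainty_layers=True):
    for name, param in model.named_parameters():
        is_uncertainty = "uncertainty" in name
        # Freeze the uncertainty head in stage one, the rest in stage two.
        param.requires_grad = is_uncertainty != freeze_uncertainty_layers
    return model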
Example #23
def running_baseline_july(tf, folds, random_state):
    FLAGS = tf.flags.FLAGS
    commits_ = extract_commit_july(path_file=FLAGS.path)
    filter_commits = commits_
    print len(commits_)
    kf = KFold(n_splits=folds, random_state=random_state)
    idx_folds = list()
    for train_index, test_index in kf.split(filter_commits):
        idx = dict()
        idx["train"], idx["test"] = train_index, test_index
        idx_folds.append(idx)

    if "msg" in FLAGS.model:
        msgs_, codes_ = extract_msg(commits=filter_commits), extract_code(commits=filter_commits)
    elif "all" in FLAGS.model:
        msgs_, codes_ = extract_msg(commits=filter_commits), extract_code(commits=filter_commits)
        all_lines = add_two_list(list1=msgs_, list2=codes_)
        msgs_ = all_lines
    elif "code" in FLAGS.model:
        msgs_, codes_ = extract_msg(commits=filter_commits), extract_code(commits=filter_commits)
        msgs_ = codes_
    else:
        print "You need to type correct model"
        exit()

    dict_msg_, dict_code_ = dictionary(data=msgs_), dictionary(data=codes_)
    pad_msg = mapping_commit_msg(msgs=msgs_, max_length=FLAGS.msg_length, dict_msg=dict_msg_)
    labels = load_label_commits(commits=filter_commits)
    labels = convert_to_binary(labels)
    print pad_msg.shape, labels.shape, len(dict_msg_)
    # exit()

    timestamp = str(int(time.time()))
    accuracy, precision, recall, f1, auc = list(), list(), list(), list(), list()
    cntfold = 0
    pred_dict, pred_dict_prob = dict(), dict()
    for i in xrange(cntfold, len(idx_folds)):
        idx = idx_folds[i]
        train_index, test_index = idx["train"], idx["test"]
        X_train_msg, X_test_msg = np.array(get_items(items=pad_msg, indexes=train_index)), \
                                  np.array(get_items(items=pad_msg, indexes=test_index))
        Y_train, Y_test = np.array(get_items(items=labels, indexes=train_index)), \
                          np.array(get_items(items=labels, indexes=test_index))
        if FLAGS.model == "lstm_cnn_msg" or FLAGS.model == "lstm_cnn_code" or FLAGS.model == "lstm_cnn_all":
            model = lstm_cnn(x_train=X_train_msg, y_train=Y_train, x_test=X_test_msg,
                             y_test=Y_test, dictionary_size=len(dict_msg_), FLAGS=FLAGS)
        elif FLAGS.model == "cnn_msg" or FLAGS.model == "cnn_code" or FLAGS.model == "cnn_all":
            model = cnn_model(x_train=X_train_msg, y_train=Y_train, x_test=X_test_msg,
                              y_test=Y_test, dictionary_size=len(dict_msg_), FLAGS=FLAGS)
        else:
            print "You need to give correct model name"
            exit()

        # model.save("./keras_model/" + FLAGS.model + "_" + str(cntfold) + ".h5")
        # model.save("./keras_model/" + FLAGS.model + "_" + str(cntfold) + "_testing.h5")
        # model.save("./keras_model/test_" + FLAGS.model + "_" + str(cntfold) + ".h5")
        model.save("./keras_model/newres_funcalls_" + FLAGS.model + "_" + str(cntfold) + ".h5")

        y_pred = model.predict(X_test_msg, batch_size=FLAGS.batch_size)
        y_pred = np.ravel(y_pred)

        y_pred_tolist = y_pred.tolist()
        # fresh comprehension names keep the fold counter i intact
        # (list comprehensions leak their variables in Python 2)
        data_fold = [str(t) + "\t" + str(p) for t, p in zip(test_index, y_pred)]
        path_file = "./statistical_test/newres_funcalls_%s_fold_%s.txt" % (FLAGS.model, str(cntfold))
        write_file(path_file=path_file, data=data_fold)

        y_pred[y_pred > 0.5] = 1
        y_pred[y_pred <= 0.5] = 0

        pred_dict.update(make_dictionary(y_pred=y_pred, y_index=test_index))
        accuracy.append(accuracy_score(y_true=Y_test, y_pred=y_pred))
        precision.append(precision_score(y_true=Y_test, y_pred=y_pred))
        recall.append(recall_score(y_true=Y_test, y_pred=y_pred))
        f1.append(f1_score(y_true=Y_test, y_pred=y_pred))
        auc.append(auc_score(y_true=Y_test, y_pred=y_pred))
        print "accuracy", accuracy_score(y_true=Y_test, y_pred=y_pred)
        print "precision", precision_score(y_true=Y_test, y_pred=y_pred)
        print "recall", recall_score(y_true=Y_test, y_pred=y_pred)
        print "f1", f1_score(y_true=Y_test, y_pred=y_pred)

        cntfold += 1
        break
Example #24
def cross_validation_ver2(id, X, y, algorithm, folds):
    kf = KFold(n_splits=folds, random_state=None)
    kf.get_n_splits(X=X)
    accuracy, precision, recall, f1 = list(), list(), list(), list()
    probs = list()
    for train_index, test_index in kf.split(X):
        X_train, y_train = get_items(items=X, indexes=train_index), get_items(
            items=y, indexes=train_index)
        X_test, y_test = get_items(items=X, indexes=test_index), get_items(
            items=y, indexes=test_index)
        id_train, id_test = get_items(
            items=id, indexes=train_index), get_items(items=id,
                                                      indexes=test_index)

        vectorizer = CountVectorizer()
        X_train = vectorizer.fit_transform(X_train)
        X_test = vectorizer.transform(X_test)
        # X = vectorizer.transform(X)

        # eval_train, eval_labels = loading_data("./data/3_mar7/typeaddres.out")
        # eval_train = vectorizer.transform(eval_train)

        if algorithm == "svm":
            clf = LinearSVC(random_state=0)
        elif algorithm == "lr":
            clf = LogisticRegression()
        elif algorithm == "dt":
            clf = DecisionTreeClassifier()
        else:
            print "Wrong algorithm name -- please retype again"
            exit()

        clf.fit(X=X_train, y=y_train)
        accuracy.append(
            accuracy_score(y_true=y_test, y_pred=clf.predict(X_test)))
        precision.append(
            precision_score(y_true=y_test, y_pred=clf.predict(X_test)))
        recall.append(recall_score(y_true=y_test, y_pred=clf.predict(X_test)))
        f1.append(f1_score(y_true=y_test, y_pred=clf.predict(X_test)))
        # print accuracy, precision, recall, f1

        # print X_test.shape
        # y_pred = clf.predict(X_test)
        # y_pred_proba = clf.predict_proba(X_test)[:, 1]
        # y_pred_log_proba = clf.predict_log_proba(X_test)
        # print clf.predict_proba(X_test).shape
        # print clf.predict_log_proba(X_test).shape
        # exit()
        # probs += clf.predict_proba(X_test)[:, 1]
        # (predict_proba covers "lr" and "dt"; LinearSVC would need
        # decision_function or probability calibration instead)
        probs = np.concatenate((probs, clf.predict_proba(X_test)[:, 1]),
                               axis=0)

        # accuracy.append(accuracy_score(y_true=eval_labels, y_pred=clf.predict(eval_train)))
        # precision.append(precision_score(y_true=eval_labels, y_pred=clf.predict(eval_train)))
        # recall.append(recall_score(y_true=eval_labels, y_pred=clf.predict(eval_train)))
        # f1.append(f1_score(y_true=eval_labels, y_pred=clf.predict(eval_train)))
        # break

    print accuracy, "Accuracy of %s: %f" % (algorithm, avg_list(accuracy))
    print precision, "Precision of %s: %f" % (algorithm, avg_list(precision))
    print recall, "Recall of %s: %f" % (algorithm, avg_list(recall))
    print f1, "F1 of %s: %f" % (algorithm, avg_list(f1))

    path_write = "./statistical_test_prob/%s.txt" % (algorithm)
    write_file(path_file=path_write, data=probs)
    print len(probs)
Example #25
        auc.append(auc_score(y_true=Y_test, y_pred=y_pred))

        # print "Accuracy of %s: %f" % (FLAGS.model, avg_list(accuracy))
        # print "Precision of %s: %f" % (FLAGS.model, avg_list(precision))
        # print "Recall of %s: %f" % (FLAGS.model, avg_list(recall))
        # print "F1 of %s: %f" % (FLAGS.model, avg_list(f1))
        # print "AUC of %s: %f" % (FLAGS.model, avg_list(auc))

        # path_file = "./statistical_test/3_mar7/" + FLAGS.model + ".txt"
        # write_file(path_file, y_pred)
        # print "Accuracy of %s: %f" % (FLAGS.model, avg_list(accuracy))
        # print "Precision of %s: %f" % (FLAGS.model, avg_list(precision))
        # print "Recall of %s: %f" % (FLAGS.model, avg_list(recall))
        # print "F1 of %s: %f" % (FLAGS.model, avg_list(f1))
        # cntfold += 1
        # exit()
    path_file = "./statistical_test_ver2/3_mar7/rerun_" + FLAGS.model + ".txt"
    write_file(path_file=path_file, data=sorted_dict(dict=pred_dict))
    print accuracy, "Accuracy and std of %s: %f %f" % (
        FLAGS.model, np.mean(np.array(accuracy)), np.std(np.array(accuracy)))
    print precision, "Precision of %s: %f %f" % (
        FLAGS.model, np.mean(np.array(precision)), np.std(np.array(precision)))
    print recall, "Recall of %s: %f %f" % (
        FLAGS.model, np.mean(np.array(recall)), np.std(np.array(recall)))
    print f1, "F1 of %s: %f %f" % (FLAGS.model, np.mean(
        np.array(f1)), np.std(np.array(f1)))
    print auc, "AUC of %s: %f %f" % (FLAGS.model, np.mean(
        np.array(auc)), np.std(np.array(auc)))
    print_params(tf)
    exit()
Example #26
def print_label_data(path, name, commits):
    labels_data = [c["stable"] for c in commits]
    labels_data = [1 if "true" == l else 0 for l in labels_data]
    write_file(path_file=path + "/" + name, data=labels_data)
    return None
Example #27
    patchNet = load_probability_score(model="PatchNet", threshold=None)
    lstm_cnn = load_probability_score(model="LS-CNN", threshold=None)
    lpu_svm = load_probability_score(model="LPU-SVM", threshold=None)
    sasha = load_probability_score(model="sasha_results", threshold=50)
    print len(true_label), len(patchNet), len(lstm_cnn), len(lpu_svm), len(
        sasha)

    # # good commits can detect using patchNet
    # good_commit_id = []
    # for i in xrange(0, len(true_label)):
    #     # if true_label[i] == 1 and patchNet[i] == 1 and lstm_cnn[i] == 0 and lpu_svm[i] == 0 and sasha[i] == 0:
    #     #     good_commit_id.append(commits_id[i])
    #     if true_label[i] == 1 and patchNet[i] == 1 and sasha[i] == 0 and lstm_cnn[i] == 0 and lpu_svm[i] == 0:
    #         good_commit_id.append(commits_id[i])
    # # print len(good_commit_id)
    # path_write = "./statistical_test_prob_ver3/good_commits.txt"
    # write_file(path_file=path_write, data=good_commit_id)
    # # exit()

    bad_commit_id = []
    for i in xrange(0, len(true_label)):
        if true_label[i] == 1 and patchNet[i] == 0 and (lpu_svm[i] == 1
                                                        or sasha[i] == 1):
            bad_commit_id.append(commits_id[i])

    print len(bad_commit_id)
    # exit()

    path_write = "./statistical_test_prob_ver3/bad_commits.txt"
    write_file(path_file=path_write, data=bad_commit_id)
        else:
            predict = model.forward(pad_msg, pad_code, pad_ftr).detach().numpy().tolist()
        all_predict += predict
        all_label += labels.tolist()
    acc, prc, rc, f1, auc_ = evaluation_metrics(y_pred=all_predict, y_true=all_label)
    print('Accuracy: %f -- Precision: %f -- Recall: %f -- F1: %f -- AUC: %f' % (acc, prc, rc, f1, auc_))
    return acc, prc, rc, f1, auc_


if __name__ == '__main__':
    project = 'openstack'
    # project = 'qt'
    training, testing, dictionary = loading_data(project=project)

    input_option = read_args().parse_args()
    input_help = read_args().print_help()
    input_option.filter_sizes = [int(k) for k in input_option.filter_sizes.split(',')]

    model, data_test, data_train = construct_model(data=(training, testing, dictionary), params=input_option)
    # input_option.start_epoch, input_option.end_epoch, input_option.step = 5, 50, 5
    # input_option.datetime = '2019-01-17_17-15-05'
    results = list()
    for epoch in range(input_option.start_epoch, input_option.end_epoch, input_option.step):
        dir = './snapshot/' + input_option.datetime + '/epoch_' + str(epoch) + '.pt'
        print('--Epoch: %i' % epoch)
        acc, prc, rc, f1, auc_ = eval_dir(dir=dir, data=data_test, model=model)
        results.append('Epoch: %i -- Accuracy: %f -- Precision: %f -- Recall: %f -- F1: %f -- AUC: %f'
                       % (epoch, acc, prc, rc, f1, auc_))
    path_save = './snapshot/' + input_option.datetime + '.txt'
    write_file(path_file=path_save, data=results)
            return root_commit + '\t' + str(msg_length(c)) + '\t' + str(num_code_hunk(c)) \
                   + '\t' + str(num_code_line(c))


if __name__ == '__main__':
    path_data = './newres_funcalls_jul28.out.sorted.satisfy'
    commits_ = extract_commit_july(path_file=path_data)
    print len(commits_)

    # path_good_commits = './statistical_test_prob_ver3/good_commits.txt'
    # good_commits = load_file(path_file=path_good_commits)
    # print len(good_commits)
    #
    # patch_good_commits = []
    # for c in good_commits:
    #     print finding_patch_info(root_commit=c, commits=commits_)
    #     patch_good_commits.append(finding_patch_info(root_commit=c, commits=commits_))
    # write_file(path_file="./statistical_test_prob_ver3/good_commits_patchInfo.txt", data=patch_good_commits)

    path_bad_commits = './statistical_test_prob_ver3/bad_commits.txt'
    bad_commits = load_file(path_file=path_bad_commits)
    print len(bad_commits)

    patch_bad_commits = []
    for c in bad_commits:
        print finding_patch_info(root_commit=c, commits=commits_)
        patch_bad_commits.append(
            finding_patch_info(root_commit=c, commits=commits_))
    write_file(
        path_file="./statistical_test_prob_ver3/bad_commits_patchInfo.txt",
        data=patch_bad_commits)