예제 #1
0
def loading_data_all(FLAGS):
    """Load the commits at ``FLAGS.path`` and map them to model inputs.

    The whole filtered data set is returned; no train/test split is done
    in this function.

    :param FLAGS: settings object; this function reads ``path``,
        ``code_file``, ``code_hunk``, ``code_line``, ``code_length`` and
        ``msg_length`` from it.
    :return: tuple ``(msg, added_code, removed_code, labels)`` holding the
        results of ``mapping_commit_msg``, the two ``mapping_commit_code``
        calls ("added" / "removed") and ``load_label_commits``.
    """
    kept = filtering_commit(commits=extract_commit(path_file=FLAGS.path),
                            num_file=FLAGS.code_file,
                            num_hunk=FLAGS.code_hunk,
                            num_loc=FLAGS.code_line,
                            size_line=FLAGS.code_length)

    messages = extract_msg(commits=kept)
    code_lines = extract_code(commits=kept)

    # Separate dictionaries are built for the messages and for the code.
    msg_vocab = dictionary(data=messages)
    code_vocab = dictionary(data=code_lines)

    mapped_msg = mapping_commit_msg(msgs=messages,
                                    max_length=FLAGS.msg_length,
                                    dict_msg=msg_vocab)

    # Both code mappings share every argument except ``type``.
    code_kwargs = dict(commits=kept,
                       max_hunk=FLAGS.code_hunk,
                       max_code_line=FLAGS.code_line,
                       max_code_length=FLAGS.code_length,
                       dict_code=code_vocab)
    mapped_added = mapping_commit_code(type="added", **code_kwargs)
    mapped_removed = mapping_commit_code(type="removed", **code_kwargs)

    commit_labels = load_label_commits(commits=kept)
    return mapped_msg, mapped_added, mapped_removed, commit_labels
예제 #2
0
def loading_data(path_file):
    """Load commits from ``path_file`` and return their text and labels.

    The commits are filtered with fixed limits (1 file, 8 hunks, 10 lines
    per hunk, line size 120) before anything is extracted.

    :param path_file: path of the commit file handed to ``extract_commit``.
    :return: tuple ``(all_lines, labels)`` where ``all_lines`` is the result
        of ``add_two_list`` applied to the extracted messages and code, and
        ``labels`` comes from ``extract_label``.
    """
    # Use the caller's path: the previous code read ``path_data``, a name
    # that is not a parameter of this function, so the argument was ignored.
    commits_ = extract_commit(path_file=path_file)
    nfile, nhunk, nline, nleng = 1, 8, 10, 120
    filter_commits = filtering_commit(commits=commits_,
                                      num_file=nfile,
                                      num_hunk=nhunk,
                                      num_loc=nline,
                                      size_line=nleng)
    msgs = extract_msg(commits=filter_commits)
    labels = extract_label(commits=filter_commits)
    codes = extract_code(commits=filter_commits)
    all_lines = add_two_list(list1=msgs, list2=codes)
    return all_lines, labels
def loading_training_data():
    path_ftr = "./data/3_mar7/new_features_ver1.txt"
    ids_, X_, y_ = load_data_ICSE(path=path_ftr)
    print len(ids_), X_.shape, y_.shape

    path_data = "./data/3_mar7/typediff.out"
    commits_ = extract_commit(path_file=path_data)
    nfile, nhunk, nline, nleng = 1, 8, 10, 120
    commits_ = get_commits(commits=filtering_commit(commits=commits_,
                                                    num_file=nfile,
                                                    num_hunk=nhunk,
                                                    num_loc=nline,
                                                    size_line=nleng), ids=ids_)
    return commits_, ids_, X_, y_
예제 #4
0
def loading_testing_data(FLAGS, path_file, type):
    if type == "msg":
        commits_ = extract_commit(path_file=path_file)
        filter_commits = filtering_commit(commits=commits_,
                                          num_file=FLAGS.code_file,
                                          num_hunk=FLAGS.code_hunk,
                                          num_loc=FLAGS.code_line,
                                          size_line=FLAGS.code_length)
        msgs_, codes_ = extract_msg(commits=filter_commits), extract_code(
            commits=filter_commits)
    elif type == "all":
        commits_ = extract_commit(path_file=path_file)
        filter_commits = filtering_commit(commits=commits_,
                                          num_file=FLAGS.code_file,
                                          num_hunk=FLAGS.code_hunk,
                                          num_loc=FLAGS.code_line,
                                          size_line=FLAGS.code_length)
        msgs_, codes_ = extract_msg(commits=filter_commits), extract_code(
            commits=filter_commits)
        all_lines = add_two_list(list1=msgs_, list2=codes_)
        msgs_ = all_lines

    elif type == "code":
        commits_ = extract_commit(path_file=path_file)
        filter_commits = filtering_commit(commits=commits_,
                                          num_file=FLAGS.code_file,
                                          num_hunk=FLAGS.code_hunk,
                                          num_loc=FLAGS.code_line,
                                          size_line=FLAGS.code_length)
        msgs_, codes_ = extract_msg(commits=filter_commits), extract_code(
            commits=filter_commits)
        msgs_ = codes_
    else:
        print "You need to type correct model"
        exit()
    return msgs_, codes_, filter_commits
예제 #5
0
def loading_data_lstm(FLAGS):
    print FLAGS.model
    if "msg" in FLAGS.model:
        commits_ = extract_commit(path_file=FLAGS.path)
        filter_commits = filtering_commit(commits=commits_, num_file=FLAGS.code_file, num_hunk=FLAGS.code_hunk,
                                          num_loc=FLAGS.code_line,
                                          size_line=FLAGS.code_length)
        msgs_, codes_ = extract_msg(commits=filter_commits), extract_code(commits=filter_commits)
        dict_msg_, dict_code_ = dictionary(data=msgs_), dictionary(data=codes_)
        pad_msg = mapping_commit_msg(msgs=msgs_, max_length=FLAGS.msg_length, dict_msg=dict_msg_)
        pad_added_code = mapping_commit_code(type="added", commits=filter_commits, max_hunk=FLAGS.code_hunk,
                                             max_code_line=FLAGS.code_line,
                                             max_code_length=FLAGS.code_length, dict_code=dict_code_)
        pad_removed_code = mapping_commit_code(type="removed", commits=filter_commits, max_hunk=FLAGS.code_hunk,
                                               max_code_line=FLAGS.code_line,
                                               max_code_length=FLAGS.code_length, dict_code=dict_code_)
        labels = load_label_commits(commits=filter_commits)
    elif "all" in FLAGS.model:
        commits_ = extract_commit(path_file=FLAGS.path)
        filter_commits = filtering_commit(commits=commits_, num_file=FLAGS.code_file, num_hunk=FLAGS.code_hunk,
                                          num_loc=FLAGS.code_line,
                                          size_line=FLAGS.code_length)
        msgs_, codes_ = extract_msg(commits=filter_commits), extract_code(commits=filter_commits)
        all_lines = add_two_list(list1=msgs_, list2=codes_)
        msgs_ = all_lines
        dict_msg_, dict_code_ = dictionary(data=msgs_), dictionary(data=codes_)
        pad_msg = mapping_commit_msg(msgs=msgs_, max_length=FLAGS.msg_length, dict_msg=dict_msg_)
        pad_added_code = mapping_commit_code(type="added", commits=filter_commits, max_hunk=FLAGS.code_hunk,
                                             max_code_line=FLAGS.code_line,
                                             max_code_length=FLAGS.code_length, dict_code=dict_code_)
        pad_removed_code = mapping_commit_code(type="removed", commits=filter_commits, max_hunk=FLAGS.code_hunk,
                                               max_code_line=FLAGS.code_line,
                                               max_code_length=FLAGS.code_length, dict_code=dict_code_)
        labels = load_label_commits(commits=filter_commits)
    elif "code" in FLAGS.model:
        commits_ = extract_commit(path_file=FLAGS.path)
        filter_commits = filtering_commit(commits=commits_, num_file=FLAGS.code_file, num_hunk=FLAGS.code_hunk,
                                          num_loc=FLAGS.code_line,
                                          size_line=FLAGS.code_length)
        msgs_, codes_ = extract_msg(commits=filter_commits), extract_code(commits=filter_commits)
        msgs_ = codes_
        dict_msg_, dict_code_ = dictionary(data=msgs_), dictionary(data=codes_)
        pad_msg = mapping_commit_msg(msgs=msgs_, max_length=FLAGS.msg_length, dict_msg=dict_msg_)
        pad_added_code = mapping_commit_code(type="added", commits=filter_commits, max_hunk=FLAGS.code_hunk,
                                             max_code_line=FLAGS.code_line,
                                             max_code_length=FLAGS.code_length, dict_code=dict_code_)
        pad_removed_code = mapping_commit_code(type="removed", commits=filter_commits, max_hunk=FLAGS.code_hunk,
                                               max_code_line=FLAGS.code_line,
                                               max_code_length=FLAGS.code_length, dict_code=dict_code_)
        labels = load_label_commits(commits=filter_commits)
    else:
        print "You need to type correct model"
        exit()

    kf = KFold(n_splits=FLAGS.folds, random_state=FLAGS.seed)
    for train_index, test_index in kf.split(filter_commits):
        X_train_msg, X_test_msg = np.array(get_items(items=pad_msg, indexes=train_index)), \
                                  np.array(get_items(items=pad_msg, indexes=test_index))
        X_train_added_code, X_test_added_code = np.array(get_items(items=pad_added_code, indexes=train_index)), \
                                                np.array(get_items(items=pad_added_code, indexes=test_index))
        X_train_removed_code, X_test_removed_code = np.array(get_items(items=pad_removed_code, indexes=train_index)), \
                                                    np.array(get_items(items=pad_removed_code, indexes=test_index))
        y_train, y_test = np.array(get_items(items=labels, indexes=train_index)), \
                          np.array(get_items(items=labels, indexes=test_index))
        return X_test_msg, X_test_added_code, X_test_removed_code, y_test
예제 #6
0
    parser.add_argument('--data_type',
                        type=str,
                        default='all',
                        help='type of model for learning')
    parser.add_argument('--model',
                        type=str,
                        default='model',
                        help='names of our model')
    return parser


if __name__ == '__main__':
    # Script entry point: parse the options, load and reformat the commits,
    # then run training or prediction depending on the parsed flags.
    input_option = read_args().parse_args()
    # NOTE(review): a second parser is built only to call print_help(). If
    # read_args() returns an argparse parser, this prints the usage text on
    # every run and input_help is always None -- confirm this is intended.
    input_help = read_args().print_help()

    # The commit file is the one named by the `data` option.
    commits = extract_commit(path_file=input_option.data)
    # num_file is fixed at 1; the remaining limits come from the options.
    commits = reformat_commit_code(commits=commits,
                                   num_file=1,
                                   num_hunk=input_option.code_hunk,
                                   num_loc=input_option.code_line,
                                   num_leng=input_option.code_length)

    # `is True` means only the literal True selects a branch; training is
    # checked first, so it wins when both flags are set.
    if input_option.train is True:
        train_model(commits=commits, params=input_option)
        print '--------------------------------------------------------------------------------'
        print '--------------------------Finish the training process---------------------------'
        print '--------------------------------------------------------------------------------'
        exit()
    elif input_option.predict is True:
        predict_model(commits=commits, params=input_option)
        print '--------------------------------------------------------------------------------'
예제 #7
0
                                                    num_hunk=nhunk,
                                                    num_loc=nline,
                                                    size_line=nleng),
                           ids=ids_)
    return commits_, ids_, X_, y_


def clean_merging_data(ids, ftrs):
    """Return the feature rows reordered to follow ``ids``.

    The id of a row is the text before its first comma.  When several rows
    share an id, the first one in ``ftrs`` is used.

    :param ids: ids to look up, in the order the rows should be returned.
    :param ftrs: comma-separated feature rows (strings).
    :return: list with one row per entry of ``ids``, in the same order.
    :raises ValueError: if an id has no matching row in ``ftrs``.
    """
    # Index the rows once instead of scanning the whole list for every id.
    # setdefault keeps the first row seen for a given id.
    first_row_by_id = {}
    for row in ftrs:
        first_row_by_id.setdefault(row.split(",")[0], row)

    try:
        return [first_row_by_id[i] for i in ids]
    except KeyError as err:
        # Keep the exception type the list.index-based lookup used to raise.
        raise ValueError("%r is not in list" % (err.args[0],))


if __name__ == "__main__":
    # Script entry point: load and filter the merged test commits, align the
    # feature rows with the surviving commit ids, then build the test set.
    path_data = "./data/test_data/merging_markus_sasha.txt"
    commits_ = extract_commit(path_file=path_data)
    # Filter limits passed as num_file, num_hunk, num_loc and size_line.
    nfile, nhunk, nline, nleng = 1, 8, 10, 120
    filter_commits = filtering_commit(commits=commits_,
                                      num_file=nfile,
                                      num_hunk=nhunk,
                                      num_loc=nline,
                                      size_line=nleng)
    ids_ = [c["id"] for c in filter_commits]
    # A commit is labelled 1 only when its "stable" field is the string
    # "true"; labels_ is not used in the visible part of this script.
    labels_ = [1 if c["stable"] == "true" else 0 for c in filter_commits]

    path_ftr = "./data/test_data/features_merging_markus_sasha.txt"
    ftr = load_file(path_file=path_ftr)
    # Reorder the feature rows so they follow the filtered commit ids.
    new_ftr = clean_merging_data(ids=ids_, ftrs=ftr)

    commits_test, ids_test, X_ftr_test, y_test = loading_testing_data(
        ftr_data=new_ftr, commit_data=filter_commits)
예제 #8
0
                        default=None,
                        help='starting epoch of loading model')
    parser.add_argument('-end_epoch',
                        type=int,
                        default=None,
                        help='ending epoch of loading model')

    # load model name
    parser.add_argument('-file_model',
                        type=str,
                        default=None,
                        help='date of model [default: None]')
    return parser


if __name__ == '__main__':
    # Script entry point: parse the options, load and reformat the commits
    # from a hard-coded file, split them, and print the commit count.
    input_option = read_args().parse_args()
    # NOTE(review): a second parser is built only to call print_help(). If
    # read_args() returns an argparse parser, this prints the usage text on
    # every run and input_help is always None -- confirm this is intended.
    input_help = read_args().print_help()

    path_file = './data/newres_funcalls_jul28.out.sorted'
    commits = extract_commit(path_file=path_file)
    commits = reformat_commit_code(commits=commits,
                                   num_file=1,
                                   num_hunk=8,
                                   num_loc=10,
                                   num_leng=120)
    # NOTE(review): nfolds and random_state are assigned here, but the call
    # below passes the literals 5 and None instead of these names.
    nfolds, random_state = 5, None
    train, test = training_testing_split(commits=commits,
                                         nfolds=5,
                                         random_state=None)
    print(len(commits))
예제 #9
0
    clf.fit(X=X_train.toarray(), y=y_train)
    y_pred = clf.predict(X_test)
    path_write = "./data_test_data_pred_results/cnn_" + type + ".txt"
    write_file(path_file=path_write, data=y_pred)
    print "Accuracy: ", accuracy_score(y_true=y_test, y_pred=y_pred)
    print "Precision: ", precision_score(y_true=y_test, y_pred=y_pred)
    print "Recall: ", recall_score(y_true=y_test, y_pred=y_pred)
    print "F1: ", f1_score(y_true=y_test, y_pred=y_pred)
    print "AUC: ", auc_score(y_true=y_test, y_pred=y_pred)


if __name__ == "__main__":
    nfile, nhunk, nline, nleng = 1, 8, 10, 120

    path_data = "./data/3_mar7/typediff.out"
    commits_train = extract_commit(path_file=path_data)
    filter_commits_train = filtering_commit(commits=commits_train, num_file=nfile,
                                            num_hunk=nhunk, num_loc=nline,
                                            size_line=nleng)
    msgs_train = extract_msg(commits=filter_commits_train)
    labels_train = extract_label(commits=filter_commits_train)
    codes_train = extract_code(commits=filter_commits_train)
    all_lines_train = add_two_list(list1=msgs_train, list2=codes_train)

    # path_test = "./data/test_data/sasha_translated.out"
    path_test = "./data/test_data/merging_markus_sasha.txt"
    type = "all"
    # type = "msg"
    # type = "code"
    commits_test = extract_commit(path_file=path_test)
    filter_commits_test = filtering_commit(commits=commits_test,
예제 #10
0
    print "hello"


if __name__ == "__main__":
    tf = model_parameter_evaluation_keras()
    FLAGS = tf.flags.FLAGS
    print_params(tf)

    path_file_model = "./keras_model/"
    model_name = FLAGS.model
    # model_name = "lstm_code"
    model_name = "lstm_all"
    model = load_model(path_file_model + model_name + ".h5")

    if "msg" in FLAGS.model:
        commits_ = extract_commit(path_file=FLAGS.path)
        filter_commits = filtering_commit(commits=commits_,
                                          num_file=FLAGS.code_file,
                                          num_hunk=FLAGS.code_hunk,
                                          num_loc=FLAGS.code_line,
                                          size_line=FLAGS.code_length)
        msgs_, codes_ = extract_msg(commits=filter_commits), extract_code(
            commits=filter_commits)
    elif "all" in FLAGS.model:
        commits_ = extract_commit(path_file=FLAGS.path)
        filter_commits = filtering_commit(commits=commits_,
                                          num_file=FLAGS.code_file,
                                          num_hunk=FLAGS.code_hunk,
                                          num_loc=FLAGS.code_line,
                                          size_line=FLAGS.code_length)
        msgs_, codes_ = extract_msg(commits=filter_commits), extract_code(