Example #1
import json
import sqlite3

import pandas as pd

# Project-specific helpers (read_data, update_column, tokenize_by_clex_fn,
# read_fake_common_c_error_dataset_with_limit_length, MAX_TOKEN_LENGTH) are
# assumed to be importable from the surrounding ProgramFix codebase.
def copy_distance_and_action_main():
    db_path = r'/home/lf/Project/ProgramFix/data/slk_sample_data.db'
    con = sqlite3.connect(db_path)

    sample_train = read_data(con, 'slk_sample_common_c_error_records_train')
    sample_valid = read_data(con, 'slk_sample_common_c_error_records_valid')
    sample_test = read_data(con, 'slk_sample_common_c_error_records_test')
    print(len(sample_train))
    print(len(sample_valid))
    print(len(sample_test))

    train, valid, test = read_fake_common_c_error_dataset_with_limit_length(MAX_TOKEN_LENGTH)

    # train = train.sample(100)
    # valid = valid.sample(100)
    # test = test.sample(100)

    merge_train = pd.merge(sample_train, train, on=['id'])
    merge_valid = pd.merge(sample_valid, valid, on=['id'])
    merge_test = pd.merge(sample_test, test, on=['id'])

    print(len(merge_train))
    print(len(merge_valid))
    print(len(merge_test))

    update_column(merge_train, 'slk_sample_common_c_error_records_train', con)
    update_column(merge_valid, 'slk_sample_common_c_error_records_valid', con)
    update_column(merge_test, 'slk_sample_common_c_error_records_test', con)
def read_modify_action_token():
    train_df, _, _ = read_fake_common_c_error_dataset_with_limit_length(500)
    train_df['modify_action_list'] = train_df['modify_action_list'].map(
        json.loads)
    extract_to_token_fn = lambda actions: [act['to_char'] for act in actions]
    act_tokens = [
        extract_to_token_fn(actions)
        for actions in train_df['modify_action_list']
    ]
    return act_tokens
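
read_modify_action_token maps json.loads over a string column and then pulls one field out of each action dict. A tiny self-contained illustration (the single-row frame and its contents are made up):

import json

import pandas as pd

df = pd.DataFrame({'modify_action_list': ['[{"to_char": "a"}, {"to_char": ";"}]']})
df['modify_action_list'] = df['modify_action_list'].map(json.loads)
act_tokens = [[act['to_char'] for act in actions]
              for actions in df['modify_action_list']]
print(act_tokens)  # [['a', ';']]
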
def read_filter_without_include_ac_token():
    train_df, _, _ = read_fake_common_c_error_dataset_with_limit_length(500)
    transform_lextoken_to_token_fn = lambda token_list: [
        i.value for i in token_list
    ]
    tokenize_fn = tokenize_by_clex_fn()
    parse_tokens = [
        transform_lextoken_to_token_fn(tokenize_fn(code))
        for code in train_df['similar_code']
    ]
    return parse_tokens
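
tokenize_by_clex_fn is a project-specific C lexer wrapper; the code above only relies on each returned token exposing a .value attribute. A stand-in that mimics just that interface (whitespace splitting replaces the real lexer):

from collections import namedtuple

LexToken = namedtuple('LexToken', ['value'])

fake_tokenize_fn = lambda code: [LexToken(v) for v in code.split()]
parse_tokens = [t.value for t in fake_tokenize_fn('int main ( ) { }')]
print(parse_tokens)  # ['int', 'main', '(', ')', '{', '}']
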
Example #4
def load_common_error_data_sample_with_encoder_copy_100(
        inner_begin_id, inner_end_id):
    vocab = create_common_error_vocabulary(begin_tokens=['<BEGIN>'],
                                           end_tokens=['<END>'],
                                           unk_token='<UNK>',
                                           addition_tokens=['<GAP>'])
    train, valid, test = read_fake_common_c_error_dataset_with_limit_length(
        MAX_TOKEN_LENGTH)
    train = convert_c_code_fields_to_cpp_fields(train)
    valid = convert_c_code_fields_to_cpp_fields(valid)
    test = convert_c_code_fields_to_cpp_fields(test)

    train = train.sample(100)
    valid = valid.sample(100)
    test = test.sample(100)

    tokenize_fn = tokenize_by_clex_fn()

    parse_param = [
        vocab, action_list_sorted, tokenize_fn, inner_begin_id, inner_end_id
    ]
    parse_test_param = [vocab, tokenize_fn]

    train_data = parse_error_tokens_and_action_map_encoder_copy(
        train, 'train', *parse_param)
    valid_data = parse_error_tokens_and_action_map_encoder_copy(
        valid, 'valid', *parse_param)
    test_data = parse_error_tokens_and_action_map_encoder_copy(
        test, 'test', *parse_param)
    # valid_data = parse_test_tokens(valid, 'valid', *parse_test_param)
    # test_data = parse_test_tokens(test, 'test', *parse_test_param)

    train = train.loc[train_data[0].index.values]
    valid = valid.loc[valid_data[0].index.values]
    test = test.loc[test_data[0].index.values]

    train_dict = {
        'error_code_word_id': train_data[0],
        'ac_code_word_id': train_data[1],
        'token_map': train_data[2],
        'error_mask': train_data[3],
        'includes': train['includes'],
        'is_copy': train_data[4],
        'distance': train_data[5],
        'ac_code_target_id': train_data[6],
        'ac_code_target': train_data[7]
    }
    valid_dict = {
        'error_code_word_id': valid_data[0],
        'ac_code_word_id': valid_data[1],
        'token_map': valid_data[2],
        'error_mask': valid_data[3],
        'includes': valid['includes'],
        'is_copy': valid_data[4],
        'distance': valid_data[5],
        'ac_code_target_id': valid_data[6],
        'ac_code_target': valid_data[7]
    }
    test_dict = {
        'error_code_word_id': test_data[0],
        'ac_code_word_id': test_data[1],
        'token_map': test_data[2],
        'error_mask': test_data[3],
        'includes': test['includes'],
        'is_copy': test_data[4],
        'distance': test_data[5],
        'ac_code_target_id': test_data[6],
        'ac_code_target': test_data[7]
    }
    # valid_dict = {'error_code_word_id': valid_data, 'includes': valid['includes']}
    # test_dict = {'error_code_word_id': test_data, 'includes': test['includes']}

    # train_data_set = CCodeErrorDataSet(pd.DataFrame(train_dict), vocab, 'train')
    # valid_data_set = CCodeErrorDataSet(pd.DataFrame(valid_dict), vocab, 'all_valid')
    # test_data_set = CCodeErrorDataSet(pd.DataFrame(test_dict), vocab, 'all_test')
    # print(train_data[0])

    return train_dict, valid_dict, test_dict
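
The train = train.loc[train_data[0].index.values] lines re-align the source frame with the parser output: parsing can drop unparseable rows, and .loc on the surviving index filters the original frame to match. Sketched with toy data:

import pandas as pd

df = pd.DataFrame({'code': ['x', 'y', 'z']}, index=[10, 11, 12])
parsed = pd.Series(['X', 'Z'], index=[10, 12])  # row 11 failed to parse
df = df.loc[parsed.index.values]
print(df.index.tolist())  # [10, 12]
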
Example #5
def load_common_error_dataset_iterate_error_data_100(do_flatten=False,
                                                     merge_action=True,
                                                     sequence_output=False):
    vocab = create_common_error_vocabulary(
        begin_tokens=['<BEGIN>', '<INNER_BEGIN>'],
        end_tokens=['<END>', '<INNER_END>'],
        unk_token='<UNK>',
        addition_tokens=['<PAD>'])

    train, valid, test = read_fake_common_c_error_dataset_with_limit_length(
        500)

    train = convert_c_code_fields_to_cpp_fields(train, convert_include=True)
    valid = convert_c_code_fields_to_cpp_fields(valid, convert_include=True)
    test = convert_c_code_fields_to_cpp_fields(test, convert_include=True)

    train = train.sample(100)
    valid = valid.sample(100)
    test = test.sample(100)

    tokenize_fn = tokenize_by_clex_fn()
    parse_fn = parse_iterative_sample_action_error_code
    parse_param = [
        vocab, action_list_sorted_no_reverse, tokenize_fn, merge_action,
        sequence_output
    ]

    train_data = parse_fn(train, 'train', *parse_param)
    valid_data = parse_fn(valid, 'valid', *parse_param)
    test_data = parse_fn(test, 'test', *parse_param)

    train = train.loc[train_data[0].index.values]
    valid = valid.loc[valid_data[0].index.values]
    test = test.loc[test_data[0].index.values]

    train_dict = {
        'error_token_id_list': train_data[0],
        'sample_error_id_list': train_data[1],
        'sample_ac_id_list': train_data[2],
        'ac_pos_list': train_data[3],
        'error_pos_list': train_data[4],
        'includes': train['includes'],
        'distance': train['distance'],
        'ac_code_ids': train_data[5],
        'is_copy_list': train_data[6],
        'copy_pos_list': train_data[7],
        'sample_mask_list': train_data[8],
        'error_token_name_list': train_data[9],
        'id': train['id'],
        'target_ac_token_id_list': train_data[10],
        'ac_code_name_with_labels': train_data[11]
    }
    valid_dict = {
        'error_token_id_list': valid_data[0],
        'sample_error_id_list': valid_data[1],
        'sample_ac_id_list': valid_data[2],
        'ac_pos_list': valid_data[3],
        'error_pos_list': valid_data[4],
        'includes': valid['includes'],
        'distance': valid['distance'],
        'ac_code_ids': valid_data[5],
        'is_copy_list': valid_data[6],
        'copy_pos_list': valid_data[7],
        'sample_mask_list': valid_data[8],
        'error_token_name_list': valid_data[9],
        'id': valid['id'],
        'target_ac_token_id_list': valid_data[10],
        'ac_code_name_with_labels': valid_data[11]
    }
    test_dict = {
        'error_token_id_list': test_data[0],
        'sample_error_id_list': test_data[1],
        'sample_ac_id_list': test_data[2],
        'ac_pos_list': test_data[3],
        'error_pos_list': test_data[4],
        'includes': test['includes'],
        'distance': test['distance'],
        'ac_code_ids': test_data[5],
        'is_copy_list': test_data[6],
        'copy_pos_list': test_data[7],
        'sample_mask_list': test_data[8],
        'error_token_name_list': test_data[9],
        'id': test['id'],
        'target_ac_token_id_list': test_data[10],
        'ac_code_name_with_labels': test_data[11]
    }

    if do_flatten:
        train_dict = flatten_iterative_data(train_dict)
        valid_dict = flatten_iterative_data(valid_dict)
        test_dict = flatten_iterative_data(test_dict)

    return train_dict, valid_dict, test_dict
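
A minimal usage sketch; the import path is an assumption (adjust it to wherever this loader lives inside ProgramFix):

# Hypothetical import path -- the loader is defined somewhere in ProgramFix.
from experiment.experiment_dataset import load_common_error_dataset_iterate_error_data_100

train_dict, valid_dict, test_dict = load_common_error_dataset_iterate_error_data_100(
    do_flatten=False, merge_action=True, sequence_output=False)
print(sorted(train_dict.keys()))
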
Example #6
def load_common_error_data(addition_information=False, data_type=None):
    if data_type == 'deepfix':
        vocab = create_deepfix_common_error_vocabulary(
            begin_tokens=['<BEGIN>', '<INNER_BEGIN>'],
            end_tokens=['<END>', '<INNER_END>'],
            unk_token='<UNK>',
            addition_tokens=['<PAD>'])

        train, valid, test = read_fake_common_deepfix_error_dataset_with_limit_length(
            500)
        train = convert_c_code_fields_to_cpp_fields(train,
                                                    convert_include=False)
        valid = convert_c_code_fields_to_cpp_fields(valid,
                                                    convert_include=False)
        test = convert_c_code_fields_to_cpp_fields(test, convert_include=False)
    else:
        vocab = create_common_error_vocabulary(begin_tokens=['<BEGIN>'],
                                               end_tokens=['<END>'],
                                               unk_token='<UNK>',
                                               addition_tokens=['<GAP>'])
        train, valid, test = read_fake_common_c_error_dataset_with_limit_length(
            MAX_TOKEN_LENGTH)
        train = convert_c_code_fields_to_cpp_fields(train)
        valid = convert_c_code_fields_to_cpp_fields(valid)
        test = convert_c_code_fields_to_cpp_fields(test)

    tokenize_fn = tokenize_by_clex_fn()

    parse_param = [vocab, action_list_sorted, tokenize_fn]
    parse_test_param = [vocab, tokenize_fn]

    train_data = parse_error_tokens_and_action_map(train, 'train',
                                                   *parse_param)
    valid_data = parse_error_tokens_and_action_map(valid, 'valid',
                                                   *parse_param)
    test_data = parse_error_tokens_and_action_map(test, 'test', *parse_param)
    # valid_data = parse_test_tokens(valid, 'valid', *parse_test_param)
    # test_data = parse_test_tokens(test, 'test', *parse_test_param)

    train = train.loc[train_data[0].index.values]
    valid = valid.loc[valid_data[0].index.values]
    test = test.loc[test_data[0].index.values]

    train_dict = {
        'error_code_word_id': train_data[0],
        'ac_code_word_id': train_data[1],
        'token_map': train_data[2],
        'error_mask': train_data[3],
        'includes': train['includes'],
        'is_copy': train_data[4],
        'pointer_map': train_data[5],
        'distance': train_data[6],
        'error_code_word': train_data[7]
    }
    valid_dict = {
        'error_code_word_id': valid_data[0],
        'ac_code_word_id': valid_data[1],
        'token_map': valid_data[2],
        'error_mask': valid_data[3],
        'includes': valid['includes'],
        'is_copy': valid_data[4],
        'pointer_map': valid_data[5],
        'distance': valid_data[6],
        'error_code_word': valid_data[7]
    }
    test_dict = {
        'error_code_word_id': test_data[0],
        'ac_code_word_id': test_data[1],
        'token_map': test_data[2],
        'error_mask': test_data[3],
        'includes': test['includes'],
        'is_copy': test_data[4],
        'pointer_map': test_data[5],
        'distance': test_data[6],
        'error_code_word': test_data[7]
    }

    if addition_information:
        train_dict = add_c_common_code_original_info(data_dict=train_dict,
                                                     df=train)
        valid_dict = add_c_common_code_original_info(data_dict=valid_dict,
                                                     df=valid)
        test_dict = add_c_common_code_original_info(data_dict=test_dict,
                                                    df=test)

    # valid_dict = {'error_code_word_id': valid_data, 'includes': valid['includes']}
    # test_dict = {'error_code_word_id': test_data, 'includes': test['includes']}

    # train_data_set = CCodeErrorDataSet(pd.DataFrame(train_dict), vocab, 'train')
    # valid_data_set = CCodeErrorDataSet(pd.DataFrame(valid_dict), vocab, 'all_valid')
    # test_data_set = CCodeErrorDataSet(pd.DataFrame(test_dict), vocab, 'all_test')

    return train_dict, valid_dict, test_dict
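
Each returned dict maps column names to pandas Series that share one index, so it converts straight into a frame, which is what the commented-out CCodeErrorDataSet lines do with pd.DataFrame(train_dict). Illustrated with two toy columns:

import pandas as pd

cols = {
    'error_code_word_id': pd.Series([[1, 2], [3]], index=[0, 2]),
    'is_copy': pd.Series([[0, 1], [1]], index=[0, 2]),
}
frame = pd.DataFrame(cols)  # columns align on the shared index
print(frame.shape)  # (2, 2)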