def load_deepfix_masked_datadict(sample_count=None):
    """Load the masked DeepFix splits as (train, valid, test) data dicts.

    :param sample_count: optional positive int; when given, each split is
        randomly down-sampled to this many rows before any processing.
    :return: three dicts, each merging the parsed masked-code columns with
        the split's ``includes``, ``id`` and ``masked_positions`` columns.
    """
    vocab = load_deepfix_common_error_vocabulary()
    tokenize_fn = tokenize_by_clex_fn()
    position_dict = read_deepfix_masked_position()

    splits = read_fake_common_deepfix_error_dataset_with_limit_length(500)

    def prepare_df(df):
        # Sample first so the field conversion and position lookup only
        # run on the rows we actually keep.
        if sample_count is not None and sample_count > 0:
            df = df.sample(sample_count)
        df = convert_c_code_fields_to_cpp_fields(df, convert_include=False)
        return add_masked_position_column(df, position_dict)

    def to_datadict(df):
        parsed = parse_masked_code(df, vocab, tokenize_fn)
        return {**parsed,
                'includes': df['includes'],
                'id': df['id'],
                'masked_positions': df['masked_positions']}

    train, valid, test = (prepare_df(split) for split in splits)
    return to_datadict(train), to_datadict(valid), to_datadict(test)
def load_fake_deepfix_dataset_iterate_error_data(is_debug=False):
    """Load the fake DeepFix error dataset as (train, valid, test) dicts.

    NOTE(review): another function with this exact name is defined later in
    this module; at import time the last definition wins — confirm which one
    callers intend to use.

    :param is_debug: when True, down-sample every split to 100 rows so a
        full pipeline run finishes quickly.
    :return: three dicts with token id/name lists plus the per-row
        ``includes``, ``distance`` and ``id`` columns.
    """
    vocab = create_deepfix_common_error_vocabulary(
        begin_tokens=['<BEGIN>', '<INNER_BEGIN>'],
        end_tokens=['<END>', '<INNER_END>'],
        unk_token='<UNK>',
        addition_tokens=['<PAD>'])

    train, valid, test = read_fake_common_deepfix_error_dataset_with_limit_length(
        500)

    if is_debug:
        train = train.sample(100)
        valid = valid.sample(100)
        test = test.sample(100)

    tokenize_fn = tokenize_by_clex_fn()
    add_begin_end_label = True
    parse_param = [vocab, tokenize_fn, add_begin_end_label]

    train_data = parse_xy_sequence(train, 'train', *parse_param)
    valid_data = parse_xy_sequence(valid, 'valid', *parse_param)
    test_data = parse_xy_sequence(test, 'test', *parse_param)

    # Parsing may drop unparsable rows; realign each frame on the index of
    # the rows that survived.
    train = train.loc[train_data[0].index.values]
    valid = valid.loc[valid_data[0].index.values]
    test = test.loc[test_data[0].index.values]

    def _build_dict(df, data):
        # Single definition of the output schema (was triplicated verbatim
        # for train/valid/test).
        return {
            'error_token_id_list': data[0],
            'error_token_name_list': data[1],
            'target_token_id_list': data[2],
            'target_token_name_list': data[3],
            'includes': df['includes'],
            'distance': df['distance'],
            'id': df['id'],
        }

    return (_build_dict(train, train_data),
            _build_dict(valid, valid_data),
            _build_dict(test, test_data))
def read_deepfix_modify_action_token():
    """Return, for each training row, the list of ``to_char`` values taken
    from its JSON-encoded ``modify_action_list`` column.
    """
    train_df, _, _ = read_fake_common_deepfix_error_dataset_with_limit_length(
        500)
    # The column is stored as a JSON string; decode it in place.
    train_df['modify_action_list'] = train_df['modify_action_list'].map(
        json.loads)
    # PEP 8 (E731): a nested comprehension replaces the named lambda plus
    # per-row helper calls of the original.
    return [[act['to_char'] for act in actions]
            for actions in train_df['modify_action_list']]
def get_deepfix_train_error_tokens_without_includes():
    """Tokenize every training ``code`` string and return the token texts.

    Each lex token's text is read from its ``.value`` attribute.
    (Presumably the ``code`` column already has include lines stripped,
    per the function name — TODO confirm against the dataset loader.)
    """
    train_df, _, _ = read_fake_common_deepfix_error_dataset_with_limit_length(
        500)
    tokenize_fn = tokenize_by_clex_fn()
    # PEP 8 (E731): nested comprehension instead of a named lambda.
    return [[token.value for token in tokenize_fn(code)]
            for code in train_df['code']]
# Example #5 (scrape artifact: original header "Пример #5" and a stray
# vote-count "0" line; commented out so the module stays importable)
def load_fake_deepfix_dataset_iterate_error_data(do_flatten=False,
                                                 merge_action=True,
                                                 sequence_output=False):
    """Load the iterative-sample DeepFix error dataset as three data dicts.

    NOTE(review): this redefines a function of the same name declared
    earlier in the module; the later definition wins at import time —
    confirm which variant callers expect.

    :param do_flatten: when True, pass each dict through
        ``flatten_iterative_data`` before returning.
    :param merge_action: forwarded to the parse function.
    :param sequence_output: forwarded to the parse function.
    :return: (train_dict, valid_dict, test_dict).
    """
    vocab = load_deepfix_common_error_vocabulary()

    train, valid, test = read_fake_common_deepfix_error_dataset_with_limit_length(
        500)

    train = convert_c_code_fields_to_cpp_fields(train, convert_include=False)
    valid = convert_c_code_fields_to_cpp_fields(valid, convert_include=False)
    test = convert_c_code_fields_to_cpp_fields(test, convert_include=False)

    tokenize_fn = tokenize_by_clex_fn()
    parse_fn = parse_iterative_sample_action_error_code
    parse_param = [
        vocab, action_list_sorted_no_reverse, tokenize_fn, merge_action,
        sequence_output
    ]

    train_data = parse_fn(train, 'train', *parse_param)
    valid_data = parse_fn(valid, 'valid', *parse_param)
    test_data = parse_fn(test, 'test', *parse_param)

    # Parsing may drop unparsable rows; realign each frame on the index of
    # the rows that survived.
    train = train.loc[train_data[0].index.values]
    valid = valid.loc[valid_data[0].index.values]
    test = test.loc[test_data[0].index.values]

    def _build_dict(df, data):
        # Single definition of the 16-key output schema (was triplicated
        # verbatim for train/valid/test).
        return {
            'error_token_id_list': data[0],
            'sample_error_id_list': data[1],
            'sample_ac_id_list': data[2],
            'ac_pos_list': data[3],
            'error_pos_list': data[4],
            'includes': df['includes'],
            'distance': df['distance'],
            'ac_code_ids': data[5],
            'is_copy_list': data[6],
            'copy_pos_list': data[7],
            'sample_mask_list': data[8],
            'error_token_name_list': data[9],
            'id': df['id'],
            'target_ac_token_id_list': data[10],
            'ac_code_name_with_labels': data[11],
        }

    train_dict = _build_dict(train, train_data)
    valid_dict = _build_dict(valid, valid_data)
    test_dict = _build_dict(test, test_data)

    if do_flatten:
        train_dict = flatten_iterative_data(train_dict)
        valid_dict = flatten_iterative_data(valid_dict)
        test_dict = flatten_iterative_data(test_dict)

    return train_dict, valid_dict, test_dict
# Example #6 (scrape artifact: original header "Пример #6" and a stray
# vote-count "0" line; commented out so the module stays importable)
def load_common_error_data(addition_infomation=False, data_type=None):
    """Load the common-error dataset as (train, valid, test) data dicts.

    :param addition_infomation: when True, attach the original code info to
        each dict via ``add_c_common_code_original_info``. (Parameter name
        is misspelled but kept for caller compatibility.)
    :param data_type: ``'deepfix'`` selects the DeepFix variant (length
        limit 500, includes left unconverted); any other value selects the
        plain C dataset limited to ``MAX_TOKEN_LENGTH``.
    :return: (train_dict, valid_dict, test_dict).
    """
    if data_type == 'deepfix':
        vocab = create_deepfix_common_error_vocabulary(
            begin_tokens=['<BEGIN>', '<INNER_BEGIN>'],
            end_tokens=['<END>', '<INNER_END>'],
            unk_token='<UNK>',
            addition_tokens=['<PAD>'])
        train, valid, test = read_fake_common_deepfix_error_dataset_with_limit_length(
            500)
        # DeepFix code keeps its include lines untouched.
        train = convert_c_code_fields_to_cpp_fields(train,
                                                    convert_include=False)
        valid = convert_c_code_fields_to_cpp_fields(valid,
                                                    convert_include=False)
        test = convert_c_code_fields_to_cpp_fields(test, convert_include=False)
    else:
        vocab = create_common_error_vocabulary(begin_tokens=['<BEGIN>'],
                                               end_tokens=['<END>'],
                                               unk_token='<UNK>',
                                               addition_tokens=['<GAP>'])
        train, valid, test = read_fake_common_c_error_dataset_with_limit_length(
            MAX_TOKEN_LENGTH)
        train = convert_c_code_fields_to_cpp_fields(train)
        valid = convert_c_code_fields_to_cpp_fields(valid)
        test = convert_c_code_fields_to_cpp_fields(test)

    tokenize_fn = tokenize_by_clex_fn()
    parse_param = [vocab, action_list_sorted, tokenize_fn]

    train_data = parse_error_tokens_and_action_map(train, 'train',
                                                   *parse_param)
    valid_data = parse_error_tokens_and_action_map(valid, 'valid',
                                                   *parse_param)
    test_data = parse_error_tokens_and_action_map(test, 'test', *parse_param)

    # Parsing may drop unparsable rows; realign each frame on the index of
    # the rows that survived.
    train = train.loc[train_data[0].index.values]
    valid = valid.loc[valid_data[0].index.values]
    test = test.loc[test_data[0].index.values]

    def _build_dict(df, data):
        # Single definition of the output schema (was triplicated verbatim
        # for train/valid/test).
        return {
            'error_code_word_id': data[0],
            'ac_code_word_id': data[1],
            'token_map': data[2],
            'error_mask': data[3],
            'includes': df['includes'],
            'is_copy': data[4],
            'pointer_map': data[5],
            'distance': data[6],
            'error_code_word': data[7],
        }

    train_dict = _build_dict(train, train_data)
    valid_dict = _build_dict(valid, valid_data)
    test_dict = _build_dict(test, test_data)

    if addition_infomation:
        train_dict = add_c_common_code_original_info(data_dict=train_dict,
                                                     df=train)
        valid_dict = add_c_common_code_original_info(data_dict=valid_dict,
                                                     df=valid)
        test_dict = add_c_common_code_original_info(data_dict=test_dict,
                                                    df=test)

    return train_dict, valid_dict, test_dict