Example #1
def load_deepfix_ac_data_for_generator(filter_unk=False):
    vocab = create_deepfix_common_error_vocabulary(
        begin_tokens=['<BEGIN>', '<INNER_BEGIN>'],
        end_tokens=['<END>', '<INNER_END>'],
        unk_token='<UNK>',
        addition_tokens=['<PAD>'])
    df = read_deepfix_ac_data()
    df = convert_deepfix_to_c_code(df)

    tokenize_fn = tokenize_by_clex_fn()
    parse_test_param = [vocab, tokenize_fn, True]
    df_data = parse_test_tokens(df, 'deepfix', *parse_test_param)
    df_data = list(df_data)

    if filter_unk:
        unk_id = vocab.word_to_id(vocab.unk)
        df_data[0] = df_data[0][df_data[0].map(lambda x: unk_id not in x)]

    df = df.loc[df_data[0].index.values]

    deepfix_dict = {
        'ac_token_id_list': df_data[0],
        'includes': df['includes'],
        'distance': df['errorcount'],
        'ac_token_name_list': df_data[1]
    }
    return deepfix_dict
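A minimal usage sketch for the loader above. The defining module is not shown in this listing, so the import is omitted; the loop also assumes that parse_test_tokens returns pandas Series aligned on the DataFrame index, which the .loc reindexing above suggests.

# Usage sketch (hypothetical): assumes load_deepfix_ac_data_for_generator is in scope.
deepfix_dict = load_deepfix_ac_data_for_generator(filter_unk=True)

# The values are expected to be pandas Series sharing one index, so a single
# code id selects a consistent record across the aligned fields.
for code_id in deepfix_dict['ac_token_id_list'].index[:5]:
    token_ids = deepfix_dict['ac_token_id_list'].loc[code_id]
    includes = deepfix_dict['includes'].loc[code_id]
    print(code_id, len(token_ids), includes)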
Example #2
def read_deepfix_dataset():
    tokenize_fn = tokenize_by_clex_fn()
    vocabulary = create_deepfix_common_error_vocabulary(
        begin_tokens=['<BEGIN>', '<INNER_BEGIN>'],
        end_tokens=['<END>', '<INNER_END>'],
        unk_token='<UNK>',
        addition_tokens=['<PAD>'])
    transformer = TransformVocabularyAndSLK(tokenize_fn=tokenize_fn,
                                            vocab=vocabulary)
    train_dataset, valid_dataset, test_dataset, _ = load_deepfix_sample_iterative_dataset(
        is_debug=False,
        vocabulary=vocabulary,
        mask_transformer=transformer,
        do_flatten=True,
        use_ast=True,
        do_multi_step_sample=False,
        merge_action=False)
    return train_dataset, valid_dataset, test_dataset
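A short sketch of consuming the three splits returned above. Only len() is relied on (example #4 shows these dataset objects support it); whether they are torch-style Datasets with item indexing is not assumed here.

# Sketch: report the size of each split returned by read_deepfix_dataset().
train_dataset, valid_dataset, test_dataset = read_deepfix_dataset()
for name, ds in (('train', train_dataset), ('valid', valid_dataset), ('test', test_dataset)):
    print('{}: {} samples'.format(name, len(ds)))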
Example #3
def load_customer_code_data_for_iterate(df):
    vocab = create_deepfix_common_error_vocabulary(
        begin_tokens=['<BEGIN>', '<INNER_BEGIN>'],
        end_tokens=['<END>', '<INNER_END>'],
        unk_token='<UNK>',
        addition_tokens=['<PAD>'])
    # df = read_deepfix_error_data()
    df = convert_deepfix_to_c_code(df)

    tokenize_fn = tokenize_by_clex_fn()
    parse_test_param = [vocab, tokenize_fn, True]
    df_data = parse_test_tokens(df, 'deepfix', *parse_test_param)

    df = df.loc[df_data[0].index.values]

    deepfix_dict = {
        'error_token_id_list': df_data[0],
        'includes': df['includes'],
        'distance': df['errorcount'],
        'error_token_name_list': df_data[1],
        'id': df['code_id']
    }
    return deepfix_dict
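Because this loader takes a caller-supplied DataFrame instead of reading the deepfix error split itself (note the commented-out read_deepfix_error_data call), a hypothetical input frame is sketched below. Only 'includes', 'errorcount', and 'code_id' are read by the function directly; the 'code' column name and the value formats are assumptions about what convert_deepfix_to_c_code and the tokenizer expect, not something taken from this listing.

import pandas as pd

# Hypothetical customer-code frame; columns other than 'includes', 'errorcount'
# and 'code_id' (and their value formats) are assumptions.
df = pd.DataFrame({
    'code_id': ['user-1'],
    'code': ['int main(){ printf("hi") return 0; }'],
    'includes': [['#include <stdio.h>']],
    'errorcount': [1],
})

customer_dict = load_customer_code_data_for_iterate(df)
print(customer_dict['id'].tolist())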
Example #4
    test_dataset = IterateErrorDataSet(
        df,
        vocabulary,
        'deepfix',
        transformer_vocab_slk=mask_transformer,
        do_flatten=do_flatten,
        use_ast=use_ast,
        do_multi_step_sample=do_multi_step_sample)
    info_output = "There are {} parsed samples in the deepfix dataset".format(
        len(test_dataset))
    print(info_output)
    return None, None, test_dataset, None


if __name__ == '__main__':
    vocab = create_deepfix_common_error_vocabulary(
        begin_tokens=['<BEGIN>', '<INNER_BEGIN>'],
        end_tokens=['<END>', '<INNER_END>'],
        unk_token='<UNK>',
        addition_tokens=['<PAD>'])
    tokenize_fn = tokenize_by_clex_fn()
    transformer = TransformVocabularyAndSLK(tokenize_fn=tokenize_fn,
                                            vocab=vocab)
    train_dataset = load_deepfix_ac_code_for_generate_dataset(
        is_debug=True,
        vocabulary=vocab,
        mask_transformer=transformer,
        do_flatten=True,
        use_ast=False)
    print(len(train_dataset))
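The fragment at the top of this example returns a 4-tuple in which only the test split is populated, matching the unpacking pattern of example #2; a caller simply discards the None placeholders. The call below is a sketch that reuses the keyword arguments shown in example #2 and is not taken verbatim from this listing.

# Sketch: only the third position of the returned tuple carries a dataset here,
# so the remaining positions are ignored when unpacking (cf. example #2).
_, _, test_dataset, _ = load_deepfix_sample_iterative_dataset(
    is_debug=False,
    vocabulary=vocab,
    mask_transformer=transformer,
    do_flatten=True,
    use_ast=True,
    do_multi_step_sample=False,
    merge_action=False)
print(len(test_dataset))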
Example #5
def load_common_error_data(addition_infomation=False, data_type=None):
    if data_type == 'deepfix':
        vocab = create_deepfix_common_error_vocabulary(
            begin_tokens=['<BEGIN>', '<INNER_BEGIN>'],
            end_tokens=['<END>', '<INNER_END>'],
            unk_token='<UNK>',
            addition_tokens=['<PAD>'])

        train, valid, test = read_fake_common_deepfix_error_dataset_with_limit_length(
            500)
        train = convert_c_code_fields_to_cpp_fields(train,
                                                    convert_include=False)
        valid = convert_c_code_fields_to_cpp_fields(valid,
                                                    convert_include=False)
        test = convert_c_code_fields_to_cpp_fields(test, convert_include=False)
    else:
        vocab = create_common_error_vocabulary(begin_tokens=['<BEGIN>'],
                                               end_tokens=['<END>'],
                                               unk_token='<UNK>',
                                               addition_tokens=['<GAP>'])
        train, valid, test = read_fake_common_c_error_dataset_with_limit_length(
            MAX_TOKEN_LENGTH)
        train = convert_c_code_fields_to_cpp_fields(train)
        valid = convert_c_code_fields_to_cpp_fields(valid)
        test = convert_c_code_fields_to_cpp_fields(test)

    tokenize_fn = tokenize_by_clex_fn()

    parse_param = [vocab, action_list_sorted, tokenize_fn]
    parse_test_param = [vocab, tokenize_fn]

    train_data = parse_error_tokens_and_action_map(train, 'train',
                                                   *parse_param)
    valid_data = parse_error_tokens_and_action_map(valid, 'valid',
                                                   *parse_param)
    test_data = parse_error_tokens_and_action_map(test, 'test', *parse_param)
    # valid_data = parse_test_tokens(valid, 'valid', *parse_test_param)
    # test_data = parse_test_tokens(test, 'test', *parse_test_param)

    train = train.loc[train_data[0].index.values]
    valid = valid.loc[valid_data[0].index.values]
    test = test.loc[test_data[0].index.values]

    train_dict = {
        'error_code_word_id': train_data[0],
        'ac_code_word_id': train_data[1],
        'token_map': train_data[2],
        'error_mask': train_data[3],
        'includes': train['includes'],
        'is_copy': train_data[4],
        'pointer_map': train_data[5],
        'distance': train_data[6],
        'error_code_word': train_data[7]
    }
    valid_dict = {
        'error_code_word_id': valid_data[0],
        'ac_code_word_id': valid_data[1],
        'token_map': valid_data[2],
        'error_mask': valid_data[3],
        'includes': valid['includes'],
        'is_copy': valid_data[4],
        'pointer_map': valid_data[5],
        'distance': valid_data[6],
        'error_code_word': valid_data[7]
    }
    test_dict = {
        'error_code_word_id': test_data[0],
        'ac_code_word_id': test_data[1],
        'token_map': test_data[2],
        'error_mask': test_data[3],
        'includes': test['includes'],
        'is_copy': test_data[4],
        'pointer_map': test_data[5],
        'distance': test_data[6],
        'error_code_word': test_data[7]
    }

    if addition_infomation:
        train_dict = add_c_common_code_original_info(data_dict=train_dict,
                                                     df=train)
        valid_dict = add_c_common_code_original_info(data_dict=valid_dict,
                                                     df=valid)
        test_dict = add_c_common_code_original_info(data_dict=test_dict,
                                                    df=test)

    # valid_dict = {'error_code_word_id': valid_data, 'includes': valid['includes']}
    # test_dict = {'error_code_word_id': test_data, 'includes': test['includes']}

    # train_data_set = CCodeErrorDataSet(pd.DataFrame(train_dict), vocab, 'train')
    # valid_data_set = CCodeErrorDataSet(pd.DataFrame(valid_dict), vocab, 'all_valid')
    # test_data_set = CCodeErrorDataSet(pd.DataFrame(test_dict), vocab, 'all_test')

    return train_dict, valid_dict, test_dict
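A hypothetical usage sketch for load_common_error_data. The dictionary keys are the ones built above; the assumption that each value is a pandas Series sharing one index follows from the .loc reindexing in the function, and the import of the loader itself is omitted because its defining module is not shown in this listing.

# Sketch: load the deepfix-style fake common-error splits and peek at one record.
train_dict, valid_dict, test_dict = load_common_error_data(data_type='deepfix')

first_id = train_dict['error_code_word_id'].index[0]
print(train_dict['error_code_word_id'].loc[first_id][:10])  # token ids of the error code
print(train_dict['ac_code_word_id'].loc[first_id][:10])     # token ids of the corresponding ac code
print(train_dict['distance'].loc[first_id])                  # 'distance' field produced by the parser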