Example #1
def load_deepfix_ac_data_for_generator(filter_unk=False):
    vocab = create_deepfix_common_error_vocabulary(
        begin_tokens=['<BEGIN>', '<INNER_BEGIN>'],
        end_tokens=['<END>', '<INNER_END>'],
        unk_token='<UNK>',
        addition_tokens=['<PAD>'])
    df = read_deepfix_ac_data()
    df = convert_deepfix_to_c_code(df)

    tokenize_fn = tokenize_by_clex_fn()
    parse_test_param = [vocab, tokenize_fn, True]
    df_data = parse_test_tokens(df, 'deepfix', *parse_test_param)
    df_data = list(df_data)

    if filter_unk:
        unk_id = vocab.word_to_id(vocab.unk)
        df_data[0] = df_data[0][df_data[0].map(lambda x: unk_id not in x)]

    df = df.loc[df_data[0].index.values]

    deepfix_dict = {
        'ac_token_id_list': df_data[0],
        'includes': df['includes'],
        'distance': df['errorcount'],
        'ac_token_name_list': df_data[1]
    }
    return deepfix_dict
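
A minimal usage sketch for the loader above (assuming it is importable from the project and that pandas is installed). The returned dict maps column names to pandas Series sharing one index, so it can be wrapped into a DataFrame directly:

# Hedged usage sketch, not part of the original module.
import pandas as pd

deepfix_dict = load_deepfix_ac_data_for_generator(filter_unk=True)
deepfix_df = pd.DataFrame(deepfix_dict)
# columns: ac_token_id_list, includes, distance, ac_token_name_list
print(len(deepfix_df), deepfix_df.columns.tolist())
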
def load_deepfix_masked_datadict(sample_count=None):
    vocab = load_deepfix_common_error_vocabulary()
    train, valid, test = read_fake_common_deepfix_error_dataset_with_limit_length(500)

    tokenize_fn = tokenize_by_clex_fn()
    position_dict = read_deepfix_masked_position()

    def prepare_df(df):
        if sample_count is not None and sample_count > 0:
            df = df.sample(sample_count)
        df = convert_c_code_fields_to_cpp_fields(df, convert_include=False)
        df = add_masked_position_column(df, position_dict)
        return df

    train = prepare_df(train)
    valid = prepare_df(valid)
    test = prepare_df(test)

    parse_param = (vocab, tokenize_fn)

    train_data = parse_masked_code(train, *parse_param)
    valid_data = parse_masked_code(valid, *parse_param)
    test_data = parse_masked_code(test, *parse_param)

    train_dict = {**train_data, 'includes': train['includes'], 'id': train['id'],
                  'masked_positions': train['masked_positions']}
    valid_dict = {**valid_data, 'includes': valid['includes'], 'id': valid['id'],
                  'masked_positions': valid['masked_positions']}
    test_dict = {**test_data, 'includes': test['includes'], 'id': test['id'],
                 'masked_positions': test['masked_positions']}

    return train_dict, valid_dict, test_dict
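
A quick sanity-check sketch (assuming the function above is importable); each returned split mixes the fields produced by parse_masked_code with raw DataFrame columns such as includes, id and masked_positions:

# Hedged sketch: inspect keys and sizes of the three splits.
splits = load_deepfix_masked_datadict(sample_count=100)
for name, split in zip(('train', 'valid', 'test'), splits):
    print(name, sorted(split.keys()), len(split['id']))
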
def preprocess_code(code, cpp_file_path=COMPILE_TMP_PATH, tokenize_fn=None):
    if not compile_c_code_by_gcc(code, cpp_file_path):
        return None, None, None, None, None
    code = init_code(code)
    if not compile_c_code_by_gcc(code, cpp_file_path):
        return None, None, None, None, None
    before_code = code
    after_code = before_code
    error_count_range = (1, 9)
    if tokenize_fn is None:
        tokenize_fn = tokenize_by_clex_fn()

    count = 0
    action_maplist = []
    error_character_maplist = []
    error_count = -1
    while compile_c_code_by_gcc(after_code, cpp_file_path):
        cod = before_code
        # cod = remove_blank(cod)
        # cod = remove_comments(cod)
        # cod = remove_blank_line(cod)
        count += 1
        # before_code = cod
        before_code, after_code, action_maplist, error_character_maplist, error_count = create_error_code(
            cod, error_count_range=error_count_range, tokenize_fn=tokenize_fn)
        if before_code is None:
            return None, None, None, None, None
        if count > compile_max_count:
            return None, None, None, None, None

    return before_code, after_code, action_maplist, error_character_maplist, error_count
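
preprocess_code signals failure by returning five Nones, so callers normally test the first element before using the rest. A minimal caller sketch, with the source string and temporary file path as placeholders:

# Hedged caller sketch for the all-None failure convention.
source = 'int main() { return 0; }'  # placeholder program
result = preprocess_code(source, cpp_file_path='/tmp/sample.c')
before_code, after_code, action_maplist, error_character_maplist, error_count = result
if before_code is None:
    print('preprocessing failed, skipping this sample')
else:
    print('generated an error version with {} injected errors'.format(error_count))
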
    def __init__(self,
                 vocabulary,
                 mask_language_model_param,
                 detect_token_model_param,
                 train_type,
                 ignore_id,
                 pad_id,
                 check_error_task,
                 only_predict_masked=False):
        '''
        :param vocabulary:
        :param mask_language_model_param:
        :param detect_token_model_param:
        :param train_type: 'gene', 'only_disc', 'both', 'none'
        '''
        super().__init__()
        self.vocabulary = vocabulary
        self.generator = MaskedLanguageModel(**mask_language_model_param)
        self.discriminator = ErrorDetectorModel(**detect_token_model_param)
        self.ignore_id = ignore_id
        self.pad_id = pad_id
        self.check_error_task = check_error_task
        self.only_predict_masked = only_predict_masked
        from common.pycparser_util import tokenize_by_clex_fn
        self.tokenize_fn = tokenize_by_clex_fn()

        self.train_type = ''
        self.change_model_train_type(train_type)
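
The ignore_id and pad_id fields are the usual ingredients for masking positions out of a token-level loss. The snippet below is only a generic PyTorch illustration of that idea (it is not the project's actual loss code): targets equal to the ignore index simply drop out of the cross-entropy average.

# Generic illustration of ignore_index masking, not the project's loss.
import torch
import torch.nn.functional as F

ignore_id = -100
logits = torch.randn(2, 5, 10)               # (batch, seq_len, vocab_size)
targets = torch.randint(0, 10, (2, 5))
targets[:, -2:] = ignore_id                  # e.g. padded or unmasked positions
loss = F.cross_entropy(logits.view(-1, 10), targets.view(-1), ignore_index=ignore_id)
print(loss.item())
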
Example #5
def tokenize_code(df):
    tokenize_fn = tokenize_by_clex_fn()
    df['similar_tokenize'] = df['similar_code_without_include'].map(
        tokenize_fn)
    df = df[df['similar_tokenize'].map(lambda x: x is not None)]
    df['sample_tokenize'] = df['sample_code'].map(tokenize_fn)
    df = df[df['sample_tokenize'].map(lambda x: x is not None)]
    return df
def read_fake_random_c_error_dataset_with_limit_length(limit_length=500):
    dfs = read_fake_random_c_error_dataset()
    tokenize_fn = tokenize_by_clex_fn()

    train, valid, test = [
        filter_length(df, limit_length, tokenize_fn) for df in dfs
    ]
    return train, valid, test
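
filter_length appears in most of these loaders; its implementation is not shown here, so the sketch below is only a guess at its shape (the column name is an assumption): tokenize each record and keep the rows whose token count stays under the limit.

# Hypothetical sketch of filter_length; the real helper may differ.
def filter_length_sketch(df, limit_length, tokenize_fn, code_field='similar_code'):
    def short_enough(code):
        tokens = tokenize_fn(code)
        return tokens is not None and len(tokens) < limit_length
    return df[df[code_field].map(short_enough)]
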
def load_fake_deepfix_dataset_iterate_error_data(is_debug=False):
    vocab = create_deepfix_common_error_vocabulary(
        begin_tokens=['<BEGIN>', '<INNER_BEGIN>'],
        end_tokens=['<END>', '<INNER_END>'],
        unk_token='<UNK>',
        addition_tokens=['<PAD>'])

    train, valid, test = read_fake_common_deepfix_error_dataset_with_limit_length(
        500)

    if is_debug:
        train = train.sample(100)
        valid = valid.sample(100)
        test = test.sample(100)

    tokenize_fn = tokenize_by_clex_fn()
    parse_fn = parse_xy_sequence
    add_begin_end_label = True
    parse_param = [vocab, tokenize_fn, add_begin_end_label]

    train_data = parse_fn(train, 'train', *parse_param)
    valid_data = parse_fn(valid, 'valid', *parse_param)
    test_data = parse_fn(test, 'test', *parse_param)

    train = train.loc[train_data[0].index.values]
    valid = valid.loc[valid_data[0].index.values]
    test = test.loc[test_data[0].index.values]

    train_dict = {
        'error_token_id_list': train_data[0],
        'error_token_name_list': train_data[1],
        'target_token_id_list': train_data[2],
        'target_token_name_list': train_data[3],
        'includes': train['includes'],
        'distance': train['distance'],
        'id': train['id'],
    }
    valid_dict = {
        'error_token_id_list': valid_data[0],
        'error_token_name_list': valid_data[1],
        'target_token_id_list': valid_data[2],
        'target_token_name_list': valid_data[3],
        'includes': valid['includes'],
        'distance': valid['distance'],
        'id': valid['id'],
    }
    test_dict = {
        'error_token_id_list': test_data[0],
        'error_token_name_list': test_data[1],
        'target_token_id_list': test_data[2],
        'target_token_name_list': test_data[3],
        'includes': test['includes'],
        'distance': test['distance'],
        'id': test['id'],
    }

    return train_dict, valid_dict, test_dict
def read_filter_without_include_ac_token():
    train_df, _, _ = read_fake_common_c_error_dataset_with_limit_length(500)
    transform_lextoken_to_token_fn = lambda token_list: [
        i.value for i in token_list
    ]
    tokenize_fn = tokenize_by_clex_fn()
    parse_tokens = [
        transform_lextoken_to_token_fn(tokenize_fn(code))
        for code in train_df['similar_code']
    ]
    return parse_tokens
def get_deepfix_train_error_tokens_without_includes():
    train_df, _, _ = read_fake_common_deepfix_error_dataset_with_limit_length(
        500)
    transform_lextoken_to_token_fn = lambda token_list: [
        i.value for i in token_list
    ]
    tokenize_fn = tokenize_by_clex_fn()
    parse_tokens = [
        transform_lextoken_to_token_fn(tokenize_fn(code))
        for code in train_df['code']
    ]
    return parse_tokens
def read_fake_common_deepfix_error_dataset_with_limit_length(limit_length=500, random_seed=100):
    data_df = read_fake_deepfix_common_error_records()

    tokenize_fn = tokenize_by_clex_fn()
    data_df = filter_length(data_df, limit_length, tokenize_fn)
    print('after filter code length: {}'.format(len(data_df)))

    valid_df = data_df.sample(frac=0.05, random_state=random_seed)
    data_df = data_df.drop(valid_df.index)
    test_df = data_df.sample(frac=0.05, random_state=random_seed)
    train_df = data_df.drop(test_df.index)

    return train_df, valid_df, test_df
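
Note that valid and test are each drawn with frac=0.05, but the test sample is taken from the 95% that remains after removing valid, so the final proportions are roughly 5% / 4.75% / 90.25%. A self-contained pandas illustration of the same splitting logic:

# Self-contained illustration of the double frac=0.05 split.
import pandas as pd

df = pd.DataFrame({'id': range(1000)})
valid = df.sample(frac=0.05, random_state=100)    # 50 rows
rest = df.drop(valid.index)
test = rest.sample(frac=0.05, random_state=100)   # ~48 rows, 5% of the remainder
train = rest.drop(test.index)
print(len(train), len(valid), len(test))          # roughly 902 / 50 / 48
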
Example #11
def sample_masked_position_main():
    data_df = read_fake_deepfix_common_error_records()
    data_df = convert_c_code_fields_to_cpp_fields(data_df)

    tokenize_fn = tokenize_by_clex_fn()
    data_df = tokenize_ac_code(data_df, tokenize_fn)

    data_df['ac_code_length'] = data_df['ac_code_name'].map(len)
    data_df['masked_positions'] = data_df['ac_code_length'].map(lambda l: random_position(l, frac=0.4))
    data_df['masked_positions_token'] = data_df.apply(lambda one: [one['ac_code_name'][pos] for pos in one['masked_positions']], axis=1)

    data_dict = {one['id']: (one['masked_positions'], one['masked_positions_token']) for i, one in data_df.iterrows()}
    # data_dict = {i: (masked_poses, masked_toks) for i, masked_poses, masked_toks in zip(
    #              data_df['id'].tolist(), data_df['masked_positions'].tolist(), data_df['masked_tokens'].tolist())}

    save_sample_masked_position_dict(data_dict, save_path=deepfix_masked_position_path)
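
random_position is not shown in this excerpt; a hypothetical minimal version (an assumption, not the project's implementation) would sample a fraction of the token indices to mask:

# Hypothetical sketch of random_position; the real helper may differ.
import random

def random_position_sketch(length, frac=0.4):
    k = max(1, int(length * frac)) if length > 0 else 0
    return sorted(random.sample(range(length), k))
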
Example #12
def main():
    begin_tokens = ['<BEGIN>']
    end_tokens = ['<END>']
    unk_token = '<UNK>'
    addition_tokens = ['<GAP>']
    vocabulary = create_common_error_vocabulary(
        begin_tokens=begin_tokens,
        end_tokens=end_tokens,
        unk_token=unk_token,
        addition_tokens=addition_tokens)
    tokenize_fn = tokenize_by_clex_fn()
    transformer = TransformVocabularyAndSLK(vocabulary, tokenize_fn)

    code = r'''
    int main(){
        int a = 0;
        a = 1 
    '''
    token_list = tokenize_fn(code)
    print(token_list)
    token_list = iter(token_list)
    t_parser = transformer.create_new_slk_iterator()
    for t, type_id in t_parser:
        print(t)
        print(type_id)
        try:
            tt = next(token_list)
            print(tt)
            t_parser.add_token(tt)
        except StopIteration:
            break
    t_parser_1 = copy.deepcopy(t_parser)
    t_parser_2 = copy.deepcopy(t_parser)

    t_parser_1.add_token(
        transformer.id_to_token_dict[transformer.vocab.word_to_id('+')])
    t, type_id = next(t_parser_1)
    print('t1', t)
    print('t1', type_id)

    t_parser_2.add_token(
        transformer.id_to_token_dict[transformer.vocab.word_to_id(';')])
    t, type_id = next(t_parser_2)
    print('t2', t)
    print('t2', type_id)
Example #13
def load_grammar_sample_common_error_data():
    """
    not finish
    :return:
    """
    vocab = create_common_error_vocabulary(begin_tokens=['<BEGIN>'],
                                           end_tokens=['<END>'],
                                           unk_token='<UNK>',
                                           addition_tokens=['<GAP>'])
    train_df, valid_df, test_df = read_grammar_sample_error_data()
    train_df = convert_c_code_fields_to_cpp_fields(train_df,
                                                   convert_include=False)
    valid_df = convert_c_code_fields_to_cpp_fields(valid_df,
                                                   convert_include=False)
    test_df = convert_c_code_fields_to_cpp_fields(test_df,
                                                  convert_include=False)

    tokenize_fn = tokenize_by_clex_fn()
Example #14
def read_deepfix_dataset():
    tokenize_fn = tokenize_by_clex_fn()
    vocabulary = create_deepfix_common_error_vocabulary(
        begin_tokens=['<BEGIN>', '<INNER_BEGIN>'],
        end_tokens=['<END>', '<INNER_END>'],
        unk_token='<UNK>',
        addition_tokens=['<PAD>'])
    transformer = TransformVocabularyAndSLK(tokenize_fn=tokenize_fn,
                                            vocab=vocabulary)
    train_dataset, valid_dataset, test_dataset, _ = load_deepfix_sample_iterative_dataset(
        is_debug=False,
        vocabulary=vocabulary,
        mask_transformer=transformer,
        do_flatten=True,
        use_ast=True,
        do_multi_step_sample=False,
        merge_action=False)
    return train_dataset, valid_dataset, test_dataset
Example #15
def load_deepfix_error_data():
    vocab = create_common_error_vocabulary(begin_tokens=['<BEGIN>'],
                                           end_tokens=['<END>'],
                                           unk_token='<UNK>',
                                           addition_tokens=['<GAP>'])
    df = read_deepfix_error_data()
    df = convert_deepfix_to_c_code(df)

    tokenize_fn = tokenize_by_clex_fn()
    parse_test_param = [vocab, tokenize_fn]
    df_data = parse_test_tokens(df, 'deepfix', *parse_test_param)

    df = df.loc[df_data[0].index.values]

    deepfix_dict = {
        'error_code_word_id': df_data[0],
        'includes': df['includes'],
        'distance': df['errorcount'],
        'error_code_word_name': df_data[1]
    }
    return deepfix_dict
Example #16
def load_generate_code_for_solver_model_iterate_data(df,
                                                     convert_field_fn=None,
                                                     convert_field_dict={},
                                                     do_flatten=False,
                                                     vocabulary=None):
    if convert_field_fn is not None:
        df = convert_field_fn(df, **convert_field_dict)
    df['action_character_list'] = df['action_character_list'].map(
        convert_action_map_to_old_action)

    tokenize_fn = tokenize_by_clex_fn()
    parse_fn = parse_iterative_sample_action_error_code
    parse_param = [vocabulary, action_list_sorted_no_reverse, tokenize_fn]

    df_data = parse_fn(df, 'train', *parse_param)
    df = df.loc[df_data[0].index.values]

    df_dict = {
        'error_token_id_list': df_data[0],
        'sample_error_id_list': df_data[1],
        'sample_ac_id_list': df_data[2],
        'ac_pos_list': df_data[3],
        'error_pos_list': df_data[4],
        'includes': df['includes'],
        'distance': df['distance'],
        'ac_code_ids': df_data[5],
        'is_copy_list': df_data[6],
        'copy_pos_list': df_data[7],
        'sample_mask_list': df_data[8],
        'error_token_name_list': df_data[9],
        'id': df['id'],
        'target_ac_token_id_list': df_data[10],
        'ac_code_name_with_labels': df_data[11],
    }

    if do_flatten:
        df_dict = flatten_iterative_data(df_dict)
    return df_dict
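
The convert_field_fn hook is called as convert_field_fn(df, **convert_field_dict), so any column preparation can be injected before parsing. A hypothetical callback (the column names are illustrative only):

# Hypothetical field-conversion callback for load_generate_code_for_solver_model_iterate_data.
def rename_fields(df, column_map=None):
    return df.rename(columns=column_map or {})

# df_dict = load_generate_code_for_solver_model_iterate_data(
#     df,
#     convert_field_fn=rename_fields,
#     convert_field_dict={'column_map': {'generated_code': 'code'}},
#     do_flatten=True,
#     vocabulary=vocab)
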
Example #17
def load_customer_code_data_for_iterate(df):
    vocab = create_deepfix_common_error_vocabulary(
        begin_tokens=['<BEGIN>', '<INNER_BEGIN>'],
        end_tokens=['<END>', '<INNER_END>'],
        unk_token='<UNK>',
        addition_tokens=['<PAD>'])
    # df = read_deepfix_error_data()
    df = convert_deepfix_to_c_code(df)

    tokenize_fn = tokenize_by_clex_fn()
    parse_test_param = [vocab, tokenize_fn, True]
    df_data = parse_test_tokens(df, 'deepfix', *parse_test_param)

    df = df.loc[df_data[0].index.values]

    deepfix_dict = {
        'error_token_id_list': df_data[0],
        'includes': df['includes'],
        'distance': df['errorcount'],
        'error_token_name_list': df_data[1],
        'id': df['code_id']
    }
    return deepfix_dict
def make_fake_code(que_read: mp.Queue, que_write: mp.Queue, ind: int):
    preprocess_logger.info('Start Make Fake Code Process {}'.format(ind))
    tmp_code_file_path = os.path.join(COMPILE_TMP_PATH,
                                      'code' + str(ind) + '.c')
    timeout_count = 0
    count = 0
    success_count = 0
    err_count = 0
    fail_count = 0
    repeat_count = 0
    tokenize_fn = tokenize_by_clex_fn()
    while True:
        if timeout_count >= 5:
            break

        if count % 10 == 0:
            preprocess_logger.info(
                "Process {} | count: {} | error_count: {} | fail_count: {} | repeat_count: {}"
                .format(ind, count, err_count, fail_count, repeat_count))

        try:
            item = que_read.get(timeout=600)
        except queue.Empty:
            timeout_count += 1
            continue
        except TimeoutError:
            timeout_count += 1
            continue

        timeout_count = 0
        count += 1
        if not item:
            repeat_count += 1
            que_write.put(None)
            continue

        # item['originalcode'] = item['originalcode'].replace('\ufeff', '').replace('\u3000', ' ')

        try:
            before_code, after_code, action_maplist, error_character_maplist, error_count = preprocess_code(
                item['originalcode'],
                cpp_file_path=tmp_code_file_path,
                tokenize_fn=tokenize_fn)
        except Exception as e:
            preprocess_logger.info('error info: ' + str(e))
            before_code = None
            after_code = None
            action_maplist = None
            error_character_maplist = None
            error_count = 1

        count += 1
        if before_code:
            success_count += 1
            item['ac_code'] = before_code
            item['code'] = after_code
            item['error_count'] = error_count
            error_list = list(
                map(lambda x: x.__dict__(), error_character_maplist))
            action_list = list(map(lambda x: x.__dict__(), action_maplist))
            item['error_character_maplist'] = error_list
            item['action_maplist'] = action_list
            que_write.put(item)
        else:
            item['try_count'] += 1
            if item['try_count'] < error_max_count:
                err_count += 1
                que_read.put(item)
            else:
                fail_count += 1
                que_write.put(None)

    preprocess_logger.info(
        "Process {} | count: {} | error_count: {} | fail_count: {} | repeat_count: {}"
        .format(ind, count, err_count, fail_count, repeat_count))
    preprocess_logger.info('End Make Fake Code Process {}'.format(ind))
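
make_fake_code is written as a queue-driven worker, so a driver only has to fill que_read, start the processes and drain que_write. The sketch below uses the standard multiprocessing module; the record layout (originalcode, try_count) is inferred from the worker above and everything else is an assumption:

# Hedged driver sketch for the make_fake_code worker.
import multiprocessing as mp

def run_make_fake_code_sketch(records, worker_count=4):
    que_read, que_write = mp.Queue(), mp.Queue()
    for rec in records:
        rec.setdefault('try_count', 0)        # the worker expects a retry counter
        que_read.put(rec)
    workers = [mp.Process(target=make_fake_code, args=(que_read, que_write, ind))
               for ind in range(worker_count)]
    for w in workers:
        w.start()
    # each input record eventually produces exactly one entry on que_write
    results = [que_write.get() for _ in range(len(records))]
    for w in workers:
        w.terminate()                         # workers otherwise exit only after queue timeouts
    return [r for r in results if r is not None]
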
Example #19
def multi_step_evaluate(model,
                        dataset,
                        batch_size,
                        parse_input_batch_data_fn,
                        parse_target_batch_data_fn,
                        do_sample=False,
                        print_output=False,
                        create_output_ids_fn=None,
                        evaluate_obj_list=[],
                        expand_output_and_target_fn=None,
                        max_step_times=0,
                        vocabulary=None,
                        file_path='',
                        create_multi_step_next_input_batch_fn=None,
                        extract_includes_fn=lambda x: x['includes'],
                        print_output_fn=None,
                        do_beam_search=False,
                        target_file_path='main.out',
                        log_file_path='main.log',
                        do_save_data=False,
                        max_save_distance=None,
                        save_records_to_database=False,
                        db_path='',
                        table_name='',
                        change_output_records_to_batch_fn=None,
                        create_save_database_records_fn=None,
                        error_stop_type='normal'):
    total_loss = to_cuda(torch.Tensor([0]))
    total_batch = to_cuda(torch.Tensor([0]))
    steps = 0
    compile_evaluator = CompileResultEvaluate()
    compile_evaluator.clear_result()
    for o in evaluate_obj_list:
        o.clear_result()

    model.eval()

    from common.pycparser_util import tokenize_by_clex_fn
    tokenize_fn = tokenize_by_clex_fn()
    save_data_dict = {}
    save_records_list = []

    # file_path = add_pid_to_file_path(file_path)
    # target_file_path = add_pid_to_file_path(target_file_path)

    with tqdm(total=len(dataset)) as pbar:
        with torch.no_grad():
            for batch_data in data_loader(dataset,
                                          batch_size=batch_size,
                                          drop_last=False):
                model.zero_grad()

                input_data = batch_data.copy()
                final_output_list = []
                output_records_list = []
                continue_list = [True for _ in range(batch_size)]
                result_list = [False for _ in range(batch_size)]
                result_records_list = []
                sample_steps = [-1 for _ in range(batch_size)]
                error_count_list = batch_data['error_count']

                for i in range(max_step_times):
                    model_input = parse_input_batch_data_fn(input_data,
                                                            do_sample=True)

                    model_output = model.forward(*model_input,
                                                 do_sample=True,
                                                 do_beam_search=do_beam_search)

                    input_data, final_output, output_records, final_output_name_list, continue_list = create_multi_step_next_input_batch_fn(
                        input_data, model_input, model_output, continue_list,
                        do_beam_search)
                    final_output_list += [final_output]
                    output_records_list += [output_records]

                    continue_list, result_list, cur_error_count_list = compile_code_ids_list(
                        final_output_name_list,
                        continue_list,
                        result_list,
                        vocabulary=vocabulary,
                        includes_list=extract_includes_fn(input_data),
                        file_path=file_path,
                        target_file_path=target_file_path,
                        log_file_path=log_file_path,
                        do_compile_pool=True,
                        need_transform=False)

                    if error_stop_type == 'oracle':
                        reject_list = [
                            True if c and n > o else False
                            for c, o, n in zip(continue_list, error_count_list,
                                               cur_error_count_list)
                        ]
                    elif error_stop_type == 'normal':
                        reject_list = [False for _ in range(batch_size)]
                    error_count_list = [
                        n if n < o and n >= 0 else o
                        for o, n in zip(error_count_list, cur_error_count_list)
                    ]
                    for i_f, rej in enumerate(reject_list):
                        if rej:
                            # use last output
                            final_output_name_list[i_f] = input_data[
                                'last_input_seq_name'][i_f]
                            continue_list[i_f] = False

                    sample_steps = [
                        i + 1 if s == -1 and not c and not r else s for s, c, r
                        in zip(sample_steps, continue_list, reject_list)
                    ]
                    sample_steps = [
                        i if s == -1 and not c and r else s for s, c, r in zip(
                            sample_steps, continue_list, reject_list)
                    ]

                    result_records_list += [result_list]
                    if sum(continue_list) == 0:
                        break
                sample_steps = [
                    max_step_times if s == -1 else s for s in sample_steps
                ]

                if do_save_data:
                    batch_data['input_seq_name'] = batch_data[
                        'final_output_name']
                    save_res_dict = save_addition_data(
                        original_states=batch_data,
                        states=input_data,
                        tokenize_fn=tokenize_fn,
                        batch_size=batch_size,
                        file_path=file_path,
                        target_file_path=target_file_path,
                        vocabulary=vocabulary,
                        max_distande=max_save_distance,
                        only_error=True)
                    for k, v in save_res_dict.items():
                        save_data_dict[k] = save_data_dict.get(k, []) + v

                if save_records_to_database:
                    batch_output_records = change_output_records_to_batch_fn(
                        output_records_list, sample_steps)
                    records_list = create_save_database_records_fn(
                        batch_data, sample_steps, final_output_name_list,
                        result_list, batch_output_records, input_data)
                    save_records_list += records_list

                step_output = 'in evaluate step {}: '.format(steps)
                res = compile_evaluator.add_result(result_list)
                step_output += res
                for evaluator in evaluate_obj_list:
                    # customer evaluator interface
                    res = evaluator.add_result(result_list,
                                               batch_data=batch_data)
                    step_output += res
                # print(step_output)
                info(step_output)

                if print_output and steps % 1 == 0:
                    print_output_fn(output_records=output_records_list,
                                    final_output=final_output_list,
                                    batch_data=batch_data,
                                    step_i=steps,
                                    vocabulary=vocabulary,
                                    compile_result_list=result_records_list)

                steps += 1
                pbar.update(batch_size)
    evaluate_obj_list = [compile_evaluator] + evaluate_obj_list

    if save_records_to_database:
        create_table(db_path,
                     DATA_RECORDS_DEEPFIX,
                     replace_table_name=table_name)
        run_sql_statment(db_path,
                         DATA_RECORDS_DEEPFIX,
                         'insert_ignore',
                         save_records_list,
                         replace_table_name=table_name)

    if steps == 0:
        t_loss = 0
    else:
        t_loss = (total_loss / steps).item()
    return evaluate_obj_list, t_loss, save_data_dict
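
The '# customer evaluator interface' comment implies that an evaluator only needs clear_result() and an add_result(result_list, batch_data=...) method that returns a short report string. A hedged sketch of such an evaluator (the class name and metric are illustrative):

# Hypothetical evaluator matching the interface multi_step_evaluate calls.
class CompileSuccessRateEvaluator(object):
    def __init__(self):
        self.clear_result()

    def clear_result(self):
        self.success = 0
        self.total = 0

    def add_result(self, result_list, batch_data=None):
        self.success += sum(1 for r in result_list if r)
        self.total += len(result_list)
        return 'compile success {}/{}'.format(self.success, self.total)
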
Example #20
    test_dataset = IterateErrorDataSet(
        df,
        vocabulary,
        'deepfix',
        transformer_vocab_slk=mask_transformer,
        do_flatten=do_flatten,
        use_ast=use_ast,
        do_multi_step_sample=do_multi_step_sample)
    info_output = "There are {} parsed data in the deepfix dataset".format(
        len(test_dataset))
    print(info_output)
    return None, None, test_dataset, None


if __name__ == '__main__':
    vocab = create_deepfix_common_error_vocabulary(
        begin_tokens=['<BEGIN>', '<INNER_BEGIN>'],
        end_tokens=['<END>', '<INNER_END>'],
        unk_token='<UNK>',
        addition_tokens=['<PAD>'])
    tokenize_fn = tokenize_by_clex_fn()
    transformer = TransformVocabularyAndSLK(tokenize_fn=tokenize_fn,
                                            vocab=vocab)
    train_dataset = load_deepfix_ac_code_for_generate_dataset(
        is_debug=True,
        vocabulary=vocab,
        mask_transformer=transformer,
        do_flatten=True,
        use_ast=False)
    print(len(train_dataset))
def create_error_code(code,
                      error_type_list=(5, 1, 4),
                      error_count_range=(1, 5),
                      tokenize_fn=None):
    code_without_include = replace_include_with_blank(code)
    include_lines = extract_include(code)
    include_line_nos = analyse_include_line_no(code, include_lines)

    try:
        if tokenize_fn is None:
            tokenize_fn = tokenize_by_clex_fn()
        code_tokens = tokenize_fn(code_without_include)
        if code_tokens is None or len(code_tokens) > 1000:
            # preprocess_logger.info('code tokens is None: {}'.format(code_without_include))
            preprocess_logger.info('code tokens is None')
            return None, None, None, None, None

    except Exception as e:
        preprocess_logger.info('tokenize code error.')
        return None, None, None, None, None

    error_count = random.randint(*error_count_range)
    action_maplist = create_multi_error(code_without_include, code_tokens,
                                        error_type_list, error_count)
    # action_mapposlist = list(map(lambda x: x.get_ac_pos(), action_maplist))
    error_character_maplist = []

    # ac_code_list = list(code)
    #
    # ac_i = 0
    # err_i = 0

    # def get_action(act_type, ac_pos):
    #     for i in action_maplist:
    #         if act_type == i.act_type and ac_pos == i.ac_pos:
    #             return i
    #     return None

    # for ac_i in range(len(ac_code_list)):
    # while ac_i < len(ac_code_list):
    #     if ac_i in action_mapposlist and get_action(act_type=DELETE, ac_pos=ac_i) != None:
    #         action = get_action(act_type=DELETE, ac_pos=ac_i)
    #         error_character_maplist = fill_blank_to_error_code(error_character_maplist, ac_i, err_i)
    #         err_i += 1
    #
    #         action.err_pos = err_i
    #         ac_i += len(action.from_char)
    #
    #         error_character_maplist = fill_blank_to_error_code(error_character_maplist, ac_i, err_i)
    #         err_i += 1
    #         continue
    #
    #     if ac_i in action_mapposlist and get_action(act_type=INSERT, ac_pos=ac_i) != None:
    #         action = get_action(act_type=INSERT, ac_pos=ac_i)
    #         error_character_maplist = fill_blank_to_error_code(error_character_maplist, ac_i, err_i)
    #         err_i += 1
    #
    #         action.err_pos = err_i
    #         for i in range(len(action.to_char)):
    #             err_item = ERROR_CHARACTER_MAPITEM(act_type=INSERT, from_char=action.to_char[i], err_pos=err_i, ac_pos=ac_i)
    #             error_character_maplist.append(err_item)
    #             err_i += 1
    #
    #         error_character_maplist = fill_blank_to_error_code(error_character_maplist, ac_i, err_i)
    #         err_i += 1
    #
    #     if ac_i in action_mapposlist and get_action(act_type=CHANGE, ac_pos=ac_i) != None:
    #         action = get_action(act_type=CHANGE, ac_pos=ac_i)
    #         error_character_maplist = fill_blank_to_error_code(error_character_maplist, ac_i, err_i)
    #         err_i += 1
    #
    #         action.err_pos = err_i
    #         for i in range(len(action.to_char)):
    #             err_item = ERROR_CHARACTER_MAPITEM(act_type=CHANGE, from_char=action.to_char[i], err_pos=err_i, to_char=action.from_char, ac_pos=ac_i)
    #             err_i += 1
    #             error_character_maplist.append(err_item)
    #         ac_i += len(action.from_char)
    #
    #         error_character_maplist = fill_blank_to_error_code(error_character_maplist, ac_i, err_i)
    #         err_i += 1
    #
    #     else:
    #         err_item = ERROR_CHARACTER_MAPITEM(act_type=STAY, from_char=code[ac_i], err_pos=err_i, to_char=code[ac_i],
    #                                            ac_pos=ac_i)
    #         err_i += 1
    #         error_character_maplist.append(err_item)
    #         ac_i += 1
    #
    # if ac_i in action_mapposlist and get_action(act_type=INSERT, ac_pos=ac_i) != None:
    #     action = get_action(act_type=INSERT, ac_pos=ac_i)
    #     error_character_maplist = fill_blank_to_error_code(error_character_maplist, ac_i, err_i)
    #     err_i += 1
    #
    #     action.err_pos = err_i
    #     for i in range(len(action.to_char)):
    #         err_item = ERROR_CHARACTER_MAPITEM(act_type=INSERT, from_char=action.to_char[i], err_pos=err_i, ac_pos=ac_i)
    #         error_character_maplist.append(err_item)
    #         err_i += 1
    #
    #     error_character_maplist = fill_blank_to_error_code(error_character_maplist, ac_i, err_i)
    #     err_i += 1

    def convert_action_list_to_operation_tuple(one_action):
        val = None
        if one_action.act_type == INSERT or one_action.act_type == CHANGE:
            val = one_action.to_char
        elif one_action.act_type == DELETE:
            val = one_action.from_char
        return [one_action.act_type, one_action.token_pos, val]

    operation_list = [
        convert_action_list_to_operation_tuple(act) for act in action_maplist
    ]
    error_tokens, _ = generate_token_action(operation_list, tokens=code_tokens)
    if error_tokens is None:
        return None, None, None, None, None
    error_code = build_code_string_from_lex_tokens(error_tokens)
    error_lines = error_code.split('\n')
    for name, line_no in zip(include_lines, include_line_nos):
        if error_lines[line_no].strip() == '':
            error_lines[line_no] = name
        else:
            # preprocess_logger.info('tokens: {}'.format(error_tokens))
            # preprocess_logger.info('code: {}'.format(error_code))
            # preprocess_logger.info('extract include: {}'.format(include_lines))
            preprocess_logger.info(
                'extract include lineno: {}'.format(include_line_nos))
            preprocess_logger.info('add include error: {}'.format(
                error_lines[line_no]))
    error_code = '\n'.join(error_lines)
    # error_code = ''.join(list(map(lambda x: x.from_char, error_character_maplist)))

    return code, error_code, action_maplist, error_character_maplist, error_count
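
create_error_code depends on replace_include_with_blank, extract_include and analyse_include_line_no to keep #include directives out of the token stream and restore them afterwards. A hypothetical combined version of that idea (not the project's implementation) is sketched below:

# Hypothetical sketch: blank out #include lines and remember them with their line numbers.
def split_includes_sketch(code):
    includes, line_nos, kept_lines = [], [], []
    for no, line in enumerate(code.split('\n')):
        if line.strip().startswith('#include'):
            includes.append(line)
            line_nos.append(no)
            kept_lines.append('')             # keep line numbering stable
        else:
            kept_lines.append(line)
    return '\n'.join(kept_lines), includes, line_nos
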
Example #22
def create_default_graph(token_list):
    from common.pycparser_util import tokenize_by_clex_fn
    tokenize_fn = tokenize_by_clex_fn()
    tokens = tokenize_fn("\n" + " ".join(token_list))
    return CodeGraph(tokens, [(0, 1)], add_sequence_link=ast_config()['add_sequence_link'])
Example #23
def load_common_error_data(addition_infomation=False, data_type=None):
    if data_type == 'deepfix':
        vocab = create_deepfix_common_error_vocabulary(
            begin_tokens=['<BEGIN>', '<INNER_BEGIN>'],
            end_tokens=['<END>', '<INNER_END>'],
            unk_token='<UNK>',
            addition_tokens=['<PAD>'])

        train, vaild, test = read_fake_common_deepfix_error_dataset_with_limit_length(
            500)
        train = convert_c_code_fields_to_cpp_fields(train,
                                                    convert_include=False)
        vaild = convert_c_code_fields_to_cpp_fields(vaild,
                                                    convert_include=False)
        test = convert_c_code_fields_to_cpp_fields(test, convert_include=False)
    else:
        vocab = create_common_error_vocabulary(begin_tokens=['<BEGIN>'],
                                               end_tokens=['<END>'],
                                               unk_token='<UNK>',
                                               addition_tokens=['<GAP>'])
        train, vaild, test = read_fake_common_c_error_dataset_with_limit_length(
            MAX_TOKEN_LENGTH)
        train = convert_c_code_fields_to_cpp_fields(train)
        vaild = convert_c_code_fields_to_cpp_fields(vaild)
        test = convert_c_code_fields_to_cpp_fields(test)

    tokenize_fn = tokenize_by_clex_fn()

    parse_param = [vocab, action_list_sorted, tokenize_fn]
    parse_test_param = [vocab, tokenize_fn]

    train_data = parse_error_tokens_and_action_map(train, 'train',
                                                   *parse_param)
    vaild_data = parse_error_tokens_and_action_map(vaild, 'valid',
                                                   *parse_param)
    test_data = parse_error_tokens_and_action_map(test, 'test', *parse_param)
    # vaild_data = parse_test_tokens(vaild, 'valid', *parse_test_param)
    # test_data = parse_test_tokens(test, 'test', *parse_test_param)

    train = train.loc[train_data[0].index.values]
    vaild = vaild.loc[vaild_data[0].index.values]
    test = test.loc[test_data[0].index.values]

    train_dict = {
        'error_code_word_id': train_data[0],
        'ac_code_word_id': train_data[1],
        'token_map': train_data[2],
        'error_mask': train_data[3],
        'includes': train['includes'],
        'is_copy': train_data[4],
        'pointer_map': train_data[5],
        'distance': train_data[6],
        'error_code_word': train_data[7]
    }
    valid_dict = {
        'error_code_word_id': vaild_data[0],
        'ac_code_word_id': vaild_data[1],
        'token_map': vaild_data[2],
        'error_mask': vaild_data[3],
        'includes': vaild['includes'],
        'is_copy': vaild_data[4],
        'pointer_map': vaild_data[5],
        'distance': vaild_data[6],
        'error_code_word': vaild_data[7]
    }
    test_dict = {
        'error_code_word_id': test_data[0],
        'ac_code_word_id': test_data[1],
        'token_map': test_data[2],
        'error_mask': test_data[3],
        'includes': test['includes'],
        'is_copy': test_data[4],
        'pointer_map': test_data[5],
        'distance': test_data[6],
        'error_code_word': test_data[7]
    }

    if addition_infomation:
        train_dict = add_c_common_code_original_info(data_dict=train_dict,
                                                     df=train)
        valid_dict = add_c_common_code_original_info(data_dict=valid_dict,
                                                     df=vaild)
        test_dict = add_c_common_code_original_info(data_dict=test_dict,
                                                    df=test)

    # valid_dict = {'error_code_word_id': vaild_data, 'includes': vaild['includes']}
    # test_dict = {'error_code_word_id': test_data, 'includes': test['includes']}

    # train_data_set = CCodeErrorDataSet(pd.DataFrame(train_dict), vocab, 'train')
    # valid_data_set = CCodeErrorDataSet(pd.DataFrame(valid_dict), vocab, 'all_valid')
    # test_data_set = CCodeErrorDataSet(pd.DataFrame(test_dict), vocab, 'all_test')

    return train_dict, valid_dict, test_dict
Example #24
def load_common_error_data_sample_with_encoder_copy_100(
        inner_begin_id, inner_end_id):
    vocab = create_common_error_vocabulary(begin_tokens=['<BEGIN>'],
                                           end_tokens=['<END>'],
                                           unk_token='<UNK>',
                                           addition_tokens=['<GAP>'])
    train, vaild, test = read_fake_common_c_error_dataset_with_limit_length(
        MAX_TOKEN_LENGTH)
    train = convert_c_code_fields_to_cpp_fields(train)
    vaild = convert_c_code_fields_to_cpp_fields(vaild)
    test = convert_c_code_fields_to_cpp_fields(test)

    train = train.sample(100)
    vaild = vaild.sample(100)
    test = test.sample(100)

    tokenize_fn = tokenize_by_clex_fn()

    parse_param = [
        vocab, action_list_sorted, tokenize_fn, inner_begin_id, inner_end_id
    ]
    parse_test_param = [vocab, tokenize_fn]

    train_data = parse_error_tokens_and_action_map_encoder_copy(
        train, 'train', *parse_param)
    vaild_data = parse_error_tokens_and_action_map_encoder_copy(
        vaild, 'valid', *parse_param)
    test_data = parse_error_tokens_and_action_map_encoder_copy(
        test, 'test', *parse_param)
    # vaild_data = parse_test_tokens(vaild, 'valid', *parse_test_param)
    # test_data = parse_test_tokens(test, 'test', *parse_test_param)

    train = train.loc[train_data[0].index.values]
    vaild = vaild.loc[vaild_data[0].index.values]
    test = test.loc[test_data[0].index.values]

    train_dict = {
        'error_code_word_id': train_data[0],
        'ac_code_word_id': train_data[1],
        'token_map': train_data[2],
        'error_mask': train_data[3],
        'includes': train['includes'],
        'is_copy': train_data[4],
        'distance': train_data[5],
        'ac_code_target_id': train_data[6],
        'ac_code_target': train_data[7]
    }
    valid_dict = {
        'error_code_word_id': vaild_data[0],
        'ac_code_word_id': vaild_data[1],
        'token_map': vaild_data[2],
        'error_mask': vaild_data[3],
        'includes': vaild['includes'],
        'is_copy': vaild_data[4],
        'distance': vaild_data[5],
        'ac_code_target_id': vaild_data[6],
        'ac_code_target': vaild_data[7]
    }
    test_dict = {
        'error_code_word_id': test_data[0],
        'ac_code_word_id': test_data[1],
        'token_map': test_data[2],
        'error_mask': test_data[3],
        'includes': test['includes'],
        'is_copy': test_data[4],
        'distance': test_data[5],
        'ac_code_target_id': test_data[6],
        'ac_code_target': test_data[7]
    }
    # valid_dict = {'error_code_word_id': vaild_data, 'includes': vaild['includes']}
    # test_dict = {'error_code_word_id': test_data, 'includes': test['includes']}

    # train_data_set = CCodeErrorDataSet(pd.DataFrame(train_dict), vocab, 'train')
    # valid_data_set = CCodeErrorDataSet(pd.DataFrame(valid_dict), vocab, 'all_valid')
    # test_data_set = CCodeErrorDataSet(pd.DataFrame(test_dict), vocab, 'all_test')
    # print(train_data[0])

    return train_dict, valid_dict, test_dict
Example #25
def load_fake_deepfix_dataset_iterate_error_data(do_flatten=False,
                                                 merge_action=True,
                                                 sequence_output=False):
    vocab = load_deepfix_common_error_vocabulary()

    train, valid, test = read_fake_common_deepfix_error_dataset_with_limit_length(
        500)

    train = convert_c_code_fields_to_cpp_fields(train, convert_include=False)
    valid = convert_c_code_fields_to_cpp_fields(valid, convert_include=False)
    test = convert_c_code_fields_to_cpp_fields(test, convert_include=False)

    tokenize_fn = tokenize_by_clex_fn()
    parse_fn = parse_iterative_sample_action_error_code
    parse_param = [
        vocab, action_list_sorted_no_reverse, tokenize_fn, merge_action,
        sequence_output
    ]

    train_data = parse_fn(train, 'train', *parse_param)
    valid_data = parse_fn(valid, 'valid', *parse_param)
    test_data = parse_fn(test, 'test', *parse_param)

    train = train.loc[train_data[0].index.values]
    valid = valid.loc[valid_data[0].index.values]
    test = test.loc[test_data[0].index.values]

    train_dict = {
        'error_token_id_list': train_data[0],
        'sample_error_id_list': train_data[1],
        'sample_ac_id_list': train_data[2],
        'ac_pos_list': train_data[3],
        'error_pos_list': train_data[4],
        'includes': train['includes'],
        'distance': train['distance'],
        'ac_code_ids': train_data[5],
        'is_copy_list': train_data[6],
        'copy_pos_list': train_data[7],
        'sample_mask_list': train_data[8],
        'error_token_name_list': train_data[9],
        'id': train['id'],
        'target_ac_token_id_list': train_data[10],
        'ac_code_name_with_labels': train_data[11]
    }
    valid_dict = {
        'error_token_id_list': valid_data[0],
        'sample_error_id_list': valid_data[1],
        'sample_ac_id_list': valid_data[2],
        'ac_pos_list': valid_data[3],
        'error_pos_list': valid_data[4],
        'includes': valid['includes'],
        'distance': valid['distance'],
        'ac_code_ids': valid_data[5],
        'is_copy_list': valid_data[6],
        'copy_pos_list': valid_data[7],
        'sample_mask_list': valid_data[8],
        'error_token_name_list': valid_data[9],
        'id': valid['id'],
        'target_ac_token_id_list': valid_data[10],
        'ac_code_name_with_labels': valid_data[11]
    }
    test_dict = {
        'error_token_id_list': test_data[0],
        'sample_error_id_list': test_data[1],
        'sample_ac_id_list': test_data[2],
        'ac_pos_list': test_data[3],
        'error_pos_list': test_data[4],
        'includes': test['includes'],
        'distance': test['distance'],
        'ac_code_ids': test_data[5],
        'is_copy_list': test_data[6],
        'copy_pos_list': test_data[7],
        'sample_mask_list': test_data[8],
        'error_token_name_list': test_data[9],
        'id': test['id'],
        'target_ac_token_id_list': test_data[10],
        'ac_code_name_with_labels': test_data[11]
    }

    if do_flatten:
        train_dict = flatten_iterative_data(train_dict)
        valid_dict = flatten_iterative_data(valid_dict)
        test_dict = flatten_iterative_data(test_dict)

    return train_dict, valid_dict, test_dict
Example #26
def multi_step_evaluate(model,
                        dataset,
                        batch_size,
                        parse_input_batch_data_fn,
                        parse_target_batch_data_fn,
                        do_sample=False,
                        print_output=False,
                        create_output_ids_fn=None,
                        evaluate_obj_list=[],
                        expand_output_and_target_fn=None,
                        max_step_times=0,
                        vocabulary=None,
                        file_path='',
                        create_multi_step_next_input_batch_fn=None,
                        extract_includes_fn=lambda x: x['includes'],
                        print_output_fn=None,
                        do_beam_search=False,
                        target_file_path='main.out',
                        do_save_data=False,
                        max_save_distance=None,
                        save_records_to_database=False,
                        db_path='',
                        table_name='',
                        change_output_records_to_batch_fn=None,
                        create_save_database_records_fn=None):
    total_loss = to_cuda(torch.Tensor([0]))
    total_batch = to_cuda(torch.Tensor([0]))
    steps = 0
    compile_evaluator = CompileResultEvaluate()
    compile_evaluator.clear_result()
    for o in evaluate_obj_list:
        o.clear_result()

    model.eval()

    from common.pycparser_util import tokenize_by_clex_fn
    tokenize_fn = tokenize_by_clex_fn()

    # file_path = add_pid_to_file_path(file_path)
    # target_file_path = add_pid_to_file_path(target_file_path)

    with tqdm(total=len(dataset)) as pbar:
        with torch.no_grad():
            for batch_data in data_loader(dataset,
                                          batch_size=batch_size,
                                          drop_last=False):
                model.zero_grad()

                input_data = batch_data.copy()
                final_output_list = []
                output_records_list = []
                continue_list = [True for _ in range(batch_size)]
                result_list = [False for _ in range(batch_size)]
                result_records_list = []
                sample_steps = [-1 for _ in range(batch_size)]

                for i in range(max_step_times):
                    model_input = parse_input_batch_data_fn(input_data,
                                                            do_sample=True)

                    model_output = model.forward(*model_input,
                                                 do_sample=True,
                                                 do_beam_search=do_beam_search)

                    input_data, final_output, output_records, final_output_name_list, continue_list = create_multi_step_next_input_batch_fn(
                        input_data, model_input, model_output, continue_list,
                        do_beam_search)
                    final_output_list += [final_output]
                    output_records_list += [output_records]

                    continue_list, result_list = compile_code_ids_list(
                        final_output_name_list,
                        continue_list,
                        result_list,
                        vocabulary=vocabulary,
                        includes_list=extract_includes_fn(input_data),
                        file_path=file_path,
                        target_file_path=target_file_path,
                        do_compile_pool=True,
                        need_transform=False)
                    sample_steps = [
                        i + 1 if s == -1 and not c else s
                        for s, c in zip(sample_steps, continue_list)
                    ]

                    result_records_list += [result_list]
                    if sum(continue_list) == 0:
                        break
                sample_steps = [
                    max_step_times if s == -1 else s for s in sample_steps
                ]

                step_output = 'in evaluate step {}: '.format(steps)
                res = compile_evaluator.add_result(result_list)
                step_output += res
                for evaluator in evaluate_obj_list:
                    # customer evaluator interface
                    res = evaluator.add_result(result_list,
                                               batch_data=batch_data)
                    step_output += res
                # print(step_output)
                info(step_output)

                if print_output and steps % 1 == 0:
                    print_output_fn(output_records=output_records_list,
                                    final_output=final_output_list,
                                    batch_data=batch_data,
                                    step_i=steps,
                                    vocabulary=vocabulary,
                                    compile_result_list=result_records_list)

                steps += 1
                pbar.update(batch_size)
    evaluate_obj_list = [compile_evaluator] + evaluate_obj_list

    t_loss = (total_loss / steps).item() if steps != 0 else 0
    return evaluate_obj_list, t_loss