def load_deepfix_ac_data_for_generator(filter_unk=False):
    vocab = create_deepfix_common_error_vocabulary(
        begin_tokens=['<BEGIN>', '<INNER_BEGIN>'],
        end_tokens=['<END>', '<INNER_END>'],
        unk_token='<UNK>',
        addition_tokens=['<PAD>'])
    df = read_deepfix_ac_data()
    df = convert_deepfix_to_c_code(df)
    tokenize_fn = tokenize_by_clex_fn()
    parse_test_param = [vocab, tokenize_fn, True]
    df_data = parse_test_tokens(df, 'deepfix', *parse_test_param)
    df_data = list(df_data)
    if filter_unk:
        unk_id = vocab.word_to_id(vocab.unk)
        df_data[0] = df_data[0][df_data[0].map(lambda x: unk_id not in x)]
    df = df.loc[df_data[0].index.values]
    deepfix_dict = {
        'ac_token_id_list': df_data[0],
        'includes': df['includes'],
        'distance': df['errorcount'],
        'ac_token_name_list': df_data[1],
    }
    return deepfix_dict

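# Hypothetical usage sketch (not part of the original code): it relies only on the keys
# returned by load_deepfix_ac_data_for_generator above, where each value is a pandas
# Series aligned on the DeepFix record index.
def _example_load_deepfix_ac_data():
    deepfix_data = load_deepfix_ac_data_for_generator(filter_unk=True)
    print(sorted(deepfix_data.keys()))
    print('records kept after <UNK> filtering: {}'.format(len(deepfix_data['ac_token_id_list'])))
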
def load_deepfix_masked_datadict(sample_count=None):
    vocab = load_deepfix_common_error_vocabulary()
    train, valid, test = read_fake_common_deepfix_error_dataset_with_limit_length(500)
    tokenize_fn = tokenize_by_clex_fn()
    position_dict = read_deepfix_masked_position()

    def prepare_df(df):
        if sample_count is not None and sample_count > 0:
            df = df.sample(sample_count)
        df = convert_c_code_fields_to_cpp_fields(df, convert_include=False)
        df = add_masked_position_column(df, position_dict)
        return df

    train = prepare_df(train)
    valid = prepare_df(valid)
    test = prepare_df(test)

    parse_param = (vocab, tokenize_fn)
    train_data = parse_masked_code(train, *parse_param)
    valid_data = parse_masked_code(valid, *parse_param)
    test_data = parse_masked_code(test, *parse_param)

    train_dict = {**train_data, 'includes': train['includes'], 'id': train['id'],
                  'masked_positions': train['masked_positions']}
    valid_dict = {**valid_data, 'includes': valid['includes'], 'id': valid['id'],
                  'masked_positions': valid['masked_positions']}
    test_dict = {**test_data, 'includes': test['includes'], 'id': test['id'],
                 'masked_positions': test['masked_positions']}
    return train_dict, valid_dict, test_dict

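# Hypothetical usage sketch (not part of the original code): loads a small masked sample
# and checks that ids and masked positions line up, using only the keys built above.
def _example_load_masked_datadict():
    train_dict, valid_dict, test_dict = load_deepfix_masked_datadict(sample_count=100)
    assert len(train_dict['id']) == len(train_dict['masked_positions'])
    print('train/valid/test sizes: {} / {} / {}'.format(
        len(train_dict['id']), len(valid_dict['id']), len(test_dict['id'])))
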
def preprocess_code(code, cpp_file_path=COMPILE_TMP_PATH, tokenize_fn=None):
    # The original code must compile both before and after normalization.
    if not compile_c_code_by_gcc(code, cpp_file_path):
        return None, None, None, None, None
    code = init_code(code)
    if not compile_c_code_by_gcc(code, cpp_file_path):
        return None, None, None, None, None

    before_code = code
    after_code = before_code
    error_count_range = (1, 9)
    if tokenize_fn is None:
        tokenize_fn = tokenize_by_clex_fn()

    count = 0
    action_maplist = []
    error_character_maplist = []
    error_count = -1
    # Keep injecting errors until the mutated code no longer compiles.
    while compile_c_code_by_gcc(after_code, cpp_file_path):
        cod = before_code
        # cod = remove_blank(cod)
        # cod = remove_comments(cod)
        # cod = remove_blank_line(cod)
        count += 1
        # before_code = cod
        before_code, after_code, action_maplist, error_character_maplist, error_count = create_error_code(
            cod, error_count_range=error_count_range, tokenize_fn=tokenize_fn)
        if before_code is None:
            return None, None, None, None, None
        if count > compile_max_count:
            return None, None, None, None, None
    return before_code, after_code, action_maplist, error_character_maplist, error_count

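# Hypothetical usage sketch (not part of the original code): runs the error-injection
# pipeline above on one compilable program; the '/tmp/check.c' scratch path and the
# sample source are assumptions for illustration only.
def _example_preprocess_code():
    source = '#include <stdio.h>\nint main(){ printf("hi"); return 0; }\n'
    before_code, after_code, actions, char_maps, error_count = preprocess_code(
        source, cpp_file_path='/tmp/check.c')
    if before_code is None:
        print('preprocess failed (did not compile or exceeded compile_max_count)')
    else:
        print('injected {} errors'.format(error_count))
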
def __init__(self, vocabulary, mask_language_model_param, detect_token_model_param,
             train_type, ignore_id, pad_id, check_error_task, only_predict_masked=False):
    '''
    :param vocabulary:
    :param mask_language_model_param:
    :param detect_token_model_param:
    :param train_type: 'gene', 'only_disc', 'both', 'none'
    '''
    super().__init__()
    self.vocabulary = vocabulary
    self.generator = MaskedLanguageModel(**mask_language_model_param)
    self.discriminator = ErrorDetectorModel(**detect_token_model_param)
    self.ignore_id = ignore_id
    self.pad_id = pad_id
    self.check_error_task = check_error_task
    self.only_predict_masked = only_predict_masked
    from common.pycparser_util import tokenize_by_clex_fn
    self.tokenize_fn = tokenize_by_clex_fn()
    self.train_type = ''
    self.change_model_train_type(train_type)

def tokenize_code(df):
    tokenize_fn = tokenize_by_clex_fn()
    df['similar_tokenize'] = df['similar_code_without_include'].map(tokenize_fn)
    df = df[df['similar_tokenize'].map(lambda x: x is not None)]
    df['sample_tokenize'] = df['sample_code'].map(tokenize_fn)
    df = df[df['sample_tokenize'].map(lambda x: x is not None)]
    return df

def read_fake_random_c_error_dataset_with_limit_length(limit_length=500):
    dfs = read_fake_random_c_error_dataset()
    tokenize_fn = tokenize_by_clex_fn()
    train, valid, test = [filter_length(df, limit_length, tokenize_fn) for df in dfs]
    return train, valid, test

def load_fake_deepfix_dataset_iterate_error_data(is_debug=False):
    vocab = create_deepfix_common_error_vocabulary(
        begin_tokens=['<BEGIN>', '<INNER_BEGIN>'],
        end_tokens=['<END>', '<INNER_END>'],
        unk_token='<UNK>',
        addition_tokens=['<PAD>'])
    train, valid, test = read_fake_common_deepfix_error_dataset_with_limit_length(500)
    if is_debug:
        train = train.sample(100)
        valid = valid.sample(100)
        test = test.sample(100)
    tokenize_fn = tokenize_by_clex_fn()
    parse_fn = parse_xy_sequence
    add_begin_end_label = True
    parse_param = [vocab, tokenize_fn, add_begin_end_label]
    train_data = parse_fn(train, 'train', *parse_param)
    valid_data = parse_fn(valid, 'valid', *parse_param)
    test_data = parse_fn(test, 'test', *parse_param)
    train = train.loc[train_data[0].index.values]
    valid = valid.loc[valid_data[0].index.values]
    test = test.loc[test_data[0].index.values]
    train_dict = {
        'error_token_id_list': train_data[0],
        'error_token_name_list': train_data[1],
        'target_token_id_list': train_data[2],
        'target_token_name_list': train_data[3],
        'includes': train['includes'],
        'distance': train['distance'],
        'id': train['id'],
    }
    valid_dict = {
        'error_token_id_list': valid_data[0],
        'error_token_name_list': valid_data[1],
        'target_token_id_list': valid_data[2],
        'target_token_name_list': valid_data[3],
        'includes': valid['includes'],
        'distance': valid['distance'],
        'id': valid['id'],
    }
    test_dict = {
        'error_token_id_list': test_data[0],
        'error_token_name_list': test_data[1],
        'target_token_id_list': test_data[2],
        'target_token_name_list': test_data[3],
        'includes': test['includes'],
        'distance': test['distance'],
        'id': test['id'],
    }
    return train_dict, valid_dict, test_dict

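# Hypothetical usage sketch (not part of the original code): loads the debug-sized split
# built above and inspects how error and target token sequences are paired per record,
# assuming each Series entry is a list of token names.
def _example_iterate_error_data():
    train_dict, valid_dict, test_dict = load_fake_deepfix_dataset_iterate_error_data(is_debug=True)
    print('first record id: {}'.format(train_dict['id'].iloc[0]))
    print('error tokens:  {}'.format(train_dict['error_token_name_list'].iloc[0][:10]))
    print('target tokens: {}'.format(train_dict['target_token_name_list'].iloc[0][:10]))
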
def read_filter_without_include_ac_token():
    train_df, _, _ = read_fake_common_c_error_dataset_with_limit_length(500)
    transform_lextoken_to_token_fn = lambda token_list: [i.value for i in token_list]
    tokenize_fn = tokenize_by_clex_fn()
    parse_tokens = [transform_lextoken_to_token_fn(tokenize_fn(code))
                    for code in train_df['similar_code']]
    return parse_tokens

def get_deepfix_train_error_tokens_without_includes():
    train_df, _, _ = read_fake_common_deepfix_error_dataset_with_limit_length(500)
    transform_lextoken_to_token_fn = lambda token_list: [i.value for i in token_list]
    tokenize_fn = tokenize_by_clex_fn()
    parse_tokens = [transform_lextoken_to_token_fn(tokenize_fn(code))
                    for code in train_df['code']]
    return parse_tokens

def read_fake_common_deepfix_error_dataset_with_limit_length(limit_length=500, random_seed=100):
    data_df = read_fake_deepfix_common_error_records()
    tokenize_fn = tokenize_by_clex_fn()
    data_df = filter_length(data_df, limit_length, tokenize_fn)
    print('after filter code length: {}'.format(len(data_df)))
    valid_df = data_df.sample(frac=0.05, random_state=random_seed)
    data_df = data_df.drop(valid_df.index)
    test_df = data_df.sample(frac=0.05, random_state=random_seed)
    train_df = data_df.drop(test_df.index)
    return train_df, valid_df, test_df

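# Hypothetical usage sketch (not part of the original code): the function above draws a
# 5% validation sample, drops it, then draws a 5% test sample from the remainder, so the
# final split is roughly 90.25% / 5% / 4.75% of the length-filtered records.
def _example_check_deepfix_split():
    train_df, valid_df, test_df = read_fake_common_deepfix_error_dataset_with_limit_length(500)
    total = len(train_df) + len(valid_df) + len(test_df)
    print('train/valid/test: {} / {} / {} (total {})'.format(
        len(train_df), len(valid_df), len(test_df), total))
    assert set(train_df.index).isdisjoint(valid_df.index)
    assert set(train_df.index).isdisjoint(test_df.index)
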
def sample_masked_position_main():
    data_df = read_fake_deepfix_common_error_records()
    data_df = convert_c_code_fields_to_cpp_fields(data_df)
    tokenize_fn = tokenize_by_clex_fn()
    data_df = tokenize_ac_code(data_df, tokenize_fn)
    data_df['ac_code_length'] = data_df['ac_code_name'].map(len)
    data_df['masked_positions'] = data_df['ac_code_length'].map(
        lambda l: random_position(l, frac=0.4))
    data_df['masked_positions_token'] = data_df.apply(
        lambda one: [one['ac_code_name'][pos] for pos in one['masked_positions']], axis=1)
    data_dict = {one['id']: (one['masked_positions'], one['masked_positions_token'])
                 for i, one in data_df.iterrows()}
    # data_dict = {i: (masked_poses, masked_toks) for i, masked_poses, masked_toks in zip(
    #     data_df['id'].tolist(), data_df['masked_positions'].tolist(), data_df['masked_tokens'].tolist())}
    save_sample_masked_position_dict(data_dict, save_path=deepfix_masked_position_path)

def main():
    begin_tokens = ['<BEGIN>']
    end_tokens = ['<END>']
    unk_token = '<UNK>'
    addition_tokens = ['<GAP>']
    vocabulary = create_common_error_vocabulary(
        begin_tokens=begin_tokens, end_tokens=end_tokens,
        unk_token=unk_token, addition_tokens=addition_tokens)
    tokenize_fn = tokenize_by_clex_fn()
    transformer = TransformVocabularyAndSLK(vocabulary, tokenize_fn)
    code = r'''
int main(){
    int a = 0;
    a = 1
'''
    token_list = tokenize_fn(code)
    print(token_list)
    token_list = iter(token_list)
    t_parser = transformer.create_new_slk_iterator()
    for t, type_id in t_parser:
        print(t)
        print(type_id)
        try:
            tt = next(token_list)
            print(tt)
            t_parser.add_token(tt)
        except StopIteration:
            break
    t_parser_1 = copy.deepcopy(t_parser)
    t_parser_2 = copy.deepcopy(t_parser)
    t_parser_1.add_token(transformer.id_to_token_dict[transformer.vocab.word_to_id('+')])
    t, type_id = next(t_parser_1)
    print('t1', t)
    print('t1', type_id)
    t_parser_2.add_token(transformer.id_to_token_dict[transformer.vocab.word_to_id(';')])
    t, type_id = next(t_parser_2)
    print('t2', t)
    print('t2', type_id)

def load_grammar_sample_common_error_data():
    """
    Not finished yet.
    :return:
    """
    vocab = create_common_error_vocabulary(begin_tokens=['<BEGIN>'], end_tokens=['<END>'],
                                           unk_token='<UNK>', addition_tokens=['<GAP>'])
    train_df, valid_df, test_df = read_grammar_sample_error_data()
    train_df = convert_c_code_fields_to_cpp_fields(train_df, convert_include=False)
    valid_df = convert_c_code_fields_to_cpp_fields(valid_df, convert_include=False)
    test_df = convert_c_code_fields_to_cpp_fields(test_df, convert_include=False)
    tokenize_fn = tokenize_by_clex_fn()

def read_deepfix_dataset():
    tokenize_fn = tokenize_by_clex_fn()
    vocabulary = create_deepfix_common_error_vocabulary(
        begin_tokens=['<BEGIN>', '<INNER_BEGIN>'],
        end_tokens=['<END>', '<INNER_END>'],
        unk_token='<UNK>',
        addition_tokens=['<PAD>'])
    transformer = TransformVocabularyAndSLK(tokenize_fn=tokenize_fn, vocab=vocabulary)
    train_dataset, valid_dataset, test_dataset, _ = load_deepfix_sample_iterative_dataset(
        is_debug=False, vocabulary=vocabulary, mask_transformer=transformer,
        do_flatten=True, use_ast=True, do_multi_step_sample=False, merge_action=False)
    return train_dataset, valid_dataset, test_dataset

def load_deepfix_error_data():
    vocab = create_common_error_vocabulary(begin_tokens=['<BEGIN>'], end_tokens=['<END>'],
                                           unk_token='<UNK>', addition_tokens=['<GAP>'])
    df = read_deepfix_error_data()
    df = convert_deepfix_to_c_code(df)
    tokenize_fn = tokenize_by_clex_fn()
    parse_test_param = [vocab, tokenize_fn]
    df_data = parse_test_tokens(df, 'deepfix', *parse_test_param)
    df = df.loc[df_data[0].index.values]
    deepfix_dict = {
        'error_code_word_id': df_data[0],
        'includes': df['includes'],
        'distance': df['errorcount'],
        'error_code_word_name': df_data[1],
    }
    return deepfix_dict

def load_generate_code_for_solver_model_iterate_data(df, convert_field_fn=None,
                                                     convert_field_dict={}, do_flatten=False,
                                                     vocabulary=None):
    if convert_field_fn is not None:
        df = convert_field_fn(df, **convert_field_dict)
    df['action_character_list'] = df['action_character_list'].map(convert_action_map_to_old_action)
    tokenize_fn = tokenize_by_clex_fn()
    parse_fn = parse_iterative_sample_action_error_code
    parse_param = [vocabulary, action_list_sorted_no_reverse, tokenize_fn]
    df_data = parse_fn(df, 'train', *parse_param)
    df = df.loc[df_data[0].index.values]
    df_dict = {
        'error_token_id_list': df_data[0],
        'sample_error_id_list': df_data[1],
        'sample_ac_id_list': df_data[2],
        'ac_pos_list': df_data[3],
        'error_pos_list': df_data[4],
        'includes': df['includes'],
        'distance': df['distance'],
        'ac_code_ids': df_data[5],
        'is_copy_list': df_data[6],
        'copy_pos_list': df_data[7],
        'sample_mask_list': df_data[8],
        'error_token_name_list': df_data[9],
        'id': df['id'],
        'target_ac_token_id_list': df_data[10],
        'ac_code_name_with_labels': df_data[11],
    }
    if do_flatten:
        df_dict = flatten_iterative_data(df_dict)
    return df_dict

def load_customer_code_data_for_iterate(df):
    vocab = create_deepfix_common_error_vocabulary(
        begin_tokens=['<BEGIN>', '<INNER_BEGIN>'],
        end_tokens=['<END>', '<INNER_END>'],
        unk_token='<UNK>',
        addition_tokens=['<PAD>'])
    # df = read_deepfix_error_data()
    df = convert_deepfix_to_c_code(df)
    tokenize_fn = tokenize_by_clex_fn()
    parse_test_param = [vocab, tokenize_fn, True]
    df_data = parse_test_tokens(df, 'deepfix', *parse_test_param)
    df = df.loc[df_data[0].index.values]
    deepfix_dict = {
        'error_token_id_list': df_data[0],
        'includes': df['includes'],
        'distance': df['errorcount'],
        'error_token_name_list': df_data[1],
        'id': df['code_id'],
    }
    return deepfix_dict

def make_fake_code(que_read: mp.Queue, que_write: mp.Queue, ind: int):
    preprocess_logger.info('Start Make Fake Code Process {}'.format(ind))
    tmp_code_file_path = os.path.join(COMPILE_TMP_PATH, 'code' + str(ind) + '.c')
    timeout_count = 0
    count = 0
    success_count = 0
    err_count = 0
    fail_count = 0
    repeat_count = 0
    tokenize_fn = tokenize_by_clex_fn()
    while True:
        # Stop the worker after several consecutive read timeouts.
        if timeout_count >= 5:
            break
        if count % 10 == 0:
            preprocess_logger.info(
                "Process {} | count: {} | error_count: {} | fail_count: {} | repeat_count: {}"
                .format(ind, count, err_count, fail_count, repeat_count))
        try:
            item = que_read.get(timeout=600)
        except queue.Empty:
            timeout_count += 1
            continue
        except TimeoutError:
            timeout_count += 1
            continue
        timeout_count = 0
        count += 1
        if not item:
            repeat_count += 1
            que_write.put(None)
            continue
        # item['originalcode'] = item['originalcode'].replace('\ufeff', '').replace('\u3000', ' ')
        try:
            before_code, after_code, action_maplist, error_character_maplist, error_count = preprocess_code(
                item['originalcode'], cpp_file_path=tmp_code_file_path, tokenize_fn=tokenize_fn)
        except Exception as e:
            preprocess_logger.info('error info: ' + str(e))
            before_code = None
            after_code = None
            action_maplist = None
            error_character_maplist = None
            error_count = 1
            count += 1
        if before_code:
            success_count += 1
            item['ac_code'] = before_code
            item['code'] = after_code
            item['error_count'] = error_count
            error_list = list(map(lambda x: x.__dict__(), error_character_maplist))
            action_list = list(map(lambda x: x.__dict__(), action_maplist))
            item['error_character_maplist'] = error_list
            item['action_maplist'] = action_list
            que_write.put(item)
        else:
            # Failed attempts are re-queued until error_max_count is reached.
            item['try_count'] += 1
            if item['try_count'] < error_max_count:
                err_count += 1
                que_read.put(item)
            else:
                fail_count += 1
                que_write.put(None)
    preprocess_logger.info(
        "Process {} | count: {} | error_count: {} | fail_count: {} | repeat_count: {}"
        .format(ind, count, err_count, fail_count, repeat_count))
    preprocess_logger.info('End Make Fake Code Process {}'.format(ind))

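# Hypothetical wiring sketch (not part of the original code): shows how make_fake_code
# workers could be attached to the read/write queues implied by its signature. It assumes
# `import multiprocessing as mp` as in the surrounding module, and that each record dict
# carries an 'originalcode' field; the worker count is an arbitrary choice.
def _example_start_fake_code_workers(records, worker_num=4):
    que_read = mp.Queue()
    que_write = mp.Queue()
    for rec in records:
        rec.setdefault('try_count', 0)
        que_read.put(rec)
    workers = [mp.Process(target=make_fake_code, args=(que_read, que_write, ind))
               for ind in range(worker_num)]
    for w in workers:
        w.start()
    # Every input record eventually produces exactly one entry on que_write
    # (the enriched item on success, or None on repeated failure).
    results = [que_write.get() for _ in range(len(records))]
    for w in workers:
        w.terminate()  # workers would otherwise idle until their queue reads time out
    return [r for r in results if r is not None]
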
def multi_step_evaluate(model, dataset, batch_size, parse_input_batch_data_fn,
                        parse_target_batch_data_fn, do_sample=False, print_output=False,
                        create_output_ids_fn=None, evaluate_obj_list=[],
                        expand_output_and_target_fn=None, max_step_times=0, vocabulary=None,
                        file_path='', create_multi_step_next_input_batch_fn=None,
                        extract_includes_fn=lambda x: x['includes'], print_output_fn=None,
                        do_beam_search=False, target_file_path='main.out',
                        log_file_path='main.log', do_save_data=False, max_save_distance=None,
                        save_records_to_database=False, db_path='', table_name='',
                        change_output_records_to_batch_fn=None,
                        create_save_database_records_fn=None, error_stop_type='normal'):
    total_loss = to_cuda(torch.Tensor([0]))
    total_batch = to_cuda(torch.Tensor([0]))
    steps = 0
    compile_evaluator = CompileResultEvaluate()
    compile_evaluator.clear_result()
    for o in evaluate_obj_list:
        o.clear_result()
    model.eval()

    from common.pycparser_util import tokenize_by_clex_fn
    tokenize_fn = tokenize_by_clex_fn()
    save_data_dict = {}
    save_records_list = []
    # file_path = add_pid_to_file_path(file_path)
    # target_file_path = add_pid_to_file_path(target_file_path)

    with tqdm(total=len(dataset)) as pbar:
        with torch.no_grad():
            for batch_data in data_loader(dataset, batch_size=batch_size, drop_last=False):
                model.zero_grad()
                input_data = batch_data.copy()
                final_output_list = []
                output_records_list = []
                continue_list = [True for _ in range(batch_size)]
                result_list = [False for _ in range(batch_size)]
                result_records_list = []
                sample_steps = [-1 for _ in range(batch_size)]
                error_count_list = batch_data['error_count']

                for i in range(max_step_times):
                    model_input = parse_input_batch_data_fn(input_data, do_sample=True)
                    model_output = model.forward(*model_input, do_sample=True,
                                                 do_beam_search=do_beam_search)
                    input_data, final_output, output_records, final_output_name_list, continue_list = \
                        create_multi_step_next_input_batch_fn(input_data, model_input,
                                                              model_output, continue_list,
                                                              do_beam_search)
                    final_output_list += [final_output]
                    output_records_list += [output_records]

                    continue_list, result_list, cur_error_count_list = compile_code_ids_list(
                        final_output_name_list, continue_list, result_list,
                        vocabulary=vocabulary,
                        includes_list=extract_includes_fn(input_data),
                        file_path=file_path, target_file_path=target_file_path,
                        log_file_path=log_file_path, do_compile_pool=True,
                        need_transform=False)

                    if error_stop_type == 'oracle':
                        # Reject a step if it increased the compile error count.
                        reject_list = [True if c and n > o else False
                                       for c, o, n in zip(continue_list, error_count_list,
                                                          cur_error_count_list)]
                    elif error_stop_type == 'normal':
                        reject_list = [False for _ in range(batch_size)]
                    error_count_list = [n if n < o and n >= 0 else o
                                        for o, n in zip(error_count_list, cur_error_count_list)]

                    for i_f, rej in enumerate(reject_list):
                        if rej:
                            # use last output
                            final_output_name_list[i_f] = input_data['last_input_seq_name'][i_f]
                            continue_list[i_f] = False

                    sample_steps = [i + 1 if s == -1 and not c and not r else s
                                    for s, c, r in zip(sample_steps, continue_list, reject_list)]
                    sample_steps = [i if s == -1 and not c and r else s
                                    for s, c, r in zip(sample_steps, continue_list, reject_list)]

                    result_records_list += [result_list]
                    if sum(continue_list) == 0:
                        break

                sample_steps = [max_step_times if s == -1 else s for s in sample_steps]

                if do_save_data:
                    batch_data['input_seq_name'] = batch_data['final_output_name']
                    save_res_dict = save_addition_data(
                        original_states=batch_data, states=input_data,
                        tokenize_fn=tokenize_fn, batch_size=batch_size,
                        file_path=file_path, target_file_path=target_file_path,
                        vocabulary=vocabulary, max_distande=max_save_distance,
                        only_error=True)
                    for k, v in save_res_dict.items():
                        save_data_dict[k] = save_data_dict.get(k, []) + v

                if save_records_to_database:
                    batch_output_records = change_output_records_to_batch_fn(
                        output_records_list, sample_steps)
                    records_list = create_save_database_records_fn(
                        batch_data, sample_steps, final_output_name_list, result_list,
                        batch_output_records, input_data)
                    save_records_list += records_list

                step_output = 'in evaluate step {}: '.format(steps)
                res = compile_evaluator.add_result(result_list)
                step_output += res
                for evaluator in evaluate_obj_list:
                    # customer evaluator interface
                    res = evaluator.add_result(result_list, batch_data=batch_data)
                    step_output += res
                # print(step_output)
                info(step_output)

                if print_output and steps % 1 == 0:
                    print_output_fn(output_records=output_records_list,
                                    final_output=final_output_list, batch_data=batch_data,
                                    step_i=steps, vocabulary=vocabulary,
                                    compile_result_list=result_records_list)

                steps += 1
                pbar.update(batch_size)

    evaluate_obj_list = [compile_evaluator] + evaluate_obj_list
    if save_records_to_database:
        create_table(db_path, DATA_RECORDS_DEEPFIX, replace_table_name=table_name)
        run_sql_statment(db_path, DATA_RECORDS_DEEPFIX, 'insert_ignore', save_records_list,
                         replace_table_name=table_name)
    if steps == 0:
        t_loss = 0
    else:
        t_loss = (total_loss / steps).item()
    return evaluate_obj_list, t_loss, save_data_dict

    test_dataset = IterateErrorDataSet(df, vocabulary, 'deepfix',
                                       transformer_vocab_slk=mask_transformer,
                                       do_flatten=do_flatten, use_ast=use_ast,
                                       do_multi_step_sample=do_multi_step_sample)
    info_output = "There are {} parsed data in the deepfix dataset".format(len(test_dataset))
    print(info_output)
    return None, None, test_dataset, None


if __name__ == '__main__':
    vocab = create_deepfix_common_error_vocabulary(
        begin_tokens=['<BEGIN>', '<INNER_BEGIN>'],
        end_tokens=['<END>', '<INNER_END>'],
        unk_token='<UNK>',
        addition_tokens=['<PAD>'])
    tokenize_fn = tokenize_by_clex_fn()
    transformer = TransformVocabularyAndSLK(tokenize_fn=tokenize_fn, vocab=vocab)
    train_dataset = load_deepfix_ac_code_for_generate_dataset(
        is_debug=True, vocabulary=vocab, mask_transformer=transformer,
        do_flatten=True, use_ast=False)
    print(len(train_dataset))

def create_error_code(code, error_type_list=(5, 1, 4), error_count_range=(1, 5), tokenize_fn=None):
    code_without_include = replace_include_with_blank(code)
    include_lines = extract_include(code)
    include_line_nos = analyse_include_line_no(code, include_lines)
    try:
        if tokenize_fn is None:
            tokenize_fn = tokenize_by_clex_fn()
        code_tokens = tokenize_fn(code_without_include)
        if code_tokens is None or len(code_tokens) > 1000:
            # preprocess_logger.info('code tokens is None: {}'.format(code_without_include))
            preprocess_logger.info('code tokens is None')
            return None, None, None, None, None
    except Exception as e:
        preprocess_logger.info('tokenize code error.')
        return None, None, None, None, None

    error_count = random.randint(*error_count_range)
    action_maplist = create_multi_error(code_without_include, code_tokens,
                                        error_type_list, error_count)
    # action_mapposlist = list(map(lambda x: x.get_ac_pos(), action_maplist))
    error_character_maplist = []
    # NOTE: a long commented-out block that built error_character_maplist character by
    # character (INSERT/DELETE/CHANGE/STAY items via ERROR_CHARACTER_MAPITEM and
    # fill_blank_to_error_code) previously lived here; the token-level
    # generate_token_action path below replaces it.

    def convert_action_list_to_operation_tuple(one_action):
        val = None
        if one_action.act_type == INSERT or one_action.act_type == CHANGE:
            val = one_action.to_char
        elif one_action.act_type == DELETE:
            val = one_action.from_char
        return [one_action.act_type, one_action.token_pos, val]

    operation_list = [convert_action_list_to_operation_tuple(act) for act in action_maplist]
    error_tokens, _ = generate_token_action(operation_list, tokens=code_tokens)
    if error_tokens is None:
        return None, None, None, None, None
    error_code = build_code_string_from_lex_tokens(error_tokens)

    # Put the extracted #include lines back on their original (now blank) line numbers.
    error_lines = error_code.split('\n')
    for name, line_no in zip(include_lines, include_line_nos):
        if error_lines[line_no].strip() == '':
            error_lines[line_no] = name
        else:
            # preprocess_logger.info('tokens: {}'.format(error_tokens))
            # preprocess_logger.info('code: {}'.format(error_code))
            # preprocess_logger.info('extract include: {}'.format(include_lines))
            preprocess_logger.info('extract include lineno: {}'.format(include_line_nos))
            preprocess_logger.info('add include error: {}'.format(error_lines[line_no]))
    error_code = '\n'.join(error_lines)
    # error_code = ''.join(list(map(lambda x: x.from_char, error_character_maplist)))
    return code, error_code, action_maplist, error_character_maplist, error_count

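# Hypothetical usage sketch (not part of the original code): mutates one compilable C
# program with create_error_code above and prints the corrupted version; the sample
# source is an assumption for illustration only.
def _example_create_error_code():
    source = '#include <stdio.h>\nint main(){ int a = 0; return a; }\n'
    ac_code, error_code, action_maplist, _, error_count = create_error_code(
        source, error_count_range=(1, 3))
    if error_code is None:
        print('error injection failed')
        return
    print('injected {} edits'.format(error_count))
    print(error_code)
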
def create_default_graph(token_list):
    from common.pycparser_util import tokenize_by_clex_fn
    tokenize_fn = tokenize_by_clex_fn()
    tokens = tokenize_fn("\n" + " ".join(token_list))
    return CodeGraph(tokens, [(0, 1)], add_sequence_link=ast_config()['add_sequence_link'])

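# Hypothetical usage sketch (not part of the original code): create_default_graph above
# re-lexes the space-joined token names, so a plain list of token name strings (rather
# than lex token objects) is enough to obtain a CodeGraph.
def _example_default_graph():
    token_names = ['int', 'main', '(', ')', '{', 'return', '0', ';', '}']
    code_graph = create_default_graph(token_names)
    return code_graph
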
def load_common_error_data(addition_infomation=False, data_type=None):
    if data_type == 'deepfix':
        vocab = create_deepfix_common_error_vocabulary(
            begin_tokens=['<BEGIN>', '<INNER_BEGIN>'],
            end_tokens=['<END>', '<INNER_END>'],
            unk_token='<UNK>',
            addition_tokens=['<PAD>'])
        train, valid, test = read_fake_common_deepfix_error_dataset_with_limit_length(500)
        train = convert_c_code_fields_to_cpp_fields(train, convert_include=False)
        valid = convert_c_code_fields_to_cpp_fields(valid, convert_include=False)
        test = convert_c_code_fields_to_cpp_fields(test, convert_include=False)
    else:
        vocab = create_common_error_vocabulary(begin_tokens=['<BEGIN>'], end_tokens=['<END>'],
                                               unk_token='<UNK>', addition_tokens=['<GAP>'])
        train, valid, test = read_fake_common_c_error_dataset_with_limit_length(MAX_TOKEN_LENGTH)
        train = convert_c_code_fields_to_cpp_fields(train)
        valid = convert_c_code_fields_to_cpp_fields(valid)
        test = convert_c_code_fields_to_cpp_fields(test)

    tokenize_fn = tokenize_by_clex_fn()
    parse_param = [vocab, action_list_sorted, tokenize_fn]
    parse_test_param = [vocab, tokenize_fn]
    train_data = parse_error_tokens_and_action_map(train, 'train', *parse_param)
    valid_data = parse_error_tokens_and_action_map(valid, 'valid', *parse_param)
    test_data = parse_error_tokens_and_action_map(test, 'test', *parse_param)
    # valid_data = parse_test_tokens(valid, 'valid', *parse_test_param)
    # test_data = parse_test_tokens(test, 'test', *parse_test_param)

    train = train.loc[train_data[0].index.values]
    valid = valid.loc[valid_data[0].index.values]
    test = test.loc[test_data[0].index.values]

    train_dict = {
        'error_code_word_id': train_data[0],
        'ac_code_word_id': train_data[1],
        'token_map': train_data[2],
        'error_mask': train_data[3],
        'includes': train['includes'],
        'is_copy': train_data[4],
        'pointer_map': train_data[5],
        'distance': train_data[6],
        'error_code_word': train_data[7],
    }
    valid_dict = {
        'error_code_word_id': valid_data[0],
        'ac_code_word_id': valid_data[1],
        'token_map': valid_data[2],
        'error_mask': valid_data[3],
        'includes': valid['includes'],
        'is_copy': valid_data[4],
        'pointer_map': valid_data[5],
        'distance': valid_data[6],
        'error_code_word': valid_data[7],
    }
    test_dict = {
        'error_code_word_id': test_data[0],
        'ac_code_word_id': test_data[1],
        'token_map': test_data[2],
        'error_mask': test_data[3],
        'includes': test['includes'],
        'is_copy': test_data[4],
        'pointer_map': test_data[5],
        'distance': test_data[6],
        'error_code_word': test_data[7],
    }

    if addition_infomation:
        train_dict = add_c_common_code_original_info(data_dict=train_dict, df=train)
        valid_dict = add_c_common_code_original_info(data_dict=valid_dict, df=valid)
        test_dict = add_c_common_code_original_info(data_dict=test_dict, df=test)

    # valid_dict = {'error_code_word_id': valid_data, 'includes': valid['includes']}
    # test_dict = {'error_code_word_id': test_data, 'includes': test['includes']}
    # train_data_set = CCodeErrorDataSet(pd.DataFrame(train_dict), vocab, 'train')
    # valid_data_set = CCodeErrorDataSet(pd.DataFrame(valid_dict), vocab, 'all_valid')
    # test_data_set = CCodeErrorDataSet(pd.DataFrame(test_dict), vocab, 'all_test')
    return train_dict, valid_dict, test_dict

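# Hypothetical usage sketch (not part of the original code): loads the token-map style
# training dicts built above for the DeepFix variant and checks that per-record fields
# are aligned, relying only on the keys defined in load_common_error_data.
def _example_load_common_error_data():
    train_dict, valid_dict, test_dict = load_common_error_data(data_type='deepfix')
    n = len(train_dict['error_code_word_id'])
    assert n == len(train_dict['ac_code_word_id']) == len(train_dict['token_map'])
    print('train/valid/test sizes: {} / {} / {}'.format(
        n, len(valid_dict['error_code_word_id']), len(test_dict['error_code_word_id'])))
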
def load_common_error_data_sample_with_encoder_copy_100(inner_begin_id, inner_end_id):
    vocab = create_common_error_vocabulary(begin_tokens=['<BEGIN>'], end_tokens=['<END>'],
                                           unk_token='<UNK>', addition_tokens=['<GAP>'])
    train, valid, test = read_fake_common_c_error_dataset_with_limit_length(MAX_TOKEN_LENGTH)
    train = convert_c_code_fields_to_cpp_fields(train)
    valid = convert_c_code_fields_to_cpp_fields(valid)
    test = convert_c_code_fields_to_cpp_fields(test)
    train = train.sample(100)
    valid = valid.sample(100)
    test = test.sample(100)

    tokenize_fn = tokenize_by_clex_fn()
    parse_param = [vocab, action_list_sorted, tokenize_fn, inner_begin_id, inner_end_id]
    parse_test_param = [vocab, tokenize_fn]
    train_data = parse_error_tokens_and_action_map_encoder_copy(train, 'train', *parse_param)
    valid_data = parse_error_tokens_and_action_map_encoder_copy(valid, 'valid', *parse_param)
    test_data = parse_error_tokens_and_action_map_encoder_copy(test, 'test', *parse_param)
    # valid_data = parse_test_tokens(valid, 'valid', *parse_test_param)
    # test_data = parse_test_tokens(test, 'test', *parse_test_param)

    train = train.loc[train_data[0].index.values]
    valid = valid.loc[valid_data[0].index.values]
    test = test.loc[test_data[0].index.values]

    train_dict = {
        'error_code_word_id': train_data[0],
        'ac_code_word_id': train_data[1],
        'token_map': train_data[2],
        'error_mask': train_data[3],
        'includes': train['includes'],
        'is_copy': train_data[4],
        'distance': train_data[5],
        'ac_code_target_id': train_data[6],
        'ac_code_target': train_data[7],
    }
    valid_dict = {
        'error_code_word_id': valid_data[0],
        'ac_code_word_id': valid_data[1],
        'token_map': valid_data[2],
        'error_mask': valid_data[3],
        'includes': valid['includes'],
        'is_copy': valid_data[4],
        'distance': valid_data[5],
        'ac_code_target_id': valid_data[6],
        'ac_code_target': valid_data[7],
    }
    test_dict = {
        'error_code_word_id': test_data[0],
        'ac_code_word_id': test_data[1],
        'token_map': test_data[2],
        'error_mask': test_data[3],
        'includes': test['includes'],
        'is_copy': test_data[4],
        'distance': test_data[5],
        'ac_code_target_id': test_data[6],
        'ac_code_target': test_data[7],
    }
    # valid_dict = {'error_code_word_id': valid_data, 'includes': valid['includes']}
    # test_dict = {'error_code_word_id': test_data, 'includes': test['includes']}
    # train_data_set = CCodeErrorDataSet(pd.DataFrame(train_dict), vocab, 'train')
    # valid_data_set = CCodeErrorDataSet(pd.DataFrame(valid_dict), vocab, 'all_valid')
    # test_data_set = CCodeErrorDataSet(pd.DataFrame(test_dict), vocab, 'all_test')
    # print(train_data[0])
    return train_dict, valid_dict, test_dict

def load_fake_deepfix_dataset_iterate_error_data(do_flatten=False, merge_action=True,
                                                 sequence_output=False):
    vocab = load_deepfix_common_error_vocabulary()
    train, valid, test = read_fake_common_deepfix_error_dataset_with_limit_length(500)
    train = convert_c_code_fields_to_cpp_fields(train, convert_include=False)
    valid = convert_c_code_fields_to_cpp_fields(valid, convert_include=False)
    test = convert_c_code_fields_to_cpp_fields(test, convert_include=False)

    tokenize_fn = tokenize_by_clex_fn()
    parse_fn = parse_iterative_sample_action_error_code
    parse_param = [vocab, action_list_sorted_no_reverse, tokenize_fn, merge_action, sequence_output]
    train_data = parse_fn(train, 'train', *parse_param)
    valid_data = parse_fn(valid, 'valid', *parse_param)
    test_data = parse_fn(test, 'test', *parse_param)

    train = train.loc[train_data[0].index.values]
    valid = valid.loc[valid_data[0].index.values]
    test = test.loc[test_data[0].index.values]

    train_dict = {
        'error_token_id_list': train_data[0],
        'sample_error_id_list': train_data[1],
        'sample_ac_id_list': train_data[2],
        'ac_pos_list': train_data[3],
        'error_pos_list': train_data[4],
        'includes': train['includes'],
        'distance': train['distance'],
        'ac_code_ids': train_data[5],
        'is_copy_list': train_data[6],
        'copy_pos_list': train_data[7],
        'sample_mask_list': train_data[8],
        'error_token_name_list': train_data[9],
        'id': train['id'],
        'target_ac_token_id_list': train_data[10],
        'ac_code_name_with_labels': train_data[11],
    }
    valid_dict = {
        'error_token_id_list': valid_data[0],
        'sample_error_id_list': valid_data[1],
        'sample_ac_id_list': valid_data[2],
        'ac_pos_list': valid_data[3],
        'error_pos_list': valid_data[4],
        'includes': valid['includes'],
        'distance': valid['distance'],
        'ac_code_ids': valid_data[5],
        'is_copy_list': valid_data[6],
        'copy_pos_list': valid_data[7],
        'sample_mask_list': valid_data[8],
        'error_token_name_list': valid_data[9],
        'id': valid['id'],
        'target_ac_token_id_list': valid_data[10],
        'ac_code_name_with_labels': valid_data[11],
    }
    test_dict = {
        'error_token_id_list': test_data[0],
        'sample_error_id_list': test_data[1],
        'sample_ac_id_list': test_data[2],
        'ac_pos_list': test_data[3],
        'error_pos_list': test_data[4],
        'includes': test['includes'],
        'distance': test['distance'],
        'ac_code_ids': test_data[5],
        'is_copy_list': test_data[6],
        'copy_pos_list': test_data[7],
        'sample_mask_list': test_data[8],
        'error_token_name_list': test_data[9],
        'id': test['id'],
        'target_ac_token_id_list': test_data[10],
        'ac_code_name_with_labels': test_data[11],
    }

    if do_flatten:
        train_dict = flatten_iterative_data(train_dict)
        valid_dict = flatten_iterative_data(valid_dict)
        test_dict = flatten_iterative_data(test_dict)
    return train_dict, valid_dict, test_dict

def multi_step_evaluate(model, dataset, batch_size, parse_input_batch_data_fn,
                        parse_target_batch_data_fn, do_sample=False, print_output=False,
                        create_output_ids_fn=None, evaluate_obj_list=[],
                        expand_output_and_target_fn=None, max_step_times=0, vocabulary=None,
                        file_path='', create_multi_step_next_input_batch_fn=None,
                        extract_includes_fn=lambda x: x['includes'], print_output_fn=None,
                        do_beam_search=False, target_file_path='main.out', do_save_data=False,
                        max_save_distance=None, save_records_to_database=False, db_path='',
                        table_name='', change_output_records_to_batch_fn=None,
                        create_save_database_records_fn=None):
    total_loss = to_cuda(torch.Tensor([0]))
    total_batch = to_cuda(torch.Tensor([0]))
    steps = 0
    compile_evaluator = CompileResultEvaluate()
    compile_evaluator.clear_result()
    for o in evaluate_obj_list:  # fixed: was `evaluate_object_list`, which is undefined here
        o.clear_result()
    model.eval()

    from common.pycparser_util import tokenize_by_clex_fn
    tokenize_fn = tokenize_by_clex_fn()
    # file_path = add_pid_to_file_path(file_path)
    # target_file_path = add_pid_to_file_path(target_file_path)

    with tqdm(total=len(dataset)) as pbar:
        with torch.no_grad():
            for batch_data in data_loader(dataset, batch_size=batch_size, drop_last=False):
                model.zero_grad()
                input_data = batch_data.copy()
                final_output_list = []
                output_records_list = []
                continue_list = [True for _ in range(batch_size)]
                result_list = [False for _ in range(batch_size)]
                result_records_list = []
                sample_steps = [-1 for _ in range(batch_size)]

                for i in range(max_step_times):
                    model_input = parse_input_batch_data_fn(input_data, do_sample=True)
                    model_output = model.forward(*model_input, do_sample=True,
                                                 do_beam_search=do_beam_search)
                    input_data, final_output, output_records, final_output_name_list, continue_list = \
                        create_multi_step_next_input_batch_fn(input_data, model_input,
                                                              model_output, continue_list,
                                                              do_beam_search)
                    final_output_list += [final_output]
                    output_records_list += [output_records]

                    continue_list, result_list = compile_code_ids_list(
                        final_output_name_list, continue_list, result_list,
                        vocabulary=vocabulary,
                        includes_list=extract_includes_fn(input_data),
                        file_path=file_path, target_file_path=target_file_path,
                        do_compile_pool=True, need_transform=False)

                    sample_steps = [i + 1 if s == -1 and not c else s
                                    for s, c in zip(sample_steps, continue_list)]
                    result_records_list += [result_list]
                    if sum(continue_list) == 0:
                        break

                sample_steps = [max_step_times if s == -1 else s for s in sample_steps]

                step_output = 'in evaluate step {}: '.format(steps)
                res = compile_evaluator.add_result(result_list)
                step_output += res
                for evaluator in evaluate_obj_list:
                    # customer evaluator interface
                    res = evaluator.add_result(result_list, batch_data=batch_data)
                    step_output += res
                # print(step_output)
                info(step_output)

                if print_output and steps % 1 == 0:
                    print_output_fn(output_records=output_records_list,
                                    final_output=final_output_list, batch_data=batch_data,
                                    step_i=steps, vocabulary=vocabulary,
                                    compile_result_list=result_records_list)

                steps += 1
                pbar.update(batch_size)

    evaluate_obj_list = [compile_evaluator] + evaluate_obj_list
    t_loss = (total_loss / steps).item() if steps != 0 else 0
    return evaluate_obj_list, t_loss
