def save_c_submit_code(data_df_list):
    """Save C submission records and report how many of them compiled successfully."""
    create_table(COMPILE_SUCCESS_DATA_DBPATH, C_COMPILE_SUCCESS_RECORDS)
    # 1 for a successful gcc compile, 0 otherwise
    result_list = [data_df['gcc_compile_result'].map(lambda x: 1 if x else 0) for data_df in data_df_list]
    count_list = [len(data_df) for data_df in data_df_list]
    success_res = np.sum(result_list)
    count_res = np.sum(count_list)
    print('success_res total: {}, total: {}'.format(success_res, count_res))

    def trans(error_df, reverse_verdict, reverse_langdict):
        res = [transform_data(row, reverse_verdict, reverse_langdict)
               for index, row in error_df.iterrows()]
        return res

    reverse_verdict = reverse_dict(verdict)
    reverse_langdict = reverse_dict(langdict)
    data_items_list = [trans(data_df, reverse_verdict, reverse_langdict) for data_df in data_df_list]
    for data_items in data_items_list:
        insert_items(COMPILE_SUCCESS_DATA_DBPATH, C_COMPILE_SUCCESS_RECORDS, data_items)
def resave_database_main(to_db_path, to_table_name, params_string: dict = {}, params_number: dict = {}):
    """Re-save rows of the scrapyOJ `submit` table, filtered by the given params, into another database."""
    params_s = add_params(params_string, need_quot=True)
    params_n = add_params(params_number, need_quot=False)
    if params_s != '' and params_n != '':
        params = params_s + ' and ' + params_n
    else:
        params = params_s if params_s != '' else params_n
    where_clause = ' where ' + params if params != '' else params

    conn = sqlite3.connect(scrapyOJ_path)
    print('start read sql')
    df = pd.read_sql('select * from {}{}'.format('submit', where_clause), conn)
    print('total df length: {}'.format(len(df)))
    df = df[df['code'].map(lambda x: x != '')]
    print('no empty df length: {}'.format(len(df)))
    df_dict = df.to_dict(orient='list')
    del df
    print('finish filter')

    create_table(to_db_path, to_table_name)
    header_list = ['id', 'submit_url', 'submit_time', 'user_id', 'user_name', 'problem_id', 'problem_url',
                   'problem_name', 'problem_full_name', 'language', 'status', 'error_test_id', 'time',
                   'memory', 'code']
    total_list = [df_dict[key] for key in header_list]
    total_list = list(zip(*total_list))
    print('start save')
    insert_items(to_db_path, to_table_name, total_list)
    print('end save')
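# A minimal usage sketch for resave_database_main. The values below are illustrative only,
# and it assumes add_params renders each key=value pair, quoting values when need_quot=True:
#
#   resave_database_main(
#       to_db_path='data/c_submit.db',            # hypothetical target database
#       to_table_name='submit',
#       params_string={'language': 'GNU C'},      # quoted   -> language="GNU C"
#       params_number={'problem_id': 1},          # unquoted -> problem_id=1
#   )
#
# Under those assumptions this reads `select * from submit where language="GNU C" and problem_id=1`,
# drops rows whose `code` field is empty, and re-saves the listed columns into the target table.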
def save_train_data(error_df_list, ac_df_list, db_path, table_name, transform_fn):
    """Transform each error dataframe row with transform_fn and insert the results into the given table."""
    # NOTE: ac_df_list is accepted but not used in this function.
    create_table(db_path, table_name)

    def trans(error_df):
        res = [transform_fn(row) for index, row in error_df.iterrows()]
        return res

    error_items_list = [trans(error_df) for error_df in error_df_list]
    for error_items in error_items_list:
        insert_items(db_path, table_name, error_items)
def filter_program_id_main(db_path, table_name, new_table_name):
    df = read_experiment_result_df(db_path, table_name)
    grouped = df.groupby('id')
    print('group length: ', len(grouped))
    save_list = []
    for name, group in grouped:
        one = select_best_records(group)
        save_list += [one]
    print('save list length: ', len(save_list))
    create_table(db_path, DATA_RECORDS_DEEPFIX, replace_table_name=new_table_name)
    run_sql_statment(db_path, DATA_RECORDS_DEEPFIX, 'insert_ignore', save_list,
                     replace_table_name=new_table_name)
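# A hedged usage sketch for filter_program_id_main: it keeps one record per program id,
# as chosen by select_best_records, and writes them to a new table. The table names and
# the DATA_RECORDS_DEEPFIX_DBPATH constant below are hypothetical placeholders:
#
#   filter_program_id_main(
#       db_path=DATA_RECORDS_DEEPFIX_DBPATH,
#       table_name='encoder_sample_records',
#       new_table_name='encoder_sample_records_filtered',
#   )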
def preprocess():
    """Read AC C code and feed it to worker processes that generate fake compile-error data."""
    # initLogging()
    preprocess_logger.info("Start Read Code Data")
    code_df = read_distinct_problem_user_ac_c_records_filter_error_code()
    preprocess_logger.info("Code Data Read Finish. Total: {}".format(code_df.shape[0]))

    que_read = mp.Queue()
    que_write = mp.Queue()
    create_table(db_full_path=FAKE_C_COMPILE_ERROR_DATA_DBPATH, table_name=current_table_name)

    # start 6 worker processes that produce fake code and one process that saves the results
    pros = []
    for i in range(6):
        pro = mp.Process(target=make_fake_code, args=(que_read, que_write, i))
        pro.start()
        pros.append(pro)
    save_pro = mp.Process(target=save_fake_code, args=(que_write, code_df.shape[0]))
    save_pro.start()

    count = 0
    ids = []
    items = []
    for index, row in code_df.iterrows():
        count += 1
        item = {'try_count': 0}
        item['id'] = row['id']
        item['submit_url'] = row['submit_url']
        item['problem_id'] = row['problem_id']
        item['user_id'] = row['user_id']
        item['problem_user_id'] = row['problem_user_id']
        # strip the BOM and ideographic spaces from the source code
        item['originalcode'] = row['code'].replace('\ufeff', '').replace('\u3000', ' ')
        items.append(item)
        ids.append(item['problem_user_id'])
        if len(ids) == 10000:
            push_code_to_queue(que_read, ids, items)
            preprocess_logger.info('Total Preprocess {}'.format(count))
            ids = []
            items = []
    push_code_to_queue(que_read, ids, items)
    preprocess_logger.info('Total Preprocess {}'.format(count))

    for p in pros:
        p.join()
    save_pro.join()
def save_fake_code(que: mp.Queue, all_data_count):
    """Consumer process: collect fake-code items from the queue and write them to the database in batches."""
    create_table(db_full_path=FAKE_C_COMPILE_ERROR_DATA_DBPATH, table_name=current_table_name)
    que.qsize()
    preprocess_logger.info('Start Save Fake Code Process. all data count: {}'.format(all_data_count))
    count = 0
    error_count = 0
    param = []
    while True:
        if not que.empty() and count < all_data_count:
            try:
                preprocess_logger.info('before get item: {}'.format(count))
                item = que.get()
                preprocess_logger.info('after get item: {}'.format(count))
            except TypeError as e:
                preprocess_logger.info('Save get Type Error')
                error_count += 1
                continue
            count += 1
            if count % 1000 == 0:
                preprocess_logger.info('Total receive records: {}'.format(count))
            if not item:
                continue
            param.append(item)
            preprocess_logger.info(
                'save data count: {}. current count: {}, Wait items: {}, Queue is Empty: {}'.format(
                    count, len(param), que.qsize(), que.empty()))
            # flush to the database once more than 1000 items have accumulated
            if len(param) > 1000:
                preprocess_logger.info(
                    'Save {} records. Total records: {}. error count: {}. Wait items: {}'.format(
                        len(param), count, error_count, que.qsize()))
                insert_items(db_full_path=FAKE_C_COMPILE_ERROR_DATA_DBPATH,
                             table_name=current_table_name, params=dict_to_list(param))
                param = []
        elif que.empty() and count >= all_data_count:
            break
        elif que.qsize() <= 0:
            time.sleep(1)
    # flush the remaining items after the loop exits
    preprocess_logger.info(
        'Save {} records. Total records: {}. error count: {}. Wait items: {}'.format(
            len(param), count, error_count, que.qsize()))
    insert_items(db_full_path=FAKE_C_COMPILE_ERROR_DATA_DBPATH,
                 table_name=current_table_name, params=dict_to_list(param))
    preprocess_logger.info('End Save Fake Code Process')
def __init__(self, vocabulary, db_path, table_name, replace_table_name, ignore_token=None, end_id=None):
    self.vocabulary = vocabulary
    self.db_path = db_path
    self.table_name = table_name
    self.replace_table_name = replace_table_name
    create_table(self.db_path, self.table_name, self.replace_table_name)
    self.ignore_token = ignore_token
    self.end_id = end_id
    self.total_count = 0
def resave_python_code_main():
    conn = sqlite3.connect(scrapyOJ_path)
    print('start read sql')
    df = pd.read_sql('select * from {} where language="Python 3"'.format('submit'), conn)
    print('total df length: {}'.format(len(df)))
    df = df[df['code'].map(lambda x: x != '')]
    print('no empty df length: {}'.format(len(df)))
    df_dict = df.to_dict(orient='list')
    del df
    print('finish filter')

    create_table(python_db_path, PYTHON_SUBMIT_TABLE)
    header_list = ['id', 'submit_url', 'submit_time', 'user_id', 'user_name', 'problem_id', 'problem_url',
                   'problem_name', 'problem_full_name', 'language', 'status', 'error_test_id', 'time',
                   'memory', 'code']
    total_list = [df_dict[key] for key in header_list]
    total_list = list(zip(*total_list))
    print('start save')
    insert_items(python_db_path, PYTHON_SUBMIT_TABLE, total_list)
    print('end save')
def preprocess():
    # initLogging()
    preprocess_logger.info("Start Read Code Data")
    code_df = read_deepfix_ac_data()
    preprocess_logger.info("Code Data Read Finish. Total: {}".format(code_df.shape[0]))

    que_read = mp.Queue()
    que_write = mp.Queue()
    create_table(db_full_path=db_name, table_name=current_table_name)

    pros = []
    for i in range(6):
        pro = mp.Process(target=make_fake_code, args=(que_read, que_write, i))
        pro.start()
        pros.append(pro)
    save_pro = mp.Process(target=save_fake_code, args=(que_write, code_df.shape[0]))
    save_pro.start()

    count = 0
    ids = []
    items = []
    for index, row in code_df.iterrows():
        count += 1
        # item = create_codeforce_item(row)
        item = create_deepfix_item(row)
        items.append(item)
        ids.append(item['problem_user_id'])
        if len(ids) == 10000:
            push_code_to_queue(que_read, ids, items)
            preprocess_logger.info('Total Preprocess {}'.format(count))
            ids = []
            items = []
    push_code_to_queue(que_read, ids, items)
    preprocess_logger.info('Total Preprocess {}'.format(count))

    for p in pros:
        p.join()
    save_pro.join()
def sample_and_save(model, dataset, batch_size, loss_function, parse_input_batch_data_fn,
                    parse_target_batch_data_fn, do_sample=False, print_output=False,
                    create_output_ids_fn=None, evaluate_obj_list=[], expand_output_and_target_fn=None,
                    add_data_record_fn=None, db_path='', table_name=''):
    """Run the model over the dataset, evaluate its outputs, and periodically save the produced records."""
    # total_loss = to_cuda(torch.Tensor([0]))
    total_batch = to_cuda(torch.Tensor([0]))
    saved_count = 0
    steps = 1
    for o in evaluate_obj_list:
        o.clear_result()
    model.eval()
    total_saved_list = []

    with tqdm(total=len(dataset)) as pbar:
        with torch.no_grad():
            for batch_data in data_loader(dataset, batch_size=batch_size, drop_last=True):
                model.zero_grad()

                # model_input = parse_input_batch_data(batch_data)
                model_input = parse_input_batch_data_fn(batch_data, do_sample=do_sample)
                # model_output = model.forward(*model_input, test=do_sample)
                if do_sample:
                    model_output = model.forward(*model_input, do_sample=True)
                    model_target = parse_target_batch_data_fn(batch_data)
                    model_output, model_target = expand_output_and_target_fn(model_output, model_target)
                else:
                    model_output = model.forward(*model_input)
                    model_target = parse_target_batch_data_fn(batch_data)
                # loss = loss_function(*model_output, *model_target)

                output_ids = create_output_ids_fn(model_output, model_input)
                # total_loss += loss.data
                total_batch += batch_size

                # step_output = 'in evaluate step {} loss: {}, '.format(steps, loss.data.item())
                step_output = 'in evaluate step {} '.format(steps)
                for evaluator in evaluate_obj_list:
                    res = evaluator.add_result(output_ids, model_output, model_target, model_input,
                                               batch_data=batch_data)
                    step_output += res
                # print(step_output)
                info(step_output)

                saved_list = add_data_record_fn(output_ids, model_output, batch_data)
                total_saved_list += saved_list

                # flush the accumulated records to the database every 100 steps
                if steps % 100 == 0:
                    create_table(db_path, table_name)
                    insert_items(db_path, table_name, total_saved_list)
                    saved_count += len(total_saved_list)
                    print('saved {} records in total {}.'.format(saved_count, total_batch.item()))
                    total_saved_list = []

                if print_output and steps % 100 == 0:
                    pass
                    # output_ids = output_ids.tolist()
                    # target_ids = batch_data['ac_tokens']
                    # is_copy = (is_copy > 0.5).tolist()
                    # target_is_copy = target_is_copy.tolist()
                    # value_output = torch.squeeze(torch.topk(F.softmax(value_output, dim=-1), k=1, dim=-1)[1], dim=-1)
                    # value_output = value_output.tolist()
                    # target_ac_tokens = target_ac_tokens.tolist()
                    # pointer_output = torch.squeeze(torch.topk(F.softmax(pointer_output, dim=-1), k=1, dim=-1)[1], dim=-1)
                    # pointer_output = pointer_output.tolist()
                    # target_pointer_output = target_pointer_output.tolist()
                    # target_length = torch.sum(output_mask, dim=-1)
                    # target_length = target_length.tolist()
                    # for out, tar, cop, tar_cop, val, tar_val, poi, tar_poi, tar_len in zip(
                    #         output_ids, target_ids, is_copy, target_is_copy, value_output,
                    #         target_ac_tokens, pointer_output, target_pointer_output, target_length):
                    # # for out, tar, in zip(output_ids, target_ids):
                    #     out_code, end_pos = convert_one_token_ids_to_code(out, id_to_word_fn=vocab.id_to_word,
                    #                                                       start=start_id, end=end_id, unk=unk_id)
                    #     tar_code, tar_end_pos = convert_one_token_ids_to_code(tar[1:], id_to_word_fn=vocab.id_to_word,
                    #                                                           start=start_id, end=end_id, unk=unk_id)
                    #     info('-------------- step {} ------------------------'.format(steps))
                    #     info('output: {}'.format(out_code))
                    #     info('target: {}'.format(tar_code))
                    #     cop = [str(c) for c in cop]
                    #     tar_cop = [str(int(c)) for c in tar_cop]
                    #     poi = [str(c) for c in poi]
                    #     tar_poi = [str(c) for c in tar_poi]
                    #     info('copy output: {}'.format(' '.join(cop[:tar_len])))
                    #     info('copy target: {}'.format(' '.join(tar_cop[:tar_len])))
                    #     info('pointer output: {}'.format(' '.join(poi[:tar_len])))
                    #     info('pointer target: {}'.format(' '.join(tar_poi[:tar_len])))
                    #
                    #     value_list = []
                    #     target_list = []
                    #     for c, v, t in zip(tar_cop, val, tar_val):
                    #         if c == '1':
                    #             value_list += ['<COPY>']
                    #             target_list += ['<COPY>']
                    #         else:
                    #             value_list += [vocab.id_to_word(int(v))]
                    #             target_list += [vocab.id_to_word(int(t))]
                    #     info('value output: {}'.format(' '.join(value_list[:tar_len])))
                    #     info('value target: {}'.format(' '.join(target_list[:tar_len])))

                steps += 1
                pbar.update(batch_size)

    # flush whatever is left after the last batch
    create_table(db_path, table_name)
    insert_items(db_path, table_name, total_saved_list)
    saved_count += len(total_saved_list)
    print('saved {} records in total {}.'.format(saved_count, total_batch.item()))
    return evaluate_obj_list
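# A hedged usage sketch for sample_and_save. Every *_fn callback and the SequenceExactMatch
# evaluator below are hypothetical placeholders; the real objects come from the experiment
# configuration:
#
#   evaluators = sample_and_save(
#       model=my_model, dataset=valid_dataset, batch_size=16, loss_function=None,
#       parse_input_batch_data_fn=parse_input_fn, parse_target_batch_data_fn=parse_target_fn,
#       do_sample=True, create_output_ids_fn=create_output_ids,
#       evaluate_obj_list=[SequenceExactMatch()], expand_output_and_target_fn=expand_fn,
#       add_data_record_fn=make_records, db_path='data/records.db', table_name='sample_records')
#
# Records are flushed to the database every 100 steps and once more after the final batch.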
def multi_step_evaluate(model, dataset, batch_size, parse_input_batch_data_fn, parse_target_batch_data_fn,
                        do_sample=False, print_output=False, create_output_ids_fn=None, evaluate_obj_list=[],
                        expand_output_and_target_fn=None, max_step_times=0, vocabulary=None, file_path='',
                        create_multi_step_next_input_batch_fn=None, extract_includes_fn=lambda x: x['includes'],
                        print_output_fn=None, do_beam_search=False, target_file_path='main.out',
                        log_file_path='main.log', do_save_data=False, max_save_distance=None,
                        save_records_to_database=False, db_path='', table_name='',
                        change_output_records_to_batch_fn=None, create_save_database_records_fn=None,
                        error_stop_type='normal'):
    """Repair each batch for up to max_step_times steps, compiling after every step, and optionally save the results."""
    total_loss = to_cuda(torch.Tensor([0]))
    total_batch = to_cuda(torch.Tensor([0]))
    steps = 0
    compile_evaluator = CompileResultEvaluate()
    compile_evaluator.clear_result()
    for o in evaluate_obj_list:
        o.clear_result()
    model.eval()

    from common.pycparser_util import tokenize_by_clex_fn
    tokenize_fn = tokenize_by_clex_fn()
    save_data_dict = {}
    save_records_list = []

    # file_path = add_pid_to_file_path(file_path)
    # target_file_path = add_pid_to_file_path(target_file_path)

    with tqdm(total=len(dataset)) as pbar:
        with torch.no_grad():
            for batch_data in data_loader(dataset, batch_size=batch_size, drop_last=False):
                model.zero_grad()

                input_data = batch_data.copy()
                final_output_list = []
                output_records_list = []
                continue_list = [True for _ in range(batch_size)]
                result_list = [False for _ in range(batch_size)]
                result_records_list = []
                sample_steps = [-1 for _ in range(batch_size)]
                error_count_list = batch_data['error_count']

                for i in range(max_step_times):
                    model_input = parse_input_batch_data_fn(input_data, do_sample=True)
                    model_output = model.forward(*model_input, do_sample=True, do_beam_search=do_beam_search)

                    input_data, final_output, output_records, final_output_name_list, continue_list = \
                        create_multi_step_next_input_batch_fn(input_data, model_input, model_output,
                                                              continue_list, do_beam_search)
                    final_output_list += [final_output]
                    output_records_list += [output_records]

                    # compile the current outputs to decide which samples still need repair
                    continue_list, result_list, cur_error_count_list = compile_code_ids_list(
                        final_output_name_list, continue_list, result_list, vocabulary=vocabulary,
                        includes_list=extract_includes_fn(input_data), file_path=file_path,
                        target_file_path=target_file_path, log_file_path=log_file_path,
                        do_compile_pool=True, need_transform=False)

                    if error_stop_type == 'oracle':
                        # reject a step that increases the compile-error count
                        reject_list = [True if c and n > o else False
                                       for c, o, n in zip(continue_list, error_count_list, cur_error_count_list)]
                    elif error_stop_type == 'normal':
                        reject_list = [False for _ in range(batch_size)]
                    error_count_list = [n if n < o and n >= 0 else o
                                        for o, n in zip(error_count_list, cur_error_count_list)]
                    for i_f, rej in enumerate(reject_list):
                        if rej:
                            # use last output
                            final_output_name_list[i_f] = input_data['last_input_seq_name'][i_f]
                            continue_list[i_f] = False

                    sample_steps = [i + 1 if s == -1 and not c and not r else s
                                    for s, c, r in zip(sample_steps, continue_list, reject_list)]
                    sample_steps = [i if s == -1 and not c and r else s
                                    for s, c, r in zip(sample_steps, continue_list, reject_list)]

                    result_records_list += [result_list]
                    if sum(continue_list) == 0:
                        break
                sample_steps = [max_step_times if s == -1 else s for s in sample_steps]

                if do_save_data:
                    batch_data['input_seq_name'] = batch_data['final_output_name']
                    save_res_dict = save_addition_data(original_states=batch_data, states=input_data,
                                                       tokenize_fn=tokenize_fn, batch_size=batch_size,
                                                       file_path=file_path, target_file_path=target_file_path,
                                                       vocabulary=vocabulary, max_distande=max_save_distance,
                                                       only_error=True)
                    for k, v in save_res_dict.items():
                        save_data_dict[k] = save_data_dict.get(k, []) + v

                if save_records_to_database:
                    batch_output_records = change_output_records_to_batch_fn(output_records_list, sample_steps)
                    records_list = create_save_database_records_fn(batch_data, sample_steps,
                                                                   final_output_name_list, result_list,
                                                                   batch_output_records, input_data)
                    save_records_list += records_list

                step_output = 'in evaluate step {}: '.format(steps)
                res = compile_evaluator.add_result(result_list)
                step_output += res
                for evaluator in evaluate_obj_list:
                    # custom evaluator interface
                    res = evaluator.add_result(result_list, batch_data=batch_data)
                    step_output += res
                # print(step_output)
                info(step_output)

                if print_output and steps % 1 == 0:
                    print_output_fn(output_records=output_records_list, final_output=final_output_list,
                                    batch_data=batch_data, step_i=steps, vocabulary=vocabulary,
                                    compile_result_list=result_records_list)

                steps += 1
                pbar.update(batch_size)

    evaluate_obj_list = [compile_evaluator] + evaluate_obj_list

    if save_records_to_database:
        create_table(db_path, DATA_RECORDS_DEEPFIX, replace_table_name=table_name)
        run_sql_statment(db_path, DATA_RECORDS_DEEPFIX, 'insert_ignore', save_records_list,
                         replace_table_name=table_name)

    if steps == 0:
        t_loss = 0
    else:
        t_loss = (total_loss / steps).item()
    return evaluate_obj_list, t_loss, save_data_dict