def copy_distance_and_action_main():
    """Merge columns from the full fake-error dataset into the slk sample tables."""
    db_path = r'/home/lf/Project/ProgramFix/data/slk_sample_data.db'
    con = sqlite3.connect(db_path)
    sample_train = read_data(con, 'slk_sample_common_c_error_records_train')
    sample_valid = read_data(con, 'slk_sample_common_c_error_records_valid')
    sample_test = read_data(con, 'slk_sample_common_c_error_records_test')
    print(len(sample_train))
    print(len(sample_valid))
    print(len(sample_test))

    train, valid, test = read_fake_common_c_error_dataset_with_limit_length(MAX_TOKEN_LENGTH)
    # Uncomment to run on a small subsample for a quick test.
    # train = train.sample(100)
    # valid = valid.sample(100)
    # test = test.sample(100)

    # Join the sample records with the full dataset on the record id.
    merge_train = pd.merge(sample_train, train, on=['id'])
    merge_valid = pd.merge(sample_valid, valid, on=['id'])
    merge_test = pd.merge(sample_test, test, on=['id'])
    print(len(merge_train))
    print(len(merge_valid))
    print(len(merge_test))

    update_column(merge_train, 'slk_sample_common_c_error_records_train', con)
    update_column(merge_valid, 'slk_sample_common_c_error_records_valid', con)
    update_column(merge_test, 'slk_sample_common_c_error_records_test', con)
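# Usage sketch (a hypothetical helper, not part of the original module): after running
# the copy step, the updated tables can be read back for a quick sanity check using the
# same read_data helper the function above relies on.
def _check_updated_tables():
    con = sqlite3.connect(r'/home/lf/Project/ProgramFix/data/slk_sample_data.db')
    for table in ['slk_sample_common_c_error_records_train',
                  'slk_sample_common_c_error_records_valid',
                  'slk_sample_common_c_error_records_test']:
        print(table, len(read_data(con, table)))
    con.close()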
def read_modify_action_token():
    """Collect the target token ('to_char') of every modify action in the train split."""
    train_df, _, _ = read_fake_common_c_error_dataset_with_limit_length(500)
    # The action list is stored as a JSON string; decode it first.
    train_df['modify_action_list'] = train_df['modify_action_list'].map(json.loads)
    extract_to_token_fn = lambda actions: [act['to_char'] for act in actions]
    act_tokens = [extract_to_token_fn(actions)
                  for actions in train_df['modify_action_list']]
    return act_tokens
def read_filter_without_include_ac_token():
    """Tokenize the similar (AC) code of the train split and keep the raw token values."""
    train_df, _, _ = read_fake_common_c_error_dataset_with_limit_length(500)
    transform_lextoken_to_token_fn = lambda token_list: [i.value for i in token_list]
    tokenize_fn = tokenize_by_clex_fn()
    parse_tokens = [transform_lextoken_to_token_fn(tokenize_fn(code))
                    for code in train_df['similar_code']]
    return parse_tokens
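# Usage sketch (a hypothetical helper, not part of the original module): both readers
# above return plain lists of token-string lists, so they can be flattened and counted
# directly with the standard library.
def _inspect_token_frequencies():
    from collections import Counter
    act_tokens = read_modify_action_token()
    ac_tokens = read_filter_without_include_ac_token()
    # Flatten the per-record lists and report the most common tokens of each kind.
    print(Counter(tok for tokens in act_tokens for tok in tokens).most_common(10))
    print(Counter(tok for tokens in ac_tokens for tok in tokens).most_common(10))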
def load_common_error_data_sample_with_encoder_copy_100(inner_begin_id, inner_end_id):
    """Load a 100-record sample per split, parsed for the encoder-copy model."""
    vocab = create_common_error_vocabulary(begin_tokens=['<BEGIN>'],
                                           end_tokens=['<END>'],
                                           unk_token='<UNK>',
                                           addition_tokens=['<GAP>'])
    train, valid, test = read_fake_common_c_error_dataset_with_limit_length(MAX_TOKEN_LENGTH)
    train = convert_c_code_fields_to_cpp_fields(train)
    valid = convert_c_code_fields_to_cpp_fields(valid)
    test = convert_c_code_fields_to_cpp_fields(test)

    # Down-sample each split to 100 records for quick experiments.
    train = train.sample(100)
    valid = valid.sample(100)
    test = test.sample(100)

    tokenize_fn = tokenize_by_clex_fn()
    parse_param = [vocab, action_list_sorted, tokenize_fn, inner_begin_id, inner_end_id]

    train_data = parse_error_tokens_and_action_map_encoder_copy(train, 'train', *parse_param)
    valid_data = parse_error_tokens_and_action_map_encoder_copy(valid, 'valid', *parse_param)
    test_data = parse_error_tokens_and_action_map_encoder_copy(test, 'test', *parse_param)

    # Keep only the dataframe rows that survived parsing.
    train = train.loc[train_data[0].index.values]
    valid = valid.loc[valid_data[0].index.values]
    test = test.loc[test_data[0].index.values]

    def build_dict(data, df):
        # Pack the parsed columns and the per-record includes into one dict.
        return {'error_code_word_id': data[0], 'ac_code_word_id': data[1],
                'token_map': data[2], 'error_mask': data[3],
                'includes': df['includes'], 'is_copy': data[4],
                'distance': data[5], 'ac_code_target_id': data[6],
                'ac_code_target': data[7]}

    train_dict = build_dict(train_data, train)
    valid_dict = build_dict(valid_data, valid)
    test_dict = build_dict(test_data, test)
    return train_dict, valid_dict, test_dict
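# Usage sketch (hypothetical): the inner begin/end ids are vocabulary ids supplied by
# the caller; the integers below are placeholders only meant to show the call shape.
def _demo_encoder_copy_sample():
    inner_begin_id, inner_end_id = 0, 1  # placeholder ids, not real vocabulary entries
    train_dict, valid_dict, test_dict = \
        load_common_error_data_sample_with_encoder_copy_100(inner_begin_id, inner_end_id)
    print(len(train_dict['error_code_word_id']))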
def load_common_error_dataset_iterate_error_data_100(do_flatten=False, merge_action=True,
                                                     sequence_output=False):
    """Load a 100-record sample per split of the iterative-sample error dataset."""
    vocab = create_common_error_vocabulary(
        begin_tokens=['<BEGIN>', '<INNER_BEGIN>'],
        end_tokens=['<END>', '<INNER_END>'],
        unk_token='<UNK>',
        addition_tokens=['<PAD>'])
    train, valid, test = read_fake_common_c_error_dataset_with_limit_length(500)
    train = convert_c_code_fields_to_cpp_fields(train, convert_include=True)
    valid = convert_c_code_fields_to_cpp_fields(valid, convert_include=True)
    test = convert_c_code_fields_to_cpp_fields(test, convert_include=True)

    # Down-sample each split to 100 records for quick experiments.
    train = train.sample(100)
    valid = valid.sample(100)
    test = test.sample(100)

    tokenize_fn = tokenize_by_clex_fn()
    parse_fn = parse_iterative_sample_action_error_code
    parse_param = [vocab, action_list_sorted_no_reverse, tokenize_fn, merge_action,
                   sequence_output]

    train_data = parse_fn(train, 'train', *parse_param)
    valid_data = parse_fn(valid, 'valid', *parse_param)
    test_data = parse_fn(test, 'test', *parse_param)

    # Keep only the dataframe rows that survived parsing.
    train = train.loc[train_data[0].index.values]
    valid = valid.loc[valid_data[0].index.values]
    test = test.loc[test_data[0].index.values]

    def build_dict(data, df):
        # Pack the parsed per-step columns and the per-record dataframe columns.
        return {'error_token_id_list': data[0], 'sample_error_id_list': data[1],
                'sample_ac_id_list': data[2], 'ac_pos_list': data[3],
                'error_pos_list': data[4], 'includes': df['includes'],
                'distance': df['distance'], 'ac_code_ids': data[5],
                'is_copy_list': data[6], 'copy_pos_list': data[7],
                'sample_mask_list': data[8], 'error_token_name_list': data[9],
                'id': df['id'], 'target_ac_token_id_list': data[10],
                'ac_code_name_with_labels': data[11]}

    train_dict = build_dict(train_data, train)
    valid_dict = build_dict(valid_data, valid)
    test_dict = build_dict(test_data, test)

    if do_flatten:
        train_dict = flatten_iterative_data(train_dict)
        valid_dict = flatten_iterative_data(valid_dict)
        test_dict = flatten_iterative_data(test_dict)
    return train_dict, valid_dict, test_dict
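# Usage sketch (a hypothetical check, not part of the original module): load the
# 100-record iterative sample with the default settings and report the split sizes.
def _demo_iterative_sample():
    train_dict, valid_dict, test_dict = load_common_error_dataset_iterate_error_data_100()
    for name, d in (('train', train_dict), ('valid', valid_dict), ('test', test_dict)):
        print(name, len(d['error_token_id_list']))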
def load_common_error_data(addition_infomation=False, data_type=None):
    """Load the full common C error dataset, parsed into model-ready dicts.

    With data_type='deepfix' the DeepFix vocabulary and records are used instead.
    """
    if data_type == 'deepfix':
        vocab = create_deepfix_common_error_vocabulary(
            begin_tokens=['<BEGIN>', '<INNER_BEGIN>'],
            end_tokens=['<END>', '<INNER_END>'],
            unk_token='<UNK>',
            addition_tokens=['<PAD>'])
        train, valid, test = read_fake_common_deepfix_error_dataset_with_limit_length(500)
        train = convert_c_code_fields_to_cpp_fields(train, convert_include=False)
        valid = convert_c_code_fields_to_cpp_fields(valid, convert_include=False)
        test = convert_c_code_fields_to_cpp_fields(test, convert_include=False)
    else:
        vocab = create_common_error_vocabulary(begin_tokens=['<BEGIN>'],
                                               end_tokens=['<END>'],
                                               unk_token='<UNK>',
                                               addition_tokens=['<GAP>'])
        train, valid, test = read_fake_common_c_error_dataset_with_limit_length(MAX_TOKEN_LENGTH)
        train = convert_c_code_fields_to_cpp_fields(train)
        valid = convert_c_code_fields_to_cpp_fields(valid)
        test = convert_c_code_fields_to_cpp_fields(test)

    tokenize_fn = tokenize_by_clex_fn()
    parse_param = [vocab, action_list_sorted, tokenize_fn]

    train_data = parse_error_tokens_and_action_map(train, 'train', *parse_param)
    valid_data = parse_error_tokens_and_action_map(valid, 'valid', *parse_param)
    test_data = parse_error_tokens_and_action_map(test, 'test', *parse_param)

    # Keep only the dataframe rows that survived parsing.
    train = train.loc[train_data[0].index.values]
    valid = valid.loc[valid_data[0].index.values]
    test = test.loc[test_data[0].index.values]

    def build_dict(data, df):
        # Pack the parsed columns and the per-record includes into one dict.
        return {'error_code_word_id': data[0], 'ac_code_word_id': data[1],
                'token_map': data[2], 'error_mask': data[3],
                'includes': df['includes'], 'is_copy': data[4],
                'pointer_map': data[5], 'distance': data[6],
                'error_code_word': data[7]}

    train_dict = build_dict(train_data, train)
    valid_dict = build_dict(valid_data, valid)
    test_dict = build_dict(test_data, test)

    if addition_infomation:
        # Attach the original code fields to each dict for later inspection.
        train_dict = add_c_common_code_original_info(data_dict=train_dict, df=train)
        valid_dict = add_c_common_code_original_info(data_dict=valid_dict, df=valid)
        test_dict = add_c_common_code_original_info(data_dict=test_dict, df=test)
    return train_dict, valid_dict, test_dict
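# Usage sketch (an assumption about the intended call pattern): one loader serves both
# corpora; passing data_type='deepfix' switches to the DeepFix vocabulary and records.
def _demo_load_common_error_data():
    common_train, _, _ = load_common_error_data(addition_infomation=True)
    deepfix_train, _, _ = load_common_error_data(data_type='deepfix')
    print('common', len(common_train['error_code_word_id']))
    print('deepfix', len(deepfix_train['error_code_word_id']))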