def encode_test_files():
    """Run ``encode_all_html_tables`` for the ``FILETYPE_TESTING`` file type.

    Three locations are derived from ``cleaned_tags_dir()``:

    * a one-element list holding the wildcard pattern for the
      ``*.unescaped`` files below the ``10-k`` directories,
    * the path of the saved ``validation_test_split`` file,
    * the wildcard pattern for the ``tokens`` files one level down.
    """
    unescaped_pattern = os.path.join(
        cleaned_tags_dir(), '*', '10-k', '*', '*', '*.unescaped')
    split_file = os.path.join(cleaned_tags_dir(), 'validation_test_split')
    tokens_pattern = os.path.join(cleaned_tags_dir(), '*', 'tokens')

    encode_all_html_tables(
        FILETYPE_TESTING,
        [unescaped_pattern],
        split_file,
        tokens_pattern,
    )
def find_test_encodings():
    """Run ``find_all_encodings`` for the ``FILETYPE_TESTING`` file type.

    Existing token files and number files are removed first (via
    ``remove_all_tokens_files`` and ``remove_all_number_files``) so the
    run starts from a clean state. The input pattern, the saved split
    path and the tokens pattern are all rooted at ``cleaned_tags_dir()``.
    """
    unescaped_pattern = os.path.join(
        cleaned_tags_dir(), '*', '10-k', '*', '*', '*.unescaped')
    split_file = os.path.join(cleaned_tags_dir(), 'validation_test_split')

    # Clear the outputs of any earlier run before recomputing them.
    remove_all_tokens_files()
    remove_all_number_files()

    tokens_pattern = os.path.join(cleaned_tags_dir(), '*', 'tokens')
    find_all_encodings(
        FILETYPE_TESTING,
        [unescaped_pattern],
        split_file,
        tokens_pattern,
    )
def remove_all_tokens_files():
    """Remove all ``tokens`` files below ``cleaned_tags_dir()``.

    All files are meant to be processed at once so that the set of tokens
    takes every file into consideration. That is why all token files are
    removed before the process starts.
    """
    print('Removing all token files ...', end=' ')
    root_dir = cleaned_tags_dir()
    remove_files(root_dir, '**', 'tokens')
def test_matching_filenames(training):
    """Print a sanity report about the saved file split.

    Args:
        training: Any value accepted by ``int()``. When it equals
            ``FILETYPE_TRAINING`` only the number of training filenames
            is reported; any other value runs the validation/test checks.

    In the validation/test case the report states:

    * whether any filename appears in both the validation and test sets,
    * the number of filenames in each set,
    * whether each set's percentage of the combined total is within 0.1
      of ``VALIDATION_FILE_PERCENT`` / ``TEST_FILE_PERCENT``; the actual
      percentages are printed when it is not.

    Returns:
        None. All results are printed.
    """
    paths = [
        os.path.join(cleaned_tags_dir(), '*', '10-k', '*', '*', '*.unescaped')
    ]
    saved_filenames_path = os.path.join(cleaned_tags_dir(),
                                        'validation_test_split')

    if int(training) == FILETYPE_TRAINING:
        print('Training test')
        training_filenames = matching_filenames(saved_filenames_path,
                                                paths,
                                                FILETYPE_TRAINING)
        print(f'len(training_filenames): {len(training_filenames)}')
        return

    print('Validation/testing test')
    validation_filenames = matching_filenames(saved_filenames_path,
                                              paths,
                                              FILETYPE_VALIDATION)
    test_filenames = matching_filenames(saved_filenames_path,
                                        paths,
                                        FILETYPE_TESTING)

    # The two sets must be disjoint; any shared filename is an error.
    if set(validation_filenames) & set(test_filenames):
        print('Error !! Some filenames in validation also in test.')

    num_validation_files = len(validation_filenames)
    num_test_files = len(test_filenames)
    print(f'len(validation_filenames): {num_validation_files}')
    print(f'len(test_filenames): {num_test_files}')

    total_num_files = num_validation_files + num_test_files
    if total_num_files == 0:
        # Without this guard the percentage computation below would raise
        # ZeroDivisionError when no files match at all.
        print('Error !! No validation or test files found.')
        return

    validation_file_percent = num_validation_files / total_num_files * 100.
    test_file_percent = num_test_files / total_num_files * 100.

    validation_ok = \
        abs(validation_file_percent - VALIDATION_FILE_PERCENT) < 0.1
    test_ok = abs(test_file_percent - TEST_FILE_PERCENT) < 0.1

    if validation_ok and test_ok:
        print('Correct validation/test ratio of files selected')
    else:
        print('Error !! Incorrect validation/test ratio '
              'of files selected')
        print(f'validation_file_percent: {validation_file_percent:4.1f}')
        print(f'test_file_percent: {test_file_percent:4.1f}')
def remove_all_number_files():
    """Call ``remove_files`` for ``*.nums`` below ``cleaned_tags_dir()``."""
    root_dir = cleaned_tags_dir()
    remove_files(root_dir, '**', '*.nums')
def remove_all_decoded_files():
    """Call ``remove_files`` for ``*.decoded`` below ``cleaned_tags_dir()``."""
    root_dir = cleaned_tags_dir()
    remove_files(root_dir, '**', '*.decoded')
def decode_validation_test_files():
    """Run ``decode_all_files`` on the ``*.encoded`` files.

    The wildcard pattern for the encoded files is expanded with
    ``get_filenames`` and the result is passed to ``decode_all_files``
    together with the tokens path.
    """
    encoded_pattern = os.path.join(
        cleaned_tags_dir(), '*', '10-k', '*', '*', '*.encoded')
    # NOTE(review): other functions in this file build the tokens path with
    # an extra '*' directory level (cleaned_tags_dir()/'*'/'tokens'); here
    # it sits directly under cleaned_tags_dir() -- confirm this is intended.
    tokens_file = os.path.join(cleaned_tags_dir(), 'tokens')

    encoded_filenames = get_filenames(encoded_pattern)
    decode_all_files(encoded_filenames, tokens_file)