示例#1
0
def encode_test_files():
    """Invoke ``encode_all_html_tables`` for the testing file type.

    Three paths are assembled under the location returned by
    ``cleaned_tags_dir()``: a glob pattern for the ``*.unescaped`` files
    (passed as a one-element list), the ``validation_test_split`` path and
    a ``tokens`` glob pattern. They are handed over together with
    ``FILETYPE_TESTING``.
    """
    unescaped_glob = os.path.join(
        cleaned_tags_dir(), '*', '10-k', '*', '*', '*.unescaped')
    split_path = os.path.join(cleaned_tags_dir(), 'validation_test_split')
    tokens_glob = os.path.join(cleaned_tags_dir(), '*', 'tokens')
    encode_all_html_tables(
        FILETYPE_TESTING, [unescaped_glob], split_path, tokens_glob)
示例#2
0
def find_test_encodings():
    """Invoke ``find_all_encodings`` for the testing file type.

    Before the call, ``remove_all_tokens_files`` and
    ``remove_all_number_files`` are run. The arguments passed on are
    ``FILETYPE_TESTING``, a one-element list holding the ``*.unescaped``
    glob pattern, the ``validation_test_split`` path and the ``tokens``
    glob pattern, all rooted at ``cleaned_tags_dir()``.
    """
    unescaped_glob = os.path.join(
        cleaned_tags_dir(), '*', '10-k', '*', '*', '*.unescaped')
    split_path = os.path.join(cleaned_tags_dir(), 'validation_test_split')

    # Run both clean-up helpers before the encodings are looked for.
    remove_all_tokens_files()
    remove_all_number_files()

    tokens_glob = os.path.join(cleaned_tags_dir(), '*', 'tokens')
    find_all_encodings(
        FILETYPE_TESTING, [unescaped_glob], split_path, tokens_glob)
示例#3
0
def remove_all_tokens_files():
    """Print a progress note and call ``remove_files`` for ``tokens``.

    All files are meant to be processed in one go so that the set of
    tokens takes every file into consideration; that is why the token
    files are cleared out before the process starts.
    """
    print('Removing all token files ...', end=' ')
    root_dir = cleaned_tags_dir()
    remove_files(root_dir, '**', 'tokens')
def test_matching_filenames(training):
    """Print a sanity report about the saved train/validation/test split.

    Args:
        training: Any value accepted by ``int()``. If it equals
            ``FILETYPE_TRAINING`` only the training filenames are counted;
            every other value selects the validation/testing check.

    In the validation/testing case the function reports whether any
    filename occurs in both sets, the size of each set, and whether each
    set's share of the combined count is within 0.1 percentage points of
    ``VALIDATION_FILE_PERCENT`` and ``TEST_FILE_PERCENT`` respectively.

    Nothing is returned; all results are written with ``print``.
    """
    paths = [
        os.path.join(cleaned_tags_dir(), '*', '10-k', '*', '*', '*.unescaped')
    ]
    saved_filenames_path = os.path.join(cleaned_tags_dir(),
                                        'validation_test_split')

    if int(training) == FILETYPE_TRAINING:
        print('Training test')
        training_filenames = matching_filenames(saved_filenames_path, paths,
                                                FILETYPE_TRAINING)
        print(f'len(training_filenames): {len(training_filenames)}')
        return

    print('Validation/testing test')
    validation_filenames = matching_filenames(saved_filenames_path, paths,
                                              FILETYPE_VALIDATION)
    test_filenames = matching_filenames(saved_filenames_path, paths,
                                        FILETYPE_TESTING)

    # The two sets must be disjoint; a non-empty intersection is an error.
    if set(validation_filenames) & set(test_filenames):
        print('Error !! Some filenames in validation also in test.')

    print(f'len(validation_filenames): {len(validation_filenames)}')
    print(f'len(test_filenames): {len(test_filenames)}')

    num_validation_files = len(validation_filenames)
    num_test_files = len(test_filenames)
    total_num_files = num_validation_files + num_test_files

    if total_num_files == 0:
        # With no files at all the percentages are undefined; report it
        # instead of failing with ZeroDivisionError below.
        print('Error !! No validation or test files found; '
              'cannot compute the validation/test ratio')
        return

    validation_file_percent = num_validation_files / total_num_files * 100.
    test_file_percent = num_test_files / total_num_files * 100.

    validation_ok = \
        abs(validation_file_percent - VALIDATION_FILE_PERCENT) < 0.1
    test_ok = abs(test_file_percent - TEST_FILE_PERCENT) < 0.1
    if validation_ok and test_ok:
        print('Correct validation/test ratio of files selected')
    else:
        print('Error !! Incorrect validation/test ratio '
              'of files selected')

    print(
        'validation_file_percent: {:4.1f}'.format(validation_file_percent))
    print('test_file_percent: {:4.1f}'.format(test_file_percent))
示例#5
0
def remove_all_number_files():
    """Call ``remove_files`` for the ``*.nums`` pattern.

    The call is rooted at ``cleaned_tags_dir()`` and uses ``**`` as the
    intermediate path component.
    """
    root_dir = cleaned_tags_dir()
    remove_files(root_dir, '**', '*.nums')
示例#6
0
def remove_all_decoded_files():
    """Call ``remove_files`` for the ``*.decoded`` pattern.

    The call is rooted at ``cleaned_tags_dir()`` and uses ``**`` as the
    intermediate path component.
    """
    root_dir = cleaned_tags_dir()
    remove_files(root_dir, '**', '*.decoded')
示例#7
0
def decode_validation_test_files():
    """Invoke ``decode_all_files`` on the ``*.encoded`` files.

    The filenames are obtained by passing the ``*.encoded`` glob pattern
    (rooted at ``cleaned_tags_dir()``) to ``get_filenames``. The tokens
    path is ``tokens`` joined directly onto ``cleaned_tags_dir()``.
    """
    encoded_glob = os.path.join(
        cleaned_tags_dir(), '*', '10-k', '*', '*', '*.encoded')
    tokens_file = os.path.join(cleaned_tags_dir(), 'tokens')
    encoded_filenames = get_filenames(encoded_glob)
    decode_all_files(encoded_filenames, tokens_file)