import html
import os
import traceback

import numpy as np
from bs4 import BeautifulSoup
from progress.bar import ChargingBar

# Helper functions and constants used below (get_filenames, read_file,
# write_file, generated_data_dir, tag_actions, FILETYPE_TRAINING,
# SINGLE_FILE, UPDATE_TYPE, etc.) are provided by other modules in this
# project.


def unescape_all_tables(search_path):
    for filename in get_filenames(search_path):
        print(f'Un-escaping file: {filename}')
        parts = filename.split('.')
        out_filename = '.'.join(parts[:-1]) + '.unescaped'
        converted = html.unescape(read_file(filename))
        write_file(out_filename, converted)
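# Quick illustration (not part of the pipeline) of what html.unescape does
# to an escaped table fragment; the input string here is made up.
def _demo_unescape():
    escaped = '&lt;td&gt;Revenue &amp;amp; Expenses&lt;/td&gt;'
    print(html.unescape(escaped))  # -> <td>Revenue &amp; Expenses</td>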
def decode_training_files():
    paths = [os.path.join(generated_data_dir(), 'html', 'encoded',
                          '*.encoded'),
             os.path.join(generated_data_dir(), 'expected_json', 'encoded',
                          '*.encoded')]
    tokens_path = os.path.join(generated_data_dir(), 'tokens')
    decode_all_files(get_filenames(paths), tokens_path)
def find_all_encodings(file_type, paths, saved_filenames_path, tokens_path):
    filenames = get_filenames(paths)
    print('Starting all encodings')
    base_dirname = os.path.dirname(saved_filenames_path)
    all_encodings(filenames, base_dirname, tokens_path)
def encode_all_html_tables(file_type, paths, saved_filenames_path,
                           tokens_path):
    # Read tokens into a dictionary keyed value:token. This lets us
    # write the token for each value as we encode each file.
    tokens = read_tokens_file(tokens_path)
    tokens = flip_tokens_keys_values(tokens)

    tokens_dirname = os.path.dirname(tokens_path)
    out_dirname_json = os.path.join(tokens_dirname, 'expected_json',
                                    'encoded')
    out_dirname_html = os.path.join(tokens_dirname, 'html', 'encoded')
    create_dirs([out_dirname_json, out_dirname_html])

    max_encoded_file_token_len = 0
    filenames = get_filenames(paths)
    for filename in filenames:
        print(f'filename: {filename}')
        file_data = read_file(filename)
        if filename.endswith('json'):
            token_len = encode_json(out_dirname_json, filename, file_data,
                                    tokens)
        else:
            token_len = encode_html_table(out_dirname_html, filename,
                                          file_data, tokens)
        max_encoded_file_token_len = max(max_encoded_file_token_len,
                                         token_len)

    for out_dirname in (out_dirname_json, out_dirname_html):
        with open(os.path.join(out_dirname, 'max_encoded_file_token_len'),
                  'w') as f:
            f.write(f'max_encoded_file_token_len='
                    f'{max_encoded_file_token_len}')
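# Hedged sketch of the helper assumed above: flip_tokens_keys_values is
# expected to invert a {token_num: value} mapping into {value: token_num}
# so lookups go from value to token during encoding. Illustrative only.
def _example_flip_tokens_keys_values(tokens):
    return {value: token_num for token_num, value in tokens.items()}
# _example_flip_tokens_keys_values({2: 'td', 3: 'tr'}) -> {'td': 2, 'tr': 3}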
def remove_single_parens(search_path):
    for filename in get_filenames(search_path):
        print(f'Removing single parens from file: {filename}')
        parts = filename.split('.')
        out_filename = '.'.join(parts[:-1]) + '.remove-single-parens'
        top_tag = handle_single_parens(read_file(filename))
        write_file(out_filename, str(top_tag))
def train_set_max_token_len():
    print('Getting filenames ...', end=' ')
    base_path = generated_data_dir()
    fns = list(get_filenames([os.path.join(base_path, 'html',
                                           '*.unescaped')]))
    fns.extend(get_filenames([os.path.join(base_path, 'expected_json',
                                           '*.expected_json')]))
    print('done')

    bar = ChargingBar('Processing files', max=len(fns))
    max_token_len = 0
    for fn in fns:
        token_len = len(read_file(fn).split())
        if token_len > max_token_len:
            max_token_len = token_len
        bar.next()
    bar.finish()

    with open(os.path.join(base_path, 'max_token_len'), 'w') as f:
        f.write(f'max_token_len: {max_token_len}')
def check_hand_created_samples():
    result = True
    for samples_dir, input_name in zip(
            [text_samples_dir(), html_samples_dir()],
            ['text_input', 'html_input']):
        data_filenames = sorted(get_filenames(samples_dir, input_name, '*'))
        json_filenames = sorted(get_filenames(samples_dir, 'json_input', '*'))
        for d_fn, j_fn in zip(data_filenames, json_filenames):
            print(f'Checking:\n {d_fn}\n {j_fn}\n')
            input_data = read_file(d_fn)
            json_input_data = get_json_from_file(j_fn)
            if not data_contains_all_elements(input_data, json_input_data):
                print(f'Errors found in:\n input: {d_fn}\n'
                      f' json_input: {j_fn}')
                result = False
    return result
def matching_filenames(saved_filenames_path,
                       all_filename_paths,
                       filename_type=0,
                       selector_weights=[VALIDATION_FILE_PERCENT,
                                         TEST_FILE_PERCENT]):
    '''
    selector_weights: For training, the selector weights should be
    [100, 0], so that all files are used for training. The training
    files are not the original ones - each is generated.

    For validation/testing, the selector weights should be [80, 20]:
    validate on 80% of the actual files and test on the remaining 20%.
    '''
    init_rng()  # Initialize the random number generator.
    try:
        names = get_json_from_file(saved_filenames_path)
        # Regenerate the filenames list if the saved selectors do not
        # cover the filename type that was passed in.
        if not selectors_contain_filename_type(names['selectors'],
                                               filename_type):
            raise FileNotFoundError
        return select_filenames(names['filenames'], names['selectors'],
                                filename_type)
    except FileNotFoundError:
        all_filenames = []
        for paths in all_filename_paths:
            all_filenames.extend(get_filenames(paths))

        # Some of our directories contain files which have already been
        # processed. Ignore those files by filtering them out.
        all_filenames = [
            fn for fn in all_filenames
            if fn.endswith(('html', 'json', 'expected_json',
                            'table-extracted', 'unescaped'))
        ]
        all_filenames.sort()

        if filename_type == FILETYPE_TRAINING:
            selectors = training_selectors(len(all_filenames))
        else:
            selectors = validation_test_selectors(len(all_filenames),
                                                  selector_weights)
        names = {
            'filename_type': filename_type,
            'filenames': all_filenames,
            'selectors': selectors
        }
        write_json_to_file(saved_filenames_path, names)
        return select_filenames(names['filenames'], names['selectors'],
                                filename_type)
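# Usage sketch: the paths below and the filename_type value 1 are
# illustrative assumptions (only FILETYPE_TRAINING appears in this module).
def _example_matching_filenames_usage():
    # Training: weights [100, 0] keep every file.
    train_fns = matching_filenames('saved_filenames.json',
                                   [['data/html/*.html']],
                                   FILETYPE_TRAINING,
                                   selector_weights=[100, 0])
    # Validation/testing: the default weights split the actual files 80/20.
    val_test_fns = matching_filenames('saved_filenames.json',
                                      [['data/html/*.html']],
                                      filename_type=1)
    return train_fns, val_test_fns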
def generate_samples():
    create_dirs([os.path.join(generated_data_dir(), 'html'),
                 os.path.join(generated_data_dir(), 'expected_json'),
                 os.path.join(generated_data_dir(), 'input')])
    data_filenames = []
    for samples_dir in [generated_html_json_dir()]:
        sorted_files = sorted(get_filenames([os.path.join(samples_dir,
                                                          '*')]))
        sorted_files = [fn for fn in sorted_files
                        if fn.endswith('unescaped')]
        data_filenames.extend(sorted_files)
    generate_random_text(data_filenames, NUMBER_OF_OUTPUT_FILES)
def clean_all_tables(input_paths):
    for filename in get_filenames(input_paths):
        prefix = filename.split(os.sep)[-1].split('.')[0]
        out_filename = os.path.join(generated_html_json_dir(),
                                    prefix + '.cleaned')
        print(f'filename: {filename}')
        table_tag = BeautifulSoup(read_file(filename), 'html.parser')
        remove_tags(table_tag)
        ensure_dir_exists(os.path.dirname(out_filename))
        write_file(out_filename, table_tag.prettify())
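# Hedged sketch, assuming remove_tags strips presentational markup while
# keeping table structure; the real helper lives elsewhere in this project.
def _example_remove_tags(table_tag):
    for tag in table_tag.find_all(['font', 'b', 'i', 'span']):
        tag.unwrap()  # keep the tag's text, drop the styling wrapper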
def test_set_max_token_len():
    print('Getting filenames ...', end=' ')
    base_path = tables_extracted_split_tables_dir()
    fns = list(get_filenames([os.path.join(base_path, '*', '10-k', '*',
                                           '*.table-extracted')]))
    print('done.')

    bar = ChargingBar('Processing files', max=len(fns))
    max_token_len = 0
    for fn in fns:
        token_len = len(read_file(fn).split())
        if token_len > max_token_len:
            max_token_len = token_len
        bar.next()
    bar.finish()

    with open(os.path.join(base_path, 'max_token_len'), 'w') as f:
        f.write(f'max_token_len: {max_token_len}')
def find_unprocessed_tag_names():
    unprocessed_tags = set()
    unprocessed_tags_exist = False
    for filename in get_filenames(extracted_tables_dir(), '*', '10-k', '*',
                                  '*', '*'):
        table_tag = BeautifulSoup(read_file(filename), 'html.parser')
        descendant_tag_names = find_descendant_tag_names(
            table_tag.descendants)
        diff = descendant_tag_names - set(tag_actions.keys())
        unprocessed_tags.update(diff)
        if diff:
            unprocessed_tags_exist = True
            print(f'filename: {filename}')
            print(f'unprocessed_tags: {unprocessed_tags}')

    if unprocessed_tags_exist:
        print(f'unprocessed_tags: {unprocessed_tags}')
    else:
        print('No unprocessed tags found')
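# Hedged sketch of the helper assumed above: collect the tag names among a
# tag's descendants (bs4 NavigableString nodes have .name == None).
def _example_find_descendant_tag_names(descendants):
    return {d.name for d in descendants if d.name is not None}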
def html_to_json():
    output_dirname = generated_html_json_dir()
    os.makedirs(output_dirname, exist_ok=True)
    result_string = ''
    num_all_files = 0
    num_files_processed = 0
    for full_filepath in get_filenames(html_samples_dir(), 'html_input',
                                       '*'):
        filename = full_filepath.split(os.sep)[-1].lower()
        if not filename.endswith('table-extracted'):
            continue

        print(f'{num_all_files}: full_filepath: {full_filepath}')
        result_string += full_filepath + '\n'
        num_all_files += 1
        html_to_image(full_filepath)
        json_data, error_str = image_to_json('out.png')
        if json_data is None:
            # Record the error returned for this file.
            result_string += error_str + '\n\n'
        else:
            num_files_processed += 1
            output_filename = os.path.join(output_dirname,
                                           filename.split('.')[0] + '.json')
            print(f'output_filename: {output_filename}')
            write_json_to_file(output_filename, json_data)
            output_html_filename = os.path.join(output_dirname, filename)
            copy_file(full_filepath, output_html_filename)

    # Guard against division by zero when no files matched.
    success_ratio = (num_files_processed / num_all_files) \
        if num_all_files else 0.0
    result_stats = f'num_files_processed: {num_files_processed}\n' \
                   f'num_all_files: {num_all_files}\n' \
                   f'success ratio: {success_ratio}\n'
    print(result_stats)
    result_string += result_stats
    write_file(os.path.join(output_dirname,
                            'html_to_json_processing_results'),
               result_string)
def tokenize_training_set(generate=False):
    # 'generate' was previously read from an enclosing scope; it is now an
    # explicit parameter. When True, (html, json) file pairs are sampled
    # from the generated corpus instead of read from the training set.
    def update_max_token_len(html_text, json_text, max_len):
        html_len, json_len = len(html_text.split()), len(json_text.split())
        return max(html_len, json_len, max_len)

    input_path = generated_data_dir()
    output_path = tokenized_dir()
    create_dirs(output_path)

    if generate is True:
        input_fns = list(
            get_filenames([os.path.join(generated_html_json_dir(),
                                        '*.unescaped')]))
        html_fns, json_fns = [], []
        for _ in range(NUMBER_OF_OUTPUTS):
            html_fn = np.random.choice(input_fns)
            fn_parts = html_fn.split(os.sep)
            fn_prefix = fn_parts[-1].split('.')[0]
            json_fn = os.sep + os.path.join(*fn_parts[:-1],
                                            fn_prefix + '.json')
            html_fns.append(html_fn)
            json_fns.append(json_fn)
        combined_fns = zip(html_fns, json_fns)
    else:
        combined_fns = zip(
            get_filenames([os.path.join(input_path, 'html',
                                        '*.unescaped')]),
            get_filenames([os.path.join(input_path, 'expected_json',
                                        '*.expected_json')]))

    update_tokens = []
    separate_files = []
    tokens = set()
    max_token_len = 0

    def file_update(html_fn, html_tokens, json_fn, json_tokens,
                    update_type=SINGLE_FILE):
        if update_type == SINGLE_FILE:
            update_tokens.append(html_fn + '^' + html_tokens + '^' +
                                 json_fn + '^' + json_tokens)
        else:
            # Multiple files created - one for each pair of
            # (html, json) input files.
            update_tokens.append((html_fn, json_fn))
            create_dirs(os.path.join(output_path, 'separate_files'))
            output_html_fn = os.path.join(
                output_path, 'separate_files',
                html_fn.split(os.sep)[-1] + '.tokenized')
            output_json_fn = os.path.join(
                output_path, 'separate_files',
                json_fn.split(os.sep)[-1] + '.tokenized')
            separate_files.append(output_html_fn + '^' + output_json_fn)
            write_file(output_html_fn, html_tokens)
            write_file(output_json_fn, json_tokens)

    def file_flush(update_type):
        if update_type == SINGLE_FILE:
            write_file(os.path.join(output_path, 'tokenized'),
                       '\n'.join(update_tokens))
        else:
            write_file(os.path.join(output_path, 'separate_files',
                                    'file_list'),
                       '\n'.join(separate_files))

    for html_fn, json_fn in combined_fns:
        print(f'html_fn: {html_fn}')
        print(f'json_fn: {json_fn}')
        html_tokens, json_tokens = tokenize_html_json(html_fn, json_fn,
                                                      generate=generate)
        html_tokens = ' '.join(html_tokens).replace("'", "")
        json_tokens = ' '.join(json_tokens).replace("'", "")
        # Remove the json string's quotes at the beginning and end.
        json_tokens = json_tokens[2:len(json_tokens) - 2]
        max_token_len = update_max_token_len(html_tokens, json_tokens,
                                             max_token_len)
        tokens.update(html_tokens.split())
        tokens.update(json_tokens.split())
        file_update(html_fn, html_tokens, json_fn, json_tokens,
                    update_type=UPDATE_TYPE)

    file_flush(update_type=UPDATE_TYPE)

    # Prepend the special tokens to the sorted vocabulary.
    tokens = sorted(tokens)
    tokens.reverse()
    tokens.extend(['<sos>', '<pad>', '<eos>'])
    tokens.reverse()
    write_json_to_file(os.path.join(output_path, 'tokens'), tokens)

    with open(os.path.join(output_path, 'max_token_len'), 'w') as f:
        f.write(f'max_token_len: {max_token_len}')
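# Small illustration of the special-token ordering above: reversing,
# extending, and reversing again prepends the specials to the sorted list.
def _demo_special_token_order():
    tokens = sorted({'table', 'td', 'tr'})
    tokens.reverse()
    tokens.extend(['<sos>', '<pad>', '<eos>'])
    tokens.reverse()
    assert tokens == ['<eos>', '<pad>', '<sos>', 'table', 'td', 'tr']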
def all_encodings(filenames, base_dirname, tokens_path):
    # Since we're writing tokens to a file for each company, and later
    # merging these tokens, the token number must always keep
    # incrementing. This way, our dictionary with
    # (token_num: token_value) will not miss any tokens.
    tokens_dirname = os.path.dirname(tokens_path)
    out_dirname_json = os.path.join(tokens_dirname, 'expected_json',
                                    'encoded')
    out_dirname_html = os.path.join(tokens_dirname, 'html', 'encoded')
    create_dirs([out_dirname_json, out_dirname_html])

    current_company_dir = ''
    token_num = Number.START_WORD_NUM.value
    tokens = set()
    tokens_filename = ''
    for filename in filenames:
        print(f'filename: {filename}')
        text = read_file(filename)
        company_dir_idx = len(base_dirname)
        if base_dirname == generated_data_dir():
            company_dir = ''
        else:
            company_dir = filename[company_dir_idx + 1:].split(os.sep)[0]

        if current_company_dir != company_dir:
            if len(tokens) > 0:
                write_tokens_file(tokens, tokens_filename, token_num)
                token_num += len(tokens)
                tokens = set()
            current_company_dir = company_dir
        else:
            # We have to create this variable and assign to it, so that
            # the last filename is still available in the else clause of
            # this for statement.
            tokens_filename = get_tokens_filename(filename, company_dir_idx,
                                                  company_dir, 'tokens')

        if filename.endswith(('unescaped', 'html', 'table-extracted')):
            find_html_table_encodings(out_dirname_html, filename, text,
                                      tokens)
        elif filename.endswith('json'):
            find_json_encodings(out_dirname_json, filename, text, tokens)
    else:
        # Flush the tokens for the last company processed.
        write_tokens_file(tokens, tokens_filename, token_num)

    all_tokens_filename = os.path.join(base_dirname, 'tokens')
    all_tokens = set()
    for filename in get_filenames([tokens_path]):
        tokens = read_tokens_file(filename)
        all_tokens.update(get_token_values(tokens))
    print(f'len(all_tokens): {len(all_tokens)}')

    # We need to give the offset as the last value in this function call.
    # This allows us to interpret the value of 1 as the start of a number
    # sequence, and not confuse it with an entry in the tokens file that
    # has key = 1.
    write_tokens_file(all_tokens, all_tokens_filename,
                      Number.START_WORD_NUM.value)
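# Hedged sketch of the numbering scheme assumed above: write_tokens_file is
# expected to number tokens consecutively from the given offset, so the
# per-company files merge without ever reusing a token number.
def _illustrate_token_numbering(token_sets, start_num):
    numbering = {}
    next_num = start_num
    for token_set in token_sets:
        for value in sorted(token_set):
            numbering[next_num] = value
            next_num += 1
    return numbering
# _illustrate_token_numbering([{'td'}, {'tr'}], 2) -> {2: 'td', 3: 'tr'}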
def decode_validation_test_files():
    paths = os.path.join(cleaned_tags_dir(), '*', '10-k', '*', '*',
                         '*.encoded')
    tokens_path = os.path.join(cleaned_tags_dir(), 'tokens')
    decode_all_files(get_filenames(paths), tokens_path)