def matching_filenames(saved_filenames_path,
                       all_filename_paths,
                       filename_type=0,
                       selector_weights=[VALIDATION_FILE_PERCENT,
                                         TEST_FILE_PERCENT]):
    '''
    selector_weights: For training, selector weights will be [100, 0].
    This is so we can use all the files for training. Our training files
    are not the original ones - each will be generated.

    For validation/testing, we want selector weights to be [80, 20].
    This means we will validate on 80% of our actual files, and test
    on 20%.
    '''
    init_rng()  # Initialize the random number generator.
    try:
        names = get_json_from_file(saved_filenames_path)
        # This will allow us to regenerate the filenames list
        # for the new filename type that is passed in.
        if not selectors_contain_filename_type(names['selectors'],
                                               filename_type):
            raise FileNotFoundError
        return select_filenames(names['filenames'], names['selectors'],
                                filename_type)
    except FileNotFoundError:
        all_filenames = []
        for paths in all_filename_paths:
            all_filenames.extend(get_filenames(paths))

        # Some of our directories will have files which have been processed.
        # Ignore those files by filtering them out.
        all_filenames = [
            fn for fn in all_filenames
            if fn.endswith(('html', 'json', 'expected_json',
                            'table-extracted', 'unescaped'))
        ]
        all_filenames.sort()

        if filename_type == FILETYPE_TRAINING:
            selectors = training_selectors(len(all_filenames))
        else:
            selectors = validation_test_selectors(len(all_filenames),
                                                  selector_weights)

        names = {
            'filename_type': filename_type,
            'filenames': all_filenames,
            'selectors': selectors
        }
        write_json_to_file(saved_filenames_path, names)
        return select_filenames(names['filenames'], names['selectors'],
                                filename_type)
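# A minimal usage sketch for matching_filenames. The cache path, glob
# pattern and [100, 0] weights below are illustrative, not values from
# this repo; FILETYPE_TRAINING is assumed to be a module-level constant.
def _example_matching_filenames():
    training_files = matching_filenames(
        saved_filenames_path='filenames_cache.json',  # hypothetical cache
        all_filename_paths=['data/html/*'],           # hypothetical glob
        filename_type=FILETYPE_TRAINING,
        selector_weights=[100, 0])  # every file goes to training
    print(f'{len(training_files)} training files selected')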
def create_tokens(self):
    lengths = np.random.randint(self.MIN_DATA_SIZE,
                                self.MAX_DATA_SIZE + 1,
                                self.NUM_TOKENS)
    all_tokens = ['<sos>', '<pad>', '<eos>']
    all_tokens.extend(self.special_tokens())
    all_tokens.extend(self.html_structure_tokens())
    all_tokens.extend(self.json_structure_tokens())
    all_tokens.extend([
        ''.join(np.random.choice(self.all_chars, length))
        for length in lengths
    ])
    all_tokens = [x.strip() for x in all_tokens]
    write_json_to_file(self.tokens_fn, all_tokens)
    return all_tokens
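# Sketch: turning the saved vocabulary into a token -> index lookup.
# Because create_tokens places <sos>, <pad>, <eos> first, their indices
# are fixed at 0, 1 and 2; tokens_fn is whatever self.tokens_fn was.
def _example_token_lookup(tokens_fn):
    tokens = get_json_from_file(tokens_fn)
    token_to_idx = {tok: idx for idx, tok in enumerate(tokens)}
    assert token_to_idx['<sos>'] == 0 and token_to_idx['<pad>'] == 1
    return token_to_idx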
def html_to_json():
    output_dirname = generated_html_json_dir()
    os.makedirs(output_dirname, exist_ok=True)

    result_string = ''
    num_all_files = 0
    num_files_processed = 0
    for full_filepath in get_filenames(html_samples_dir(), 'html_input', '*'):
        # full_filepath = './data/extract/samples/html/html_input/1.html'
        filename = full_filepath.split(os.sep)[-1].lower()
        if not filename.endswith('table-extracted'):
            continue

        print(f'{num_all_files}: full_filepath: {full_filepath}')
        result_string += full_filepath + '\n'
        num_all_files += 1

        html_to_image(full_filepath)
        json_data, error_str = image_to_json('out.png')
        if json_data is None:
            # Record the error reported by image_to_json; outside an
            # except block, traceback.format_exc() has nothing to format.
            result_string += error_str + '\n\n'
        else:
            num_files_processed += 1
            output_filename = os.path.join(output_dirname,
                                           filename.split('.')[0] + '.json')
            print(f'output_filename: {output_filename}')
            write_json_to_file(output_filename, json_data)
            output_html_filename = os.path.join(output_dirname, filename)
            copy_file(full_filepath, output_html_filename)

    # Guard against division by zero when no matching files were found.
    success_ratio = (num_files_processed / num_all_files
                     if num_all_files else 0.0)
    result_stats = f'num_files_processed: {num_files_processed}\n' \
                   f'num_all_files: {num_all_files}\n' \
                   f'success ratio: {success_ratio}\n'
    print(result_stats)
    result_string += result_stats
    write_file(
        os.path.join(output_dirname, 'html_to_json_processing_results'),
        result_string)
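# Sketch of the naming convention html_to_json relies on: a source file
# '12.table-extracted' yields '12.json' plus a copy of the html under
# its original name. The filename literal is illustrative only.
def _example_output_names(output_dirname, filename='12.table-extracted'):
    json_name = os.path.join(output_dirname, filename.split('.')[0] + '.json')
    html_name = os.path.join(output_dirname, filename)
    return json_name, html_name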
def generate_random_text(input_filenames, num_output_files):
    print('Getting set of all chars in data', end='')
    print(' ... done')
    for id in range(num_output_files):
        input_fn = np.random.choice(input_filenames)
        # input_fn = '/Volumes/datadrive/generated-html-json/0001035713_providian_financial_corp__10-k__2004-01-01_2004-12-31_10-k__tables-extracted_split-tables__24.unescaped'
        # To be done again as some of the numbers that should be empty
        # are 9's, even in the html page.
        print('{:6d}: file: {}'.format(id, input_fn))

        fn_parts = input_fn.split(os.sep)
        fn_name = fn_parts[-1].split('.')
        fn_prefix, fn_type = fn_name[0], fn_name[1]
        json_input_fn = os.sep + os.path.join(*fn_parts[:-1],
                                              fn_prefix + '.json')
        json_generated_output_fn = os.path.join(generated_data_dir(), 'html',
                                                str(id) + '.' + fn_type)
        json_expected_output_fn = os.path.join(generated_data_dir(),
                                               'expected_json',
                                               str(id) + '.expected_json')
        input_generated_fn = os.path.join(generated_data_dir(), 'input',
                                          str(id) + '.input')

        generated_input, json_expected = \
            generate_input(input_fn, fn_type, json_input_fn)
        write_file(json_generated_output_fn, generated_input)
        write_json_to_file(json_expected_output_fn, json_expected)
        copy_file(input_fn, input_generated_fn)
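# Usage sketch (assumes generated_data_dir() already contains the
# 'html', 'expected_json' and 'input' subdirectories; the glob and the
# count of 10 are illustrative).
def _example_generate_random_text():
    unescaped = list(get_filenames(
        [os.path.join(generated_html_json_dir(), '*.unescaped')]))
    generate_random_text(unescaped, num_output_files=10)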
def get_json_sequences(out_dirname, filename, json_text,
                       write_number_dict=True):
    token_seq = []
    word_num = Number.START_WORD_NUM.value
    number_dict = {}
    reverse_number_dict = {}

    matches = regex_words.findall(json_text)
    words = []
    for match in matches:
        if len(match.strip()) == 0:
            continue
        if is_number(match):
            is_negative, num_seq, is_percent = get_number(match)
            if num_seq is not False:
                words.append(
                    number_to_sequence(is_negative, num_seq, is_percent))
            else:
                raise ValueError(f'match: {match} is not a number')
        else:
            words.append(match)

    word_num = update_seq_and_number_dict(words, token_seq, word_num,
                                          number_dict, reverse_number_dict)
    if write_number_dict:
        write_json_to_file(
            os.path.join(out_dirname,
                         filename.split(os.sep)[-1] + '.nums'),
            convert_dict_values(number_dict))
    return token_seq, number_dict
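# Sketch: how a single token such as '(1,009)%' flows through the number
# path shared with get_html_sequences below. get_number and
# number_to_sequence are this repo's helpers; the literal is illustrative.
def _example_number_path(word='(1,009)%'):
    is_negative, num_seq, is_percent = get_number(word)
    if num_seq is not False:
        return number_to_sequence(is_negative, num_seq, is_percent)
    return None  # not a number; callers fall back to plain word handling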
def get_html_sequences(out_dirname, filename, top_tag,
                       write_number_dict=True):
    token_seq = []
    word_num = Number.START_WORD_NUM.value
    number_dict = {}
    reverse_number_dict = {}

    def recurse(tag):
        nonlocal token_seq, word_num

        if isinstance(tag, NavigableString):
            words = []
            # We need to split the tag first, because part of the tag
            # may have $1,009 for example. If we split using punctuation,
            # we will get two words with 1 and 9 (since we're converting
            # the 009 to a number). When we put it back we will get 19.
            # Instead, we split the tag using spaces first, then check
            # if it is a number (including characters $,.()%). We extract
            # that number (excluding $,()% characters) and write
            # it to our list for further processing.
            for word in tag.split():
                # We want to store numbers that we find within the
                # cells of the tables with their negative sign,
                # and their % sign. We're going to output unsigned
                # integers, so we create known numbers to denote
                # - and % and start/end sequence numbers for our
                # number sequence.
                is_negative, num_seq, is_percent = get_number(word)
                if num_seq is not False:
                    # We must append the tuple here.
                    # If we extend, each value in the tuple will be
                    # separately appended and we will lose the tuple.
                    words.append(
                        number_to_sequence(is_negative, num_seq, is_percent))
                else:
                    for x in split_using_punctuation(word):
                        words.append(x)
            word_num = update_seq_and_number_dict(words, token_seq, word_num,
                                                  number_dict,
                                                  reverse_number_dict)
        else:
            token_seq.append(tag.name.strip().lower())

            attr_names_values = []
            for name_or_value in get_attr_names_values(tag):
                for x in name_or_value.split():
                    attr_names_values.extend(split_using_punctuation(x))
            word_num = update_seq_and_number_dict(attr_names_values,
                                                  token_seq, word_num,
                                                  number_dict,
                                                  reverse_number_dict)

            for child in tag.children:
                recurse(child)
            token_seq.append('end_' + tag.name.strip().lower())
        return word_num

    recurse(top_tag)
    if write_number_dict:
        write_json_to_file(
            os.path.join(out_dirname,
                         filename.split(os.sep)[-1] + '.nums'),
            convert_dict_values(number_dict))
    return token_seq, number_dict
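# Usage sketch, assuming bs4 is installed (NavigableString above already
# comes from it). The html literal is illustrative.
def _example_html_sequences(out_dirname):
    from bs4 import BeautifulSoup
    soup = BeautifulSoup('<table><tr><td>$1,009</td></tr></table>',
                         'html.parser')
    return get_html_sequences(out_dirname, 'example.html', soup.table,
                              write_number_dict=False)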
def tokenize_training_set():
    def update_max_token_len(html, json, max_len):
        html_len, json_len = len(html.split()), len(json.split())
        return max(html_len, max(json_len, max_len))

    input_path = generated_data_dir()
    output_path = tokenized_dir()
    create_dirs(output_path)

    if generate is True:
        input_fns = list(
            get_filenames(
                [os.path.join(generated_html_json_dir(), '*.unescaped')]))
        html_fns, json_fns = [], []
        for id in range(NUMBER_OF_OUTPUTS):
            html_fn = np.random.choice(input_fns)
            fn_parts = html_fn.split(os.sep)
            fn_name = fn_parts[-1].split('.')
            fn_prefix, fn_type = fn_name[0], fn_name[1]
            json_fn = os.sep + os.path.join(*fn_parts[:-1],
                                            fn_prefix + '.json')
            html_fns.append(html_fn)
            json_fns.append(json_fn)
        combined_fns = zip(html_fns, json_fns)
    else:
        combined_fns = zip(
            list(
                get_filenames(
                    [os.path.join(input_path, 'html', '*.unescaped')])),
            list(
                get_filenames([
                    os.path.join(input_path, 'expected_json',
                                 '*.expected_json')
                ])))
    # print(f'combined_fns: {(list(combined_fns))[:2]}')

    update_tokens = []
    separate_files = []
    tokens = set()
    max_token_len = 0

    def file_update(html_fn, html_tokens, json_fn, json_tokens,
                    update_type=SINGLE_FILE):
        if update_type == SINGLE_FILE:
            update_tokens.append(html_fn + '^' + html_tokens +
                                 '^' + json_fn + '^' + json_tokens)
        else:
            # Multiple files created - one for each set
            # of (html, json) input files.
            update_tokens.append((html_fn, json_fn))
            create_dirs(os.path.join(output_path, 'separate_files'))
            output_html_fn = os.path.join(
                output_path, 'separate_files',
                html_fn.split(os.sep)[-1] + '.tokenized')
            output_json_fn = os.path.join(
                output_path, 'separate_files',
                json_fn.split(os.sep)[-1] + '.tokenized')
            separate_files.append(output_html_fn + '^' + output_json_fn)
            write_file(output_html_fn, html_tokens)
            write_file(output_json_fn, json_tokens)

    def file_flush(update_type):
        if update_type == SINGLE_FILE:
            write_file(os.path.join(output_path, 'tokenized'),
                       '\n'.join(update_tokens))
        else:
            write_file(
                os.path.join(output_path, 'separate_files', 'file_list'),
                '\n'.join(separate_files))

    for html_fn, json_fn in combined_fns:
        # html_fn = '/Volumes/Seagate/generated-data/html/0.unescaped'
        # json_fn = '/Volumes/Seagate/generated-data/expected_json/0.expected_json'
        print(f'html_fn: {html_fn}')
        print(f'json_fn: {json_fn}')
        html_tokens, json_tokens = tokenize_html_json(html_fn, json_fn,
                                                      generate=generate)
        html_tokens = ' '.join(html_tokens).replace("'", "")
        json_tokens = ' '.join(json_tokens).replace("'", "")
        # Remove the json string's quotes at the beginning and end.
        json_tokens = json_tokens[2:-2]

        max_token_len = update_max_token_len(html_tokens, json_tokens,
                                             max_token_len)
        tokens.update(html_tokens.split())
        tokens.update(json_tokens.split())
        file_update(html_fn, html_tokens, json_fn, json_tokens,
                    update_type=UPDATE_TYPE)

    file_flush(update_type=UPDATE_TYPE)

    # Put the special tokens at the front of the vocabulary so that
    # <sos>, <pad>, <eos> get indices 0, 1 and 2, matching create_tokens.
    tokens = ['<sos>', '<pad>', '<eos>'] + sorted(tokens)
    write_json_to_file(os.path.join(output_path, 'tokens'), tokens)
    with open(os.path.join(output_path, 'max_token_len'), 'w') as f:
        f.write(f'max_token_len: {max_token_len}')
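# Sketch: reading back the SINGLE_FILE output written by file_flush.
# Each line is html_fn^html_tokens^json_fn^json_tokens, so this only
# round-trips if '^' never appears inside the token streams.
def _example_read_tokenized(output_path):
    with open(os.path.join(output_path, 'tokenized')) as f:
        for line in f:
            html_fn, html_tokens, json_fn, json_tokens = \
                line.rstrip('\n').split('^')
            yield html_fn, html_tokens.split(), json_fn, json_tokens.split()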
import firebase_admin
from firebase_admin import credentials, firestore

###################################################
THEGAMEDATA_PATH = "thegamedata"

###################################################
cred = credentials.Certificate('firebase/sacckey.json')
default_app = firebase_admin.initialize_app(cred)
db = firestore.client()
print("firebase initialized", db)

###################################################
create_dir("backup", verbose=True)

print("retrieving doc")
gamedatacoll = db.collection(THEGAMEDATA_PATH)
thegamepgn_docref = gamedatacoll.document("pgn")
thegamepgn_dict = thegamepgn_docref.get().to_dict()

print("writing json")
write_json_to_file("backup/pgn.json", thegamepgn_dict)

print("setting backup in db")
thegamepgn_backup_docref = gamedatacoll.document("backuppgn")
thegamepgn_backup_docref.set(thegamepgn_dict)

print("backup done")
###################################################
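# Sketch of the reverse operation: restoring the live pgn document from
# the local json backup. Assumes get_json_from_file comes from the same
# utils module as write_json_to_file. Overwrites the live document.
def restore_pgn_from_backup():
    backup_dict = get_json_from_file("backup/pgn.json")
    db.collection(THEGAMEDATA_PATH).document("pgn").set(backup_dict)
###################################################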