import html

def unescape_all_tables(search_path):
    # Replace HTML entities (&amp;, &nbsp;, ...) in every file found
    # under search_path, writing a sibling '.unescaped' file.
    for filename in get_filenames(search_path):
        print(f'Un-escaping file: {filename}')
        parts = filename.split('.')
        out_filename = '.'.join(parts[:-1]) + '.unescaped'
        converted = html.unescape(read_file(filename))
        write_file(out_filename, converted)

import csv

from tqdm import tqdm

def aggregate_sw_csvs(direc, output_name):
    # Build feature rows (X) and labels (y) from every CSV in direc,
    # then write them out as parallel files.
    csv_files = read_files_in_direc(direc, file_ext='.csv')
    X = []
    y = []
    prev_speaker = ''
    for f in tqdm(csv_files):
        with open(f, 'r') as readfile:
            csv_reader = csv.reader(readfile, delimiter=',')
            for idx, row in enumerate(csv_reader):
                if idx == 0:
                    continue  # skip the header row
                X_ele = filter_pos_words(row[9])
                if not X_ele:
                    continue
                # Feature: does the utterance start with an "I/" token?
                # The original tested X_ele[0].startswith("I/"), but a
                # single character can never match a two-character
                # prefix; test the string itself instead.
                X_start_with_I = '1' if X_ele.startswith('I/') else '0'
                # Feature: token count of the filtered utterance.
                X_len = str(len(X_ele.split(' ')))
                # Feature: same speaker as on the previous row?
                if idx > 1 and row[5] == prev_speaker:
                    X_speaker = '1'
                else:
                    X_speaker = '0'
                prev_speaker = row[5]
                X.append(' '.join([X_ele, X_start_with_I, X_len, X_speaker]))
                y.append(convert_sw_to_y(row[4]))
        # A blank line marks the boundary between input files.
        X.append('')
        y.append('')
    write_file(X, output_name + '_X.csv')
    write_file(y, output_name + '_y.csv')

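# A sketch of one emitted feature row. Assumption: filter_pos_words
# returns a space-separated word/POS string (its exact output format is
# not shown in this code). For X_ele == 'I/PRP think/VBP', spoken by
# the same speaker as the previous row, the appended row would be:
#
#   'I/PRP think/VBP 1 2 1'
#
# i.e. the filtered text, the starts-with-"I/" flag, the token count,
# and the same-speaker flag, in that order.
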
def write_dict(X, write_fn):
    # Collect the unique POS tokens across all lines, preserving
    # first-seen order, and write them out as a vocabulary file.
    pos_words = []
    for line in X:
        for pos in line:
            if pos not in pos_words:
                pos_words.append(pos)
    write_file(pos_words, write_fn)

import os

def file_flush(update_type):
    # Flush the module-level update_tokens / separate_files buffers to
    # disk in the layout selected by update_type.
    if update_type == SINGLE_FILE:
        write_file(os.path.join(output_path, 'tokenized'),
                   '\n'.join(update_tokens))
    else:
        write_file(
            os.path.join(output_path, 'separate_files', 'file_list'),
            '\n'.join(separate_files))

def generate_all(self, f_name, f_contents):
    """
    Given a function f_name : item -> string and a function
    f_contents : item -> bytestring, call them on each item to compute
    the output file name and the output file contents, then write the
    file out.
    """
    for x in self.all():
        write_file(self.output_filename(f_name(x)), f_contents(x))
    return self

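# A hedged usage sketch for generate_all. The item shape and the field
# names ('slug', 'body') are assumptions for illustration only:
#
#   site.generate_all(
#       f_name=lambda x: x['slug'] + '.html',
#       f_contents=lambda x: x['body'].encode('utf-8'))
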
def render_all_templates(self, **kwargs):
    """Render all items as templates."""
    for x in self.all():
        debug('render all templates: {}'.format(x))
        template = self.env.get_template(x)
        # Here we pass 2 extra parameters to the template:
        # - name     : the loading name of the template (e.g. base file name)
        # - filename : the filepath of the template in the filesystem (if any)
        html = template.render(name=template.name,
                               filename=template.filename,
                               **kwargs)
        write_file(self.output_filename(x),
                   html.encode(encoding='utf-8', errors='strict'))
    return self

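# A minimal usage sketch, assuming self.env is a jinja2.Environment;
# 'site_title' is a hypothetical template variable, not one the
# original defines:
#
#   renderer.render_all_templates(site_title='My Site')
#
# Inside a template, {{ name }} and {{ filename }} then refer to the
# template itself, alongside {{ site_title }}.
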
def remove_single_parens(search_path):
    for filename in get_filenames(search_path):
        print(f'Removing single parens from file: {filename}')
        parts = filename.split('.')
        out_filename = '.'.join(parts[:-1]) + '.remove-single-parens'
        top_tag = handle_single_parens(read_file(filename))
        write_file(out_filename, str(top_tag))

def on_train_end(self, logs=None):
    # Dump the metrics collected during training to a JSON results
    # file. (logs defaults to None rather than a mutable {} default.)
    data = {
        "losses": self.losses,
        "accuracies": self.accs,
        "val_losses": self.val_losses,
        "val_accuracies": self.val_accs,
        "val_f1s": self.val_f1s,
        "val_recalls": self.val_recalls,
        "val_precisions": self.val_precisions
    }
    write_file(data, f"../results/{model_name}-results.json", is_json=True)

import os

from bs4 import BeautifulSoup

def clean_all_tables(input_paths):
    for filename in get_filenames(input_paths):
        prefix = filename.split(os.sep)[-1].split('.')[0]
        out_filename = os.path.join(generated_html_json_dir(),
                                    prefix + '.cleaned')
        print(f'filename: {filename}')
        table_tag = BeautifulSoup(read_file(filename), 'html.parser')
        remove_tags(table_tag)
        out_dirname_parts = out_filename.split(os.sep)[:-1]
        ensure_dir_exists(os.path.join(os.sep, *out_dirname_parts))
        write_file(out_filename, table_tag.prettify())

def reset_next_build_numb(output):
    # Maintain a Jenkins-style nextBuildNumber file: read the current
    # index (defaulting to 1 when the file does not exist yet), then
    # store index + 1 for the next run.
    next_build_number = output + "/nextBuildNumber"
    index = 1
    data = "%d" % (index + 1)
    if not exists_path(next_build_number):
        make_nod(next_build_number)
    else:
        index = int(read_file(next_build_number)["data"])
        data = "%d" % (index + 1)
    write_file(next_build_number, data)
    out = output + "/%d" % index
    if not exists_path(output):
        mk_dirs(output)
    return (out, index)

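# A worked example of the bookkeeping above, assuming an empty output
# directory (the paths are illustrative):
#
#   reset_next_build_numb('/jobs/demo')
#   # creates /jobs/demo/nextBuildNumber containing "2"
#   # returns ('/jobs/demo/1', 1)
#
#   reset_next_build_numb('/jobs/demo')
#   # reads index 2, rewrites the file with "3"
#   # returns ('/jobs/demo/2', 2)
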
import os

def decode_file(filename, tokens):
    with open(filename, 'r') as f:
        numbers = list(map(int, f.read().split()))
    result = []
    idx = 0
    while idx < len(numbers):
        num = numbers[idx]
        if num == Number.PADDING.value:
            idx += 1
            continue
        if num == Number.START_SEQUENCE.value:
            try:
                is_negative, num, is_fraction, is_percent = \
                    numbers[idx+1], numbers[idx+2], numbers[idx+3], \
                    numbers[idx+4]
            except IndexError:
                # Truncated number sequence at the end of the file;
                # nothing further can be decoded. (This replaces a
                # leftover pdb.set_trace() debugging hook.)
                break
            if is_negative:
                num = -num
            if is_fraction:
                num = format(convert_whole_to_fraction(num), '.2f')
            else:
                num = str(num)
            if is_percent:
                num += '%'
            result.append(num)
            idx += 6  # TODO: this stride should be in NumberSequence?
            continue
        result.append(tokens[str(num)])
        idx += 1
    fn_parts = filename.split(os.sep)
    fn_prefix_index = fn_parts[-1].rfind('.')
    fn_prefix = fn_parts[-1][:fn_prefix_index]
    dir_name = os.path.join(os.sep.join(fn_parts[:-1]), 'decoded')
    create_dirs([dir_name])
    out_filename = os.path.join(dir_name, fn_prefix + '.decoded')
    write_file(out_filename, ' '.join(result))

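# A worked example of the decoding scheme, under two assumptions: the
# enum values and token table below are hypothetical, and the stride of
# 6 implies one trailing slot (e.g. an end-of-sequence marker) after
# the four flag/value fields. With tokens = {"7": "margin"},
# PADDING.value == 0 and START_SEQUENCE.value == 2:
#
#   input : 7 2 1 42 0 1 0
#   step 1: 7 is a plain token id       -> "margin"
#   step 2: 2 starts a number sequence  -> is_negative=1, num=42,
#           is_fraction=0, is_percent=1 -> "-42%"
#
#   output: "margin -42%"
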
def reset_last_status(result, output, index):
    # Record the index of this run under lastFail or lastPassed,
    # depending on whether any critical test failed.
    stats = result.statistics
    fail = stats.total.critical.failed
    last_fail = output + "/lastFail"
    last_passed = output + "/lastPassed"
    data = "%d" % index
    if fail != 0:
        if not exists_path(last_fail):
            make_nod(last_fail)
        write_file(last_fail, data)
    else:
        if not exists_path(last_passed):
            make_nod(last_passed)
        write_file(last_passed, data)

def __save(self, args):
    result = {"status": "success", "msg": "Save succeeded"}
    user_path = (self.app.config["AUTO_HOME"] +
                 "/workspace/%s%s" % (session["username"], args["path"]))
    if not write_file(user_path, args["data"]):
        result["status"] = "fail"
        result["msg"] = "Save failed"
    return result

import os
import traceback

def html_to_json():
    output_dirname = os.path.join(generated_html_json_dir())
    os.makedirs(output_dirname, exist_ok=True)
    result_string = ''
    num_all_files = 0
    num_files_processed = 0
    for full_filepath in get_filenames(html_samples_dir(), 'html_input', '*'):
        filename = full_filepath.split(os.sep)[-1].lower()
        if not filename.endswith('table-extracted'):
            continue
        print(f'{num_all_files}: full_filepath: {full_filepath}')
        result_string += full_filepath + '\n'
        num_all_files += 1
        html_to_image(full_filepath)
        json_data, error_str = image_to_json('out.png')
        if json_data is None:
            result_string += traceback.format_exc() + '\n\n'
        else:
            num_files_processed += 1
            output_filename = \
                os.path.join(output_dirname, filename.split('.')[0] + '.json')
            print(f'output_filename: {output_filename}')
            write_json_to_file(output_filename, json_data)
            output_html_filename = os.path.join(output_dirname, filename)
            copy_file(full_filepath, output_html_filename)
    # Guard against division by zero when no input files were found.
    ratio = num_files_processed / num_all_files if num_all_files else 0.0
    result_stats = f'num_files_processed: {num_files_processed}\n' \
                   f'num_all_files: {num_all_files}\n' \
                   f'success ratio: {ratio}\n'
    print(result_stats)
    result_string += result_stats
    write_file(os.path.join(output_dirname,
                            'html_to_json_processing_results'),
               result_string)

import os

def file_update(html_fn, html_tokens, json_fn, json_tokens,
                update_type=SINGLE_FILE):
    if update_type == SINGLE_FILE:
        update_tokens.append(html_fn + '^' + html_tokens +
                             '^' + json_fn + '^' + json_tokens)
    else:
        # Multiple files are created - one for each pair of
        # (html, json) input files.
        update_tokens.append((html_fn, json_fn))
        create_dirs(os.path.join(output_path, 'separate_files'))
        output_html_fn = os.path.join(
            output_path, 'separate_files',
            html_fn.split(os.sep)[-1] + '.tokenized')
        output_json_fn = os.path.join(
            output_path, 'separate_files',
            json_fn.split(os.sep)[-1] + '.tokenized')
        separate_files.append(output_html_fn + '^' + output_json_fn)
        write_file(output_html_fn, html_tokens)
        write_file(output_json_fn, json_tokens)

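# For reference, the SINGLE_FILE record appended above joins its four
# fields with '^' separators (paths and tokens are illustrative):
#
#   /data/1.html^<html tokens>^/data/1.json^<json tokens>
#
# In separate-files mode, each file_list entry instead pairs the two
# tokenized output paths:
#
#   .../separate_files/1.html.tokenized^.../separate_files/1.json.tokenized
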
def reset_last_status(self, index):
    stats = self.result.statistics
    fail = stats.total.critical.failed
    # NOTE: the original created a fresh threading.Lock() on every
    # call, which can never be contended and so synchronizes nothing.
    # To actually serialize writers, the lock must be shared; here we
    # assume one is created once in __init__ as self.lock.
    with self.lock:
        last_fail = self.output + "/lastFail"
        last_passed = self.output + "/lastPassed"
        data = "%d" % index
        if fail != 0:
            if not exists_path(last_fail):
                make_nod(last_fail)
            write_file(last_fail, data)
        else:
            if not exists_path(last_passed):
                make_nod(last_passed)
            write_file(last_passed, data)

import os

import numpy as np

def generate_random_text(input_filenames, num_output_files):
    # TODO: to be done again, as some of the numbers that should be
    # empty are 9's, even in the html page.
    for file_id in range(num_output_files):
        input_fn = np.random.choice(input_filenames)
        print('{:6d}: file: {}'.format(file_id, input_fn))
        fn_parts = input_fn.split(os.sep)
        fn_name = fn_parts[-1].split('.')
        fn_prefix, fn_type = fn_name[0], fn_name[1]
        json_input_fn = os.sep + os.path.join(*fn_parts[:-1],
                                              fn_prefix + '.json')
        json_generated_output_fn = os.path.join(
            generated_data_dir(), 'html', str(file_id) + '.' + fn_type)
        json_expected_output_fn = os.path.join(
            generated_data_dir(), 'expected_json',
            str(file_id) + '.expected_json')
        input_generated_fn = os.path.join(
            generated_data_dir(), 'input', str(file_id) + '.input')
        generated_input, json_expected = \
            generate_input(input_fn, fn_type, json_input_fn)
        write_file(json_generated_output_fn, generated_input)
        write_json_to_file(json_expected_output_fn, json_expected)
        copy_file(input_fn, input_generated_fn)

def __save(self, args):
    result = {"status": "success", "msg": "Success: save succeeded."}
    user_path = args["key"]
    if not write_file(user_path, args["data"]):
        result["status"] = "fail"
        result["msg"] = "Failure: save failed."
    if user_path.endswith('.robot'):
        self.app.config['DB'].refresh_caseinfo(user_path, 'force')
    self.app.config['DB'].insert_loginfo(session['username'], 'suite',
                                         'edit', user_path, result['status'])
    if user_path.endswith('.resource'):
        # Delete keywords or update highlighting for the resource file.
        update_resource(user_path)
    return result

def render(self, output_file, **kwargs):
    template = self.env.get_template(self.template_file)
    html = template.render(**kwargs)
    write_file(self.output_filename(output_file),
               html.encode(encoding='utf-8', errors='strict'))
    return self