def get_preview_page(g_body_page, n_body_page, g_durchen_page, n_durchen_page):
    """Build the preview text of one page by merging its footnotes into its body.

    The ``g_*`` / ``n_*`` prefixes presumably stand for the Google-OCR and
    Namsel-OCR versions of the same page -- TODO confirm against the callers.

    Args:
        g_body_page: body page object; its ``content`` and ``vol`` are read.
        n_body_page: body page object; its ``content`` is read.
        g_durchen_page: footnote page object; its ``content`` is read.
        n_durchen_page: footnote page object; its ``content`` is read.

    Returns:
        The second element of the pair returned by
        ``merge_footnotes_per_page``, or ``""`` when the footnote entry found
        for the page is empty.

    Raises:
        PageNumMissing: if the page number derived from the reconstructed body
            is not a key of the reconstructed footnotes.
    """
    g_body_page_content = g_body_page.content
    n_body_page_content = n_body_page.content
    g_durchen_page_content = g_durchen_page.content
    n_durchen_page_content = n_durchen_page.content
    vol_num = g_body_page.vol

    # Carry the "#" markers of the g body over to the n body, then drop them
    # from the g body so that only the n body keeps them.
    n_body_page_content = transfer(
        g_body_page_content,
        [["pedurma", "(#)"]],
        n_body_page_content,
        output="txt",
    )
    g_body_page_content = g_body_page_content.replace("#", "")

    body_result = reconstruct_body(n_body_page_content, g_body_page_content, vol_num)
    footnotes = reconstruct_footnote(
        n_durchen_page_content, g_durchen_page_content, vol_num
    )

    pg_num = get_page_num(body_result, vol_num)
    if pg_num not in footnotes:
        raise PageNumMissing

    cur_pg_footnotes = footnotes[pg_num]
    if not cur_pg_footnotes:
        return ""

    # Only the merged text is needed; the first element of the pair is unused.
    _, merge = merge_footnotes_per_page(body_result, cur_pg_footnotes)
    return merge
def reformat_text_with_note(original_text, text_with_note):
    """Re-apply the line breaks and pagination of ``original_text`` to ``text_with_note``.

    ``text_with_note`` is first passed through ``rm_annotations`` with the
    newline and ``[<digits><letter>]`` patterns, then ``transfer`` is asked to
    carry those same two annotation kinds over from ``original_text``.

    Args:
        original_text: text whose line breaks and pagination marks are the source.
        text_with_note: text that receives them.

    Returns:
        The value returned by ``transfer(..., output='txt')``.
    """
    # The bracket patterns are raw strings: "\[" is an invalid escape sequence
    # in a normal literal (SyntaxWarning on Python 3.12+). Their runtime value
    # is unchanged. The newline patterns stay non-raw on purpose so they keep
    # holding a real newline character, exactly as before.
    annotations = [
        ['line_break', '(\n)'],
        ['pagination', r'(\[[0-9]+[a-z]{1}\])'],
    ]
    text_with_note = rm_annotations(
        text_with_note, ['\n', r'\[[0-9]+[a-z]{1}\]']
    )
    original_text_with_note = transfer(
        original_text, annotations, text_with_note, output='txt'
    )
    return original_text_with_note
def test_transfer_hfml_tags():
    """Transfer the tags of three layers, one after another, onto one base text.

    Each layer carries a single tag (``<k1...>``, ``<au...>``, ``<g...g>``)
    and its text differs slightly from the base; after all three transfers
    the base must carry all three tags.
    """
    tagged_layers = (
        "<k1ཀཀཀཀ>\n ཁཁཁཁ",
        "ཀཀཀཀ\n <auཁཁཁཁ>",
        "ཀཀཀཀ\n ཁཁཁཁ\n <gགགགg>",
    )
    merged = "ཀཀཀ\n ཁཁཁ\n གགགག"

    # The output of one transfer is the target of the next.
    for tagged_layer in tagged_layers:
        merged = transfer(tagged_layer, HFML_ANN_PATTERN, merged, "txt")

    assert merged == "<k1ཀཀཀ>\n <auཁཁཁ>\n <gགགགགg>"
def text_with_google_line_break(text, g_text):
    """Give ``text`` the line breaks and pagination marks found in ``g_text``.

    ``text`` is first passed through ``rm_annotations`` with four patterns
    (newline, pagination mark, ``[word.digits]`` and ``{...}``); ``transfer``
    is then asked to carry newlines and pagination marks over from ``g_text``.

    Args:
        text: text to be cleaned and re-broken.
        g_text: text providing the line breaks and pagination marks
            (presumably the Google-OCR version -- TODO confirm).

    Returns:
        The value returned by ``transfer(..., output='txt')``.
    """
    # Patterns containing backslash escapes that Python does not recognise
    # ("\[", "\w", "\.", "\d", "\{", ...) are raw strings: in a normal literal
    # they are invalid escape sequences (SyntaxWarning on Python 3.12+). The
    # runtime value of each pattern is unchanged. The newline patterns stay
    # non-raw so they keep holding a real newline character.
    annotations = [
        ['line_break', '(\n)'],
        ['pagination', r'(\[[𰵀-]?[0-9]+[a-z]{1}\])'],
    ]
    g_annotations = [
        '\n',
        r'\[[𰵀-]?[0-9]+[a-z]{1}\]',
        r'\[\w+\.\d+\]',
        r'\{([𰵀-])?\w+\}',
    ]
    clean_text = rm_annotations(text, g_annotations)
    text_with_google_linebreak = transfer(
        g_text, annotations, clean_text, output='txt'
    )
    return text_with_google_linebreak
def _merge_layers_for_vol(self, base_vol_fn):
    """Merge all the annotation layers of one volume into its base layer.

    The base file is read, then for every layer name in ``self.layers[1:]``
    the file with the same name under ``self.layers_path / <layer>`` is
    transferred onto the running result. Layers that have no file for this
    volume are skipped. The result is written under
    ``self.merged_layers_path`` with every ``">>"`` replaced by ``">"``.

    Args:
        base_vol_fn: path object of the base-layer file of the volume; its
            ``name`` is reused to locate the matching files of other layers.
    """
    # Files are read and written explicitly as UTF-8 so the result does not
    # depend on the platform's default encoding.
    base_layer = base_vol_fn.read_text(encoding="utf-8")
    vol_fn = base_vol_fn.name

    for ann_layer_name in self.layers[1:]:
        ann_layer_vol_fn = self.layers_path / ann_layer_name / vol_fn
        if not ann_layer_vol_fn.is_file():
            continue
        ann_layer = ann_layer_vol_fn.read_text(encoding="utf-8")
        base_layer = transfer(ann_layer, HFML_ANN_PATTERN, base_layer, "txt")

    merged_layers_fn = self.merged_layers_path / vol_fn
    merged_layers_fn.write_text(base_layer.replace(">>", ">"), encoding="utf-8")
def _update_pars(source, target):
    """Re-insert the paragraph breaks of ``source`` into ``target`` and tidy up.

    Existing double newlines in ``target`` are flattened to spaces, the
    double newlines of ``source`` are transferred onto it, and a fixed
    sequence of clean-up substitutions is applied. The substitutions are
    order-dependent and run in the order written below.
    """
    flattened = target.replace('\n\n', ' ')
    par_pattern = [["pars", "(\n\n)"]]
    text = transfer(source, par_pattern, flattened, "txt")

    # Reinsert a space between a punctuation run and a following character
    # that is neither whitespace nor punctuation.
    text = re.sub(
        r'([!?”:;…,.»"]+?)([^ \f\v\u202f\u00a0\n!?”:;…,.»"])',
        r'\1 \2',
        text,
    )

    # A slash (plus spaces) landing right after a break goes back before it.
    text = re.sub(r'\n\n/ +', '/\n\n', text)
    # "/ /" before a break: the second slash opens the next paragraph instead.
    text = re.sub(r'/ /\n\n([^\n])', r'/\n\n/\1', text)
    # NOTE: a further step, replace(' /', '/'), was disabled in the original.

    # A closing quote (plus space) right after a break goes back before it.
    text = re.sub(r'\n\n” ', '”\n\n', text)

    # Drop spaces sitting directly after or before a newline.
    text = text.replace('\n ', '\n')
    text = text.replace(' \n', '\n')

    # A dash following punctuation and a space starts on its own line.
    text = re.sub(r'([!?”:;…,.»"]+?) —', r'\1\n—', text)

    # Single pass turning three consecutive newlines into two.
    return text.replace('\n\n\n', '\n\n')
def put_derge_line_break(preview_text, derge_text):
    """Apply the line breaks and page annotations of the derge text to the preview text.

    The values of ``preview_text`` are concatenated in iteration order after
    removing every ``<p...>`` tag; the values of ``derge_text`` are
    concatenated as they are. The concatenated preview is passed through
    ``rm_ann`` with the newline pattern, and ``transfer`` is then asked to
    carry newlines and page annotations over from the concatenated derge text.

    Args:
        preview_text: mapping whose values are the preview texts (the keys,
            named ``vol_id`` in the original loop, are not used).
        derge_text: mapping whose values are the derge texts.

    Returns:
        The value returned by ``transfer(..., output="txt")``.
    """
    # join() over the values replaces repeated "+=" in a loop and avoids
    # unpacking keys that were never used.
    collation_text = "".join(
        re.sub('<p.+?>', '', text) for text in preview_text.values()
    )
    full_derge_text = "".join(derge_text.values())

    anns = [
        r"\n",
    ]
    collation_text = rm_ann(collation_text, anns)
    collation_text_with_derge_linebr = transfer(
        full_derge_text,
        [["linebreak", r"(\n)"], ["pg_ann", r"(\[[𰵀-]?[0-9]+[a-z]{1}\])"]],
        collation_text,
        output="txt",
    )
    return collation_text_with_derge_linebr
def get_derge_google_text(derge_hfml, google_hfml):
    """Combine derge and google HFML page by page.

    ``derge_hfml`` is passed through ``rm_ann`` with three patterns and then
    receives the newlines and page annotations of ``google_hfml`` through
    ``transfer``. Both texts are split with ``get_pages`` and paired up; for
    each pair the google page is kept when ``is_note_page`` is true,
    otherwise the re-broken derge page is kept. ``zip`` stops at the shorter
    of the two page lists.

    Returns:
        The selected pages concatenated into one string.
    """
    strip_patterns = [r"\n", r"\[\w+\.\d+\]", r"\[[𰵀-]?[0-9]+[a-z]{1}\]"]
    layout_patterns = [
        ["linebreak", r"(\n)"],
        ["pg_ann", r"(\[[𰵀-]?[0-9]+[a-z]{1}\])"],
    ]

    stripped_derge = rm_ann(derge_hfml, strip_patterns)
    dg_body = transfer(google_hfml, layout_patterns, stripped_derge, output="txt")

    dg_pages = get_pages(dg_body)
    g_pages = get_pages(google_hfml)

    return "".join(
        g_page if is_note_page(g_page, dg_page) else dg_page
        for g_page, dg_page in zip(g_pages, dg_pages)
    )
def reconstruct_footnote(namsel_footnote, google_footnote, vol_num):
    """Reconstruct the footnotes of a page from its namsel and google versions.

    Diffs are obtained from ``transfer`` (``output="diff"``) using the
    marker, page-reference and pedurma-page patterns, then run through
    filtering, formatting, reformatting and post-processing.

    Returns:
        The value returned by ``postprocess_footnotes``.
    """
    marker_patterns = [
        ["marker", "(<u.+?>)"],
        ["marker", "([①-⑩])"],
        ["pg_ref", "(<r.+?>)"],
        ["pedurma-page", "(<p.+?>)"],
    ]

    print("Calculating diffs..")
    raw_diffs = transfer(
        namsel_footnote, marker_patterns, google_footnote, output="diff"
    )
    # Every diff is turned into a plain list before filtering.
    diff_rows = [list(diff) for diff in raw_diffs]

    kept_diffs = filter_footnotes_diffs(diff_rows, vol_num)
    footnote_text = format_diff(kept_diffs, vol_num, type_="footnotes")
    reformatted = reformat_footnotes(footnote_text)
    return postprocess_footnotes(reformatted, vol_num)
def _update_pars(source, target):
    """Transfer the triple-newline paragraph breaks of ``source`` onto ``target``."""
    par_pattern = [["pars", "(\n\n\n)"]]
    transferred = transfer(source, par_pattern, target, "txt")
    # Hack for a strange behaviour: four consecutive newlines are turned back
    # into three (single pass).
    return transferred.replace('\n\n\n\n', '\n\n\n')
def test_ann_transfer_optimized():
    """Run ``transfer`` once on the inputs from the ``get_*`` helpers.

    Nothing is asserted: the test only checks that the call completes.
    """
    source = get_source()
    pattern = get_pattern()
    target = get_target()
    transfer(source, pattern, target, "txt")
def test_ann_transfer_optimized(source_text, target_text, annotation_patterns, expected):
    """Transferring the annotations of ``source_text`` onto ``target_text`` yields ``expected``."""
    assert transfer(source_text, annotation_patterns, target_text, "txt") == expected
def flow(vol_path, source_path, target_path, text_type, image_info):
    """Reconstruct the body or the footnotes of one volume and write the result.

    Both input files are read as UTF-8; the source is treated as the namsel
    text and the target as the google text. All outputs go to
    ``vol_path / text_type``.

    - ``"body"``: diffs from ``get_diffs`` are dumped, filtered with
      ``filter_diffs``, dumped again, formatted with ``format_diff`` and
      ``reformatting_body``, then written to ``result.txt``.
    - ``"footnotes"``: the google text loses its OCR header, both texts are
      preprocessed, diffs come from ``transfer`` with the marker patterns,
      and are dumped, filtered, dumped again, formatted and reformatted. The
      post-processed footnotes are dumped through ``to_yaml`` and the
      reformatted text is written to ``result.txt``.
    - any other value only prints ``"Type not found"``.

    According to the original notes, the filtered diffs are applied to the
    namsel text with markers and image links are added at the end of each
    page; that work happens inside helpers not shown here.

    Args:
        vol_path (path): directory of the volume; outputs go to
            ``vol_path / text_type``.
        source_path (path): file read as the namsel text.
        target_path (path): file read as the google text.
        text_type (str): ``"body"`` or ``"footnotes"``.
        image_info (list): per the original notes, contains work_id, volume
            number and source image offset; index 1 is read as the volume
            number.
    """
    volume_no = image_info[1]
    namsel_text = source_path.read_text(encoding="utf-8")
    google_text = target_path.read_text(encoding="utf-8")

    # to_yaml specialised for each kind of payload written below.
    diffs_to_yaml = partial(to_yaml, type_="diffs")
    filtered_diffs_to_yaml = partial(to_yaml, type_="filtered_diffs")
    footnotes_to_yaml = partial(to_yaml, type_="footnotes")

    dir_path = vol_path / text_type
    # The filter/format helpers read these files back; presumably they are
    # the ones produced by the to_yaml calls -- TODO confirm.
    diffs_yaml_path = dir_path / "diffs.yaml"
    filtered_diffs_yaml_path = dir_path / "filtered_diffs.yaml"

    if text_type == "body":
        # NOTE: an earlier step transferring '#' google markers and page marks
        # onto the namsel text, and an rm_diff_tag pass, were disabled in the
        # original and are not performed.
        print("Calculating diffs...")
        diffs = get_diffs(namsel_text, google_text)
        diffs_list = list(map(list, diffs))
        diffs_to_yaml(diffs_list, dir_path)

        print("Filtering diffs...")
        filtered_diffs = filter_diffs(diffs_yaml_path, "body", image_info)
        filtered_diffs_to_yaml(filtered_diffs, dir_path)

        new_text = format_diff(filtered_diffs_yaml_path, image_info, type_="body")
        new_text = reformatting_body(new_text)
        (dir_path / "result.txt").write_text(new_text, encoding="utf-8")
    elif text_type == "footnotes":
        annotations = [
            ["marker", "(<m.+?>)"],
            ["marker", "([①-⑩])"],
            ["pg_ref", "(<r.+?>)"],
            ["pedurma_page", "(<p.+?>)"],
        ]
        google_text = rm_google_ocr_header(google_text)
        clean_google_text = preprocess_google_notes(google_text)
        clean_namsel_text = preprocess_namsel_notes(namsel_text)

        print("Calculating diffs..")
        # No output mode is passed, so transfer's default applies; the result
        # is consumed as an iterable of diffs.
        diffs = transfer(clean_namsel_text, annotations, clean_google_text)
        diffs_list = list(map(list, diffs))
        diffs_to_yaml(diffs_list, dir_path)

        filtered_diffs = filter_footnotes_diffs(diffs_yaml_path, volume_no)
        filtered_diffs_to_yaml(filtered_diffs, dir_path)

        new_text = format_diff(
            filtered_diffs_yaml_path, image_info, type_="footnotes"
        )
        reformatted_footnotes = reformat_footnotes(new_text)
        formatted_yaml = postprocess_footnotes(reformatted_footnotes)
        footnotes_to_yaml(formatted_yaml, dir_path)
        (dir_path / "result.txt").write_text(
            reformatted_footnotes, encoding="utf-8"
        )
    else:
        print("Type not found")
    print("Done")