def predict_segment(self, corpus_segment):
    from data_utilities.util import get_mapping_after_normalization, convert_normalized_indices_to_unnormalized_indices
    norm_text = TextNormalizer.normalize_text(corpus_segment.language, corpus_segment.text)
    mentions = []
    for match in re.finditer(self.named_entity_regex, norm_text):
        mentions += [Mention(match.start(1), match.end(1), match.group(1), ref=corpus_segment.ref,
                             versionTitle=corpus_segment.versionTitle, language=corpus_segment.language)]
    # Mention indices refer to the normalized text; map them back to the raw text.
    mention_indices = [(mention.start, mention.end) for mention in mentions]
    norm_map = get_mapping_after_normalization(corpus_segment.text, TextNormalizer.find_text_to_remove)
    mention_indices = convert_normalized_indices_to_unnormalized_indices(mention_indices, norm_map)
    for mention, (unnorm_start, unnorm_end) in zip(mentions, mention_indices):
        mention.add_metadata(start=unnorm_start, end=unnorm_end)
    mentions = self.filter_already_found_mentions(mentions, corpus_segment.text, corpus_segment.language)
    return mentions
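
# A minimal, self-contained sketch of the normalize-then-map-back pattern used
# in predict_segment. The raw text, regex, and replacement below are invented
# for illustration; the two util functions are the real ones exercised by the
# tests further down.
def _demo_predict_pattern():
    import re
    from data_utilities.util import get_mapping_after_normalization, convert_normalized_indices_to_unnormalized_indices
    raw = "Rabbi<br>Yehuda says"  # hypothetical raw segment containing markup
    find_text_to_remove = lambda s: [(m, ' ') for m in re.finditer(r'<[^>]+>', s)]
    norm = re.sub(r'<[^>]+>', ' ', raw)  # "Rabbi Yehuda says"
    match = re.search(r'Rabbi Yehuda', norm)  # entity found in normalized text
    norm_map = get_mapping_after_normalization(raw, find_text_to_remove)
    (start, end), = convert_normalized_indices_to_unnormalized_indices([(match.start(), match.end())], norm_map)
    assert raw[start:end] == "Rabbi<br>Yehuda"  # offsets now point into the raw text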
def tag_index(self, index):
    from data_utilities.util import get_mapping_after_normalization, convert_normalized_indices_to_unnormalized_indices
    training = []
    mentions = []
    for seg in tqdm(index.all_segment_refs(), desc='Segs'):
        unnorm_text = seg.text('en').text
        norm_text = normalize_text('en', unnorm_text)
        entities = self.tag_segment(norm_text)
        # Entity indices refer to the normalized text; map them back to the raw text.
        ent_indices = [(ent[0], ent[1]) for ent in entities]
        norm_map = get_mapping_after_normalization(unnorm_text, find_text_to_remove)
        ent_indices = convert_normalized_indices_to_unnormalized_indices(ent_indices, norm_map)
        for ent, unnorm_index in zip(entities, ent_indices):
            ent[0] = unnorm_index[0]
            ent[1] = unnorm_index[1]
        spacy_entities = [e[:3] for e in entities]  # (start, end, label) triples for spaCy
        self.check_for_missing_entities(unnorm_text, spacy_entities)
        for ent in entities:
            mentions += [{
                "Book": index.title,
                "Ref": seg.normal(),
                "Bonayich ID": ent[4],
                "Slug": ent[3],
                "Start": ent[0],
                "End": ent[1],
                "Mention": unnorm_text[ent[0]:ent[1]]
            }]
        training += [[unnorm_text, {"entities": spacy_entities}]]
    return training, mentions
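
# A small sanity-check sketch for the mention rows tag_index produces (my
# addition, not part of the pipeline): after the index conversion above, each
# recorded span should slice the *unnormalized* text to exactly the stored
# "Mention" string. `text_by_ref` is a hypothetical {ref: raw text} lookup.
def validate_mentions(mentions, text_by_ref):
    for m in mentions:
        raw = text_by_ref[m["Ref"]]
        assert raw[m["Start"]:m["End"]] == m["Mention"], m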
def test_reverse(self):
    text = "a###b##c"
    find_text_to_remove = lambda x: [(m, ' ') for m in re.finditer(r'#+', x)]
    norm_text = "a b c"
    rm = util.get_mapping_after_normalization(text, find_text_to_remove, reverse=True)
    assert rm == {1: 2, 5: 3}
    norm_s, norm_e = util.convert_normalized_indices_to_unnormalized_indices([(4, 8)], rm, reverse=True)[0]
    assert norm_text[norm_s:norm_e] == "b c"
def test_with_larger_substitution(self):
    text = "a###b##c"
    # normalized text: "a----b----c"
    find_text_to_remove = lambda x: [(m, '----') for m in re.finditer(r'#+', x)]
    rm = util.get_mapping_after_normalization(text, find_text_to_remove)
    assert rm == {1: -1, 6: -3}
    norm_inds = (5, 11)
    unnorm_s, unnorm_e = util.convert_normalized_indices_to_unnormalized_indices([norm_inds], rm)[0]
    assert text[unnorm_s:unnorm_e] == "b##c"
def test_real_case(self):
    norm_mention = "Rabbi Yehuda"
    norm_regex = r"\s*<[^>]+>\s*"
    text = """The priest <b>would bring an earthenware</b> drinking <b>vessel [<i>peyalei</i>] and he would pour into it half a <i>log</i> of water from the basin</b> in the Temple. <b>Rabbi Yehuda says:</b> The priest would pour only <b>a quarter</b>-<i>log</i> of water. <b>Just as</b> Rabbi Yehuda <b>minimizes the writing,</b> as he requires"""
    norm_text = re.sub(norm_regex, ' ', text)
    find_text_to_remove = lambda x: [(m, ' ') for m in re.finditer(norm_regex, x)]
    rm = util.get_mapping_after_normalization(text, find_text_to_remove)
    norm_inds = (222, 234)
    assert norm_text[norm_inds[0]:norm_inds[1]] == norm_mention

    # include_trailing_chars == False
    unnorm_s, unnorm_e = util.convert_normalized_indices_to_unnormalized_indices([norm_inds], rm)[0]
    assert text[unnorm_s:unnorm_e] == norm_mention
    assert re.sub(norm_regex, '', text[unnorm_s:unnorm_e]) == norm_mention

    # prefix
    unnorm_s, unnorm_e = util.convert_normalized_indices_to_unnormalized_indices([(norm_inds[0], norm_inds[0])], rm)[0]
    assert text[unnorm_s:unnorm_e] == ""
def test_get_mapping_after_normalization():
    text = "a###b##c"
    # bare match objects (no explicit replacement) behave like deletion,
    # matching the result in test_simple_case
    find_text_to_remove = lambda x: re.finditer(r"#+", x)
    rm = util.get_mapping_after_normalization(text, find_text_to_remove)
    assert rm == {1: 3, 2: 5}
def create_html():
    tanakh_topics = {
        t.slug: t for t in Topic.init("biblical-figures").topics_by_link_type_recursively(only_leaves=True)
    }
    with open("research/knowledge_graph/named_entity_recognition/tanakh_mentions.json", "r") as fin:
        mentions_by_ref = json.load(fin)
    text_by_ref = {}
    all_refs = []
    for index in library.get_indexes_in_category("Tanakh", full_records=True):
        he = Ref(index.title).text("he").text
        en = Ref(index.title).text("en", vtitle="Tanakh: The Holy Scriptures, published by JPS").text
        all_refs += [r.normal() for r in index.all_segment_refs()]
        for iperek, perek in enumerate(he):
            for ipasuk, he_pasuk in enumerate(perek):
                ref = f"{index.title} {iperek+1}:{ipasuk+1}"
                en_pasuk = en[iperek][ipasuk]
                text_by_ref[ref] = {"he": he_pasuk, "en": en_pasuk}
    html = """
<html>
  <head>
    <style>
      body {
        width: 700px;
        margin-right: auto;
        margin-bottom: 50px;
        margin-top: 50px;
        margin-left: auto;
      }
      .he { direction: rtl; }
      .missing { color: red; }
      .found { color: green; }
    </style>
  </head>
  <body>
"""
    he_mentions = []
    missing_html = html[:]
    he_found = 0
    en_found = 0
    missing_rows = 0
    for ref in all_refs:
        temp_mentions = []
        mentions = mentions_by_ref.get(ref, [])
        mentions.sort(key=lambda x: len(x['Form']), reverse=True)
        # deduplicate by topic string (the last, i.e. shortest, form per topic wins)
        mentions = {m["Topics"]: m for m in mentions}.values()
        try:
            text = text_by_ref[ref]
        except KeyError:
            print("No", ref)
            continue  # skip refs with no loaded text instead of reusing a stale `text`
        he_text = unicodedata.normalize("NFKC", re.sub('־', ' ', strip_cantillation(text['he'])))
        en_text = text['en']
        for mention in mentions:
            if len(mention["Form"]) == 0:
                continue
            topics = mention["Topics"].split()
            form = unicodedata.normalize("NFKC", re.sub('־', ' ', mention["Form"]))
            if form not in he_text:
                print("Missing form", form, ref)
                continue
            for he_match in re.finditer(fr"(?:^|\s|׀|־|׃)({form})(?:$|\s|׀|־|׃)", he_text):
                temp_mentions += [{
                    "start": he_match.start(1),
                    "end": he_match.end(1),
                    "mention": form,
                    "id_matches": topics,
                    "ref": ref
                }]
            # disambiguated_topic = topics[0]  # TODO actually disambiguate
            # titles = sorted(tanakh_topics[disambiguated_topic].get_titles(lang='en', with_disambiguation=False), key=lambda x: len(x), reverse=True)
            # en_text = re.sub(fr'(?<!/)({"|".join(titles)})(?=[\s,.:;"\'’”()\[\]!?—\-<]|$)', fr'<a href="https://www.sefaria.org/topics/{disambiguated_topic}">\1</a>', en_text)
            # he_text = re.sub(fr'(?<!>)({mention["Form"]})', fr'<a href="https://www.sefaria.org/topics/{disambiguated_topic}">\1</a>', he_text)
            # he_found += he_text.count('<a ')
            # en_found += en_text.count('<a ')
        # mention indices were found in the normalized Hebrew; map them back to the raw text
        mention_indices = [(m["start"], m["end"]) for m in temp_mentions]
        norm_map = get_mapping_after_normalization(text['he'], find_text_to_remove)
        mention_indices = convert_normalized_indices_to_unnormalized_indices(mention_indices, norm_map)
        for m, (unnorm_start, unnorm_end) in zip(temp_mentions, mention_indices):
            m["start"] = unnorm_start
            m["end"] = unnorm_end
        he_mentions += temp_mentions
        he_html = add_html_links(temp_mentions, text['he'])
        row = f'''
        <p>{ref}</p>
        <p class="he">{he_html}</p>
        <p>{en_text}</p>
        '''
        if he_text.count('<a ') > en_text.count('<a '):
            missing_rows += 1
            missing_html += row
        html += row
    html += """
  </body>
</html>
"""
    missing_html += """
  </body>
</html>
"""
    print("HE", he_found)
    print("EN", en_found)
    print("MISSING", missing_rows)
    with open("research/knowledge_graph/named_entity_recognition/tanakh.html", "w") as fout:
        fout.write(html)
    with open("research/knowledge_graph/named_entity_recognition/tanakh_missing.html", "w") as fout:
        fout.write(missing_html)
    with open("research/knowledge_graph/named_entity_recognition/he_tanakh_mentions.json", "w") as fout:
        json.dump(he_mentions, fout, ensure_ascii=False, indent=2)
def test_with_substitution(self):
    text = "a###b##c"
    find_text_to_remove = lambda x: [(m, ' ') for m in re.finditer(r'#+', x)]
    rm = util.get_mapping_after_normalization(text, find_text_to_remove)
    assert rm == {1: 2, 3: 3}
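
# A small sketch unpacking the mapping semantics the assertion above relies on
# (inferred from these tests; the lookup rule is my interpretation, not a
# documented API): each key is an index in the normalized text where a
# removal/substitution happened, and each value is the offset to add back to
# normalized indices at or beyond that key.
def _demo_mapping_semantics():
    text = "a###b##c"   # normalizes to "a b c" under test_with_substitution
    rm = {1: 2, 3: 3}   # as asserted in test_with_substitution
    def to_unnorm(i):
        offsets = [v for k, v in sorted(rm.items()) if k <= i]
        return i + (offsets[-1] if offsets else 0)
    assert text[to_unnorm(2)] == "b"  # norm index 2 -> unnormalized 4
    assert text[to_unnorm(4)] == "c"  # norm index 4 -> unnormalized 7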
def test_simple_case(self):
    text = "a###b##c"
    find_text_to_remove = lambda x: [(m, '') for m in re.finditer(r'#+', x)]
    rm = util.get_mapping_after_normalization(text, find_text_to_remove)
    assert rm == {1: 3, 2: 5}