Example No. 1
    def predict_segment(self, corpus_segment):
        from data_utilities.util import get_mapping_after_normalization, convert_normalized_indices_to_unnormalized_indices

        norm_text = TextNormalizer.normalize_text(corpus_segment.language,
                                                  corpus_segment.text)
        mentions = []
        for match in re.finditer(self.named_entity_regex, norm_text):
            mentions += [
                Mention(match.start(1),
                        match.end(1),
                        match.group(1),
                        ref=corpus_segment.ref,
                        versionTitle=corpus_segment.versionTitle,
                        language=corpus_segment.language)
            ]
        mention_indices = [(mention.start, mention.end)
                           for mention in mentions]
        norm_map = get_mapping_after_normalization(
            corpus_segment.text, TextNormalizer.find_text_to_remove)
        mention_indices = convert_normalized_indices_to_unnormalized_indices(
            mention_indices, norm_map)
        for mention, (unnorm_start, unnorm_end) in zip(mentions,
                                                       mention_indices):
            mention.add_metadata(start=unnorm_start, end=unnorm_end)
        mentions = self.filter_already_found_mentions(mentions,
                                                      corpus_segment.text,
                                                      corpus_segment.language)
        return mentions

    def tag_index(self, index):
        from data_utilities.util import get_mapping_after_normalization, convert_normalized_indices_to_unnormalized_indices
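        # normalize_text, find_text_to_remove, and tqdm are assumed to be
        # module-level imports in the original script; they are not shown here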
        training = []
        mentions = []
        for seg in tqdm(index.all_segment_refs(), desc='Segs'):
            unnorm_text = seg.text('en').text
            norm_text = normalize_text('en', unnorm_text)

            entities = self.tag_segment(norm_text)
            ent_indices = [(ent[0], ent[1]) for ent in entities]
            norm_map = get_mapping_after_normalization(unnorm_text,
                                                       find_text_to_remove)
            ent_indices = convert_normalized_indices_to_unnormalized_indices(
                ent_indices, norm_map)
            for ent, unnorm_index in zip(entities, ent_indices):
                ent[0] = unnorm_index[0]
                ent[1] = unnorm_index[1]
            spacy_entities = [e[:3] for e in entities]
            self.check_for_missing_entities(unnorm_text, spacy_entities)
            for ent in entities:
                mentions += [{
                    "Book": index.title,
                    "Ref": seg.normal(),
                    "Bonayich ID": ent[4],
                    "Slug": ent[3],
                    "Start": ent[0],
                    "End": ent[1],
                    "Mention": unnorm_text[ent[0]:ent[1]]
                }]
            training += [[unnorm_text, {"entities": spacy_entities}]]
        return training, mentions
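
Both functions above follow the same pattern: run a matcher over a normalized copy of the text, then translate the resulting offsets back onto the original, unnormalized string. Below is a minimal round-trip sketch of that pattern, reusing the toy data from the tests further down this page; the import path and call signatures are copied from the examples, and the expected slice is inferred from those tests rather than guaranteed here.

import re
from data_utilities.util import (get_mapping_after_normalization,
                                 convert_normalized_indices_to_unnormalized_indices)

text = "a###b##c"                  # original, unnormalized text
find_text_to_remove = lambda x: [(m, ' ') for m in re.finditer(r'#+', x)]
norm_text = "a b c"                # the text after each '#+' run collapses to ' '

# offsets are found against the normalized text; "b c" sits at (2, 5) there
norm_map = get_mapping_after_normalization(text, find_text_to_remove)
unnorm_start, unnorm_end = convert_normalized_indices_to_unnormalized_indices([(2, 5)], norm_map)[0]
print(text[unnorm_start:unnorm_end])   # expected: "b##c" (cf. the tests below)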
Example No. 3
    def test_reverse(self):
        text = "a###b##c"
        find_text_to_remove = lambda x: [(m, ' ') for m in re.finditer(r'#+', x)]
        norm_text = "a b c"
        rm = util.get_mapping_after_normalization(text, find_text_to_remove, reverse=True)
        assert rm == {1: 2, 5: 3}
        norm_s, norm_e = util.convert_normalized_indices_to_unnormalized_indices([(4, 8)], rm, reverse=True)[0]
        assert norm_text[norm_s:norm_e] == "b c"
Example No. 4
    def test_with_larger_substitution(self):
        text = "a###b##c"
        norm_text = "a----b----c"  # the text after each '#+' run is replaced with '----'
        find_text_to_remove = lambda x: [(m, '----') for m in re.finditer(r'#+', x)]
        rm = util.get_mapping_after_normalization(text, find_text_to_remove)
        assert rm == {1: -1, 6: -3}

        norm_inds = (5, 11)
        unnorm_s, unnorm_e = util.convert_normalized_indices_to_unnormalized_indices([norm_inds], rm)[0]
        assert text[unnorm_s:unnorm_e] == "b##c"
Example No. 5
    def test_real_case(self):
        norm_mention = "Rabbi Yehuda"
        norm_regex = r"\s*<[^>]+>\s*"
        text = """The priest <b>would bring an earthenware</b> drinking <b>vessel [<i>peyalei</i>] and he would pour into it half a <i>log</i> of water from the basin</b> in the Temple. <b>Rabbi Yehuda says:</b> The priest would pour only <b>a quarter</b>-<i>log</i> of water. <b>Just as</b> Rabbi Yehuda <b>minimizes the writing,</b> as he requires"""
        norm_text = re.sub(norm_regex, ' ', text)
        find_text_to_remove = lambda x: [(m, ' ') for m in re.finditer(norm_regex, x)]
        rm = util.get_mapping_after_normalization(text, find_text_to_remove)
        norm_inds = (222, 234)
        assert norm_text[norm_inds[0]:norm_inds[1]] == norm_mention

        # include_trailing_chars == False
        unnorm_s, unnorm_e = util.convert_normalized_indices_to_unnormalized_indices([norm_inds], rm)[0]
        assert text[unnorm_s:unnorm_e] == norm_mention
        assert re.sub(norm_regex, '', text[unnorm_s:unnorm_e]) == norm_mention

        # zero-length span at the mention's start maps to an empty string
        unnorm_s, unnorm_e = util.convert_normalized_indices_to_unnormalized_indices([(norm_inds[0], norm_inds[0])], rm)[0]
        assert text[unnorm_s:unnorm_e] == ""
Example No. 6
def test_get_mapping_after_normalization():
    text = "a###b##c"
    find_text_to_remove = lambda x: re.finditer(r"#+", x)
    rm = util.get_mapping_after_normalization(text, find_text_to_remove)
    assert rm == {1: 3, 2: 5}
Example No. 7
def create_html():
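    # NOTE: this snippet assumes module-level imports from its original script
    # (json, re, unicodedata, Ref, Topic, library, strip_cantillation,
    # add_html_links, find_text_to_remove, and the data_utilities.util mapping
    # helpers) that are not shown here.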
    tanakh_topics = {
        t.slug: t
        for t in Topic.init("biblical-figures").
        topics_by_link_type_recursively(only_leaves=True)
    }

    with open(
            "research/knowledge_graph/named_entity_recognition/tanakh_mentions.json",
            "r") as fin:
        mentions_by_ref = json.load(fin)
    text_by_ref = {}
    all_refs = []
    for index in library.get_indexes_in_category("Tanakh", full_records=True):
        he = Ref(index.title).text("he").text
        en = Ref(index.title).text(
            "en", vtitle="Tanakh: The Holy Scriptures, published by JPS").text
        all_refs += [r.normal() for r in index.all_segment_refs()]
        for iperek, perek in enumerate(he):
            for ipasuk, he_pasuk in enumerate(perek):
                ref = f"{index.title} {iperek+1}:{ipasuk+1}"
                en_pasuk = en[iperek][ipasuk]
                text_by_ref[ref] = {"he": he_pasuk, "en": en_pasuk}

    html = """
    <html>
        <head>
            <style>
                body {
                    width: 700px;
                    margin-right: auto;
                    margin-bottom: 50px;
                    margin-top: 50px;
                    margin-left: auto;
                }
                .he {
                    direction: rtl;
                }
                .missing {
                    color: red;
                }
                .found {
                    color: green;
                }
            </style>
        </head>
        <body>
    """
    he_mentions = []
    missing_html = html[:]
    he_found = 0
    en_found = 0
    missing_rows = 0

    for ref in all_refs:
        temp_mentions = []
        mentions = mentions_by_ref.get(ref, [])
        mentions.sort(key=lambda x: len(x['Form']), reverse=True)
        mentions = {m["Topics"]: m for m in mentions}.values()
        try:
            text = text_by_ref[ref]
        except KeyError:
            print("No text found for", ref)
            continue  # nothing to tag for this ref
        he_text = unicodedata.normalize(
            "NFKC", re.sub('־', ' ', strip_cantillation(text['he'])))
        en_text = text['en']
        for mention in mentions:
            if len(mention["Form"]) == 0:
                continue
            topics = mention["Topics"].split()
            form = unicodedata.normalize("NFKC",
                                         re.sub('־', ' ', mention["Form"]))
            if form not in he_text:
                print("Missing form", form, ref)
                continue
            for he_match in re.finditer(
                    fr"(?:^|\s|׀|־|׃)({form})(?:$|\s|׀|־|׃)", he_text):
                temp_mentions += [{
                    "start": he_match.start(1),
                    "end": he_match.end(1),
                    "mention": form,
                    "id_matches": topics,
                    "ref": ref
                }]

            # disambiguated_topic = topics[0]  # TODO actually disambiguate
            # titles = sorted(tanakh_topics[disambiguated_topic].get_titles(lang='en', with_disambiguation=False), key=lambda x: len(x), reverse=True)
            # en_text = re.sub(fr'(?<!/)({"|".join(titles)})(?=[\s,.:;"\'’”()\[\]!?—\-<]|$)', fr'<a href="https://www.sefaria.org/topics/{disambiguated_topic}">\1</a>', en_text)
            # he_text = re.sub(fr'(?<!>)({mention["Form"]})', fr'<a href="https://www.sefaria.org/topics/{disambiguated_topic}">\1</a>', he_text)
        # he_found += he_text.count('<a ')
        # en_found += en_text.count('<a ')
        mention_indices = [(m["start"], m["end"]) for m in temp_mentions]
        norm_map = get_mapping_after_normalization(text['he'],
                                                   find_text_to_remove)
        mention_indices = convert_normalized_indices_to_unnormalized_indices(
            mention_indices, norm_map)
        for m, (unnorm_start, unnorm_end) in zip(temp_mentions,
                                                 mention_indices):
            m["start"] = unnorm_start
            m["end"] = unnorm_end
        he_mentions += temp_mentions
        he_html = add_html_links(temp_mentions, text['he'])
        row = f'''
        <p>{ref}</p>
        <p class="he">{he_html}</p>
        <p>{en_text}</p>
        '''
        if he_html.count('<a ') > en_text.count('<a '):  # links are added to he_html, not he_text
            missing_rows += 1
            missing_html += row
        html += row
    html += """
        </body>
    </html>
    """
    missing_html += """
        </body>
    </html>
    """
    print("HE", he_found)
    print("EN", en_found)
    print("MISSING", missing_rows)
    with open("research/knowledge_graph/named_entity_recognition/tanakh.html",
              "w") as fout:
        fout.write(html)
    with open(
            "research/knowledge_graph/named_entity_recognition/tanakh_missing.html",
            "w") as fout:
        fout.write(missing_html)
    with open(
            "research/knowledge_graph/named_entity_recognition/he_tanakh_mentions.json",
            "w") as fout:
        json.dump(he_mentions, fout, ensure_ascii=False, indent=2)
Example No. 8
    def test_with_substitution(self):
        text = "a###b##c"
        find_text_to_remove = lambda x: [(m, ' ') for m in re.finditer(r'#+', x)]
        rm = util.get_mapping_after_normalization(text, find_text_to_remove)
        assert rm == {1: 2, 3: 3}
Example No. 9
    def test_simple_case(self):
        text = "a###b##c"
        find_text_to_remove = lambda x: [(m, '') for m in re.finditer(r'#+', x)]
        rm = util.get_mapping_after_normalization(text, find_text_to_remove)
        assert rm == {1: 3, 2: 5}
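
Taken together, the asserts above suggest that the mapping pairs positions in the normalized text with the cumulative number of characters removed up to that point, so converting an index amounts to adding the offset of the nearest mapped position at or before it. The following is a rough standalone sketch of that arithmetic, inferred only from the test data here; it is not the library's implementation and ignores the boundary handling hinted at by the include_trailing_chars comment in test_real_case.

def apply_offset(norm_index, norm_map):
    # add the offset recorded at the nearest mapped position at or before norm_index
    offsets = [off for pos, off in sorted(norm_map.items()) if pos <= norm_index]
    return norm_index + (offsets[-1] if offsets else 0)

norm_map = {1: 2, 3: 3}                # from test_with_substitution: "a###b##c" -> "a b c"
assert apply_offset(2, norm_map) == 4  # 'b' sits at index 4 in the original string
assert apply_offset(5, norm_map) == 8  # the end of the original string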