def get_rabbi_char_loc(context, seg_text):
    matches = match_text(seg_text.split(), [context.replace('~', '')],
                         with_num_abbrevs=False,
                         place_all=True,
                         place_consecutively=True,
                         verbose=False)
    if matches["matches"][0][0] == -1:
        return None, None
    matched = matches["match_text"][0][0]
    count = seg_text.count(matched)
    if count == 0:
        return None, None
    if count > 1:
        # print(f"Context\n{context}\nappears {count} times!")
        return None, None
    rabbi = context.split('~')[1]
    rabbi_len = len(rabbi)
    context_start = seg_text.find(matched)
    if matched != context.replace('~', ''):
        # can't assume rabbi_start_rel is the same as it was in `context`
        rcount = matched.count(rabbi)
        if rcount == 0:
            print("NO_RABBI")
            return None, None
        if rcount > 1:
            print("TON_O_RABANAN")
            return None, None
        rabbi_start_rel = matched.find(rabbi)
    else:
        rabbi_start_rel = context.find('~')
    start = context_start + rabbi_start_rel
    end = start + rabbi_len
    return start, end
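# A minimal usage sketch for get_rabbi_char_loc above (hypothetical inputs; it
# assumes match_text from data_utilities.dibur_hamatchil_matcher resolves and
# places the context, as it does in the other snippets on this page). The
# context marks the rabbi's name between a pair of '~' delimiters, and the
# function returns that name's character span inside seg_text, or (None, None)
# when it cannot be placed unambiguously.
example_seg_text = "אמר רבא הלכה כוותיה דרב נחמן"
example_context = "אמר ~רבא~ הלכה"
start, end = get_rabbi_char_loc(example_context, example_seg_text)
if start is not None:
    print(example_seg_text[start:end])  # expected to print the delimited name: רבא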
def get_rabbi_char_loc_list(context_list, seg_text, norm_regex=None, repl=None, **match_text_kwargs):
    orig_seg_text = seg_text
    if norm_regex is not None:
        seg_text = re.sub(norm_regex, repl, seg_text)
    matches = match_text(seg_text.split(),
                         [context.replace('~', '') for context in context_list],
                         with_num_abbrevs=False,
                         place_all=True,
                         place_consecutively=True,
                         verbose=False,
                         max_overlap_percent=1.1,
                         **match_text_kwargs)
    rabbi_span_list = []
    for match_span, matched_text, context in zip(matches["matches"], matches["match_text"], context_list):
        rabbi_span_list += [get_rabbi_char_loc(match_span, matched_text, context, seg_text, orig_seg_text, norm_regex, repl)]
    return rabbi_span_list
Example #3
def create_section(oref, dicta_text, dicta_vtitle):
    with_nikkud = dicta_text.split()
    without_nikkud = strip_nikkud(dicta_text).split()
    sefaria_text = prepare_sefaria_text(oref)

    dh_match = match_text(without_nikkud, sefaria_text)

    matches = dh_match['matches']
    segments = oref.all_segment_refs()
    assert len(segments) == len(matches)

    for segment, match in zip(segments, matches):
        tc = segment.text('he', dicta_vtitle)
        new_segment_text = u' '.join(with_nikkud[match[0]:match[1] + 1])
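        # An empty join here usually means match_text returned (-1, -1) for this
        # segment; in that case keep the existing Davidson text for it.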
        if not new_segment_text:
            new_segment_text = segment.text('he', davidson_vtitle).text

        tc.text = new_segment_text
        tc.save()
def get_rabbi_char_loc(match_span, matched_text, context, seg_text, orig_seg_text, norm_regex, repl):
    from sefaria.helper.normalization import FunctionNormalizer
    from research.knowledge_graph.named_entity_recognition.ner_tagger import NormalizerTools
    if match_span[0] == -1:
        return None, None
    matched = matched_text[0]
    pre_rabbi, rabbi, _ = context.split('~')
    context_start = list(re.finditer(r'\S+', seg_text))[match_span[0]].start()
    if matched != context.replace('~', ''):
        # can't assume rabbi_start_rel is the same as it was in `context`
        word_b4_rabbi = pre_rabbi.split()[-1] + ' ' if len(pre_rabbi.strip()) > 0 else ''
        rabbi_matches = match_text(matched.split(), [word_b4_rabbi + rabbi],
                                   with_num_abbrevs=False,
                                   place_all=True,
                                   place_consecutively=True,
                                   verbose=False)
        rabbi_matched = rabbi_matches["match_text"][0][0]
        if rabbi_matched == '':
            return None, None
        rcount = matched.count(rabbi_matched)
        if rcount > 1:
            print("TON_O_RABANAN")
            return None, None
        rabbi_start_rel = matched.find(rabbi_matched)
        if len(word_b4_rabbi) > 0 and rabbi_matched.startswith(word_b4_rabbi):
            # first word is not part of rabbi abbreviation (unlike א"ר where first word should be considered part of the rabbi)
            # wait until now to remove word_b4_rabbi to reduce the rabbi's ambiguity in matched
            rabbi_matched = rabbi_matched[len(word_b4_rabbi):]
            rabbi_start_rel += len(word_b4_rabbi)
        rabbi_len = len(rabbi_matched)
    else:
        rabbi_start_rel = context.find('~')
        rabbi_len = len(rabbi)
    start = context_start + rabbi_start_rel
    end = start + rabbi_len
    if norm_regex is not None:
        def find_text_to_remove(s, **kwargs):
            return [(m, repl) for m in re.finditer(norm_regex, s)]
        normalizer = FunctionNormalizer(find_text_to_remove)
        norm_map = normalizer.get_mapping_after_normalization(orig_seg_text)
        mention_indices = normalizer.convert_normalized_indices_to_unnormalized_indices([(start, end)], norm_map)
        start, end = NormalizerTools.include_trailing_nikkud(mention_indices[0], orig_seg_text)
    return start, end
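# A usage sketch for get_rabbi_char_loc_list together with the version of
# get_rabbi_char_loc defined just above (hypothetical inputs; it assumes that
# re, match_text and the sefaria/research imports inside the function all
# resolve, as they would in the surrounding module). Each context marks one
# rabbi mention with '~' delimiters; the helper returns one (start, end)
# character span per context, in order, with (None, None) for mentions that
# could not be placed unambiguously.
contexts = ["אמר ~רבא~ הלכה", "דאמר ~רב נחמן~ אמר שמואל"]
segment_text = "אמר רבא הלכה כוותיה דאמר רב נחמן אמר שמואל הלכתא"
for context, (start, end) in zip(contexts, get_rabbi_char_loc_list(contexts, segment_text)):
    if start is not None:
        print(context, "->", segment_text[start:end])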
Example #5
def test_empty_comment(self):
    daftext = u'אע״ג שאמרו ככה בלה בלה בלה'.split()
    rashi = [u'', u'אף על גב שאמרו']
    matched = dhm.match_text(daftext, rashi, verbose=True)
Example #6
def match_cal_segments(mesechta):
    def tokenize_words(s):
        s = s.replace(u"־", " ")
        s = re.sub(r"</?.+>", "", s)  # get rid of html tags
        s = re.sub(r"\([^\(\)]+\)", "", s)  # get rid of refs
        #s = s.replace("'", '"')
        word_list = filter(bool, re.split(r"[\s\:\-\,\.\;\(\)\[\]\{\}]", s))
        return word_list

    def merge_cal_word_objs(s, e, word_obj_list):
        obj_list = word_obj_list[s:e]
        m_word = u" ".join([o["word"] for o in obj_list])
        m_head_word = u" ".join([o["head_word"] for o in obj_list])
        m_pos_list = [o["POS"] for o in obj_list]
        m_pos = max(set(m_pos_list), key=m_pos_list.count)
        new_obj = obj_list[0].copy()
        new_obj["word"] = m_word
        new_obj["head_word"] = m_head_word
        new_obj["POS"] = m_pos
        return [
            new_obj
        ]  #returns a single element array which will replace a range s:e in the original array

    cal_lines = json.load(open("cal_lines_{}.json".format(mesechta), "r"),
                          encoding="utf8")
    cal_pos_hashtable = json.load(open("cal_pos_hashtable.json", "r"),
                                  encoding='utf8')
    dafs = cal_lines["dafs"]
    lines_by_daf = cal_lines["lines"]

    super_base_ref = Ref(mesechta)
    subrefs = super_base_ref.all_subrefs()
    ical = 0

    num_sef_words = 0
    num_cal_words = 0
    num_words_matched = 0

    for curr_sef_ref in subrefs:
        if curr_sef_ref.is_empty(): continue
        if ical >= len(dafs): break

        daf = dafs[ical]
        print "-----{} DAF {}  ({}/{})-----".format(mesechta, daf, ical,
                                                    len(dafs))

        base_tc = TextChunk(curr_sef_ref, "he")
        bas_word_list = []  # re.split(r"\s+"," ".join(base_text.text))
        for segment in base_tc.text:
            bas_word_list += tokenize_words(segment)

        temp_out = [{"word": w, "class": "unknown"} for w in bas_word_list]

        lines = [[word_obj["word"] for word_obj in temp_line]
                 for temp_line in lines_by_daf[ical]]
        word_obj_list = [
            word_obj for temp_line in lines_by_daf[ical]
            for word_obj in temp_line
        ]
        lines_by_str = [u' '.join(line_array) for line_array in lines]

        curr_cal_ref = Ref("{} {}".format(mesechta, daf))

        out = []
        word_for_word_se = []
        cal_words = []
        missed_words = []

        global_offset = 0
        if curr_sef_ref == curr_cal_ref:
            matched = dibur_hamatchil_matcher.match_text(
                bas_word_list,
                lines_by_str,
                verbose=True,
                word_threshold=0.27,
                char_threshold=0.6,
                with_abbrev_matches=True,
                with_num_abbrevs=False)
            start_end_map = matched["matches"]
            abbrev_matches = matched["abbrevs"]
            abbrev_ranges = [[am.rashiRange for am in am_list]
                             for am_list in abbrev_matches]
            print u' --- '.join(
                [unicode(am) for am_list in abbrev_matches for am in am_list])
            abbrev_count = 0
            for ar in abbrev_ranges:
                abbrev_count += len(ar)
            #if abbrev_count > 0:
            #    print "GRESATLJL THNA DZEOR", abbrev_ranges
            for iline, se in enumerate(start_end_map):

                curr_cal_line = lines[iline]
                # if there is an expanded abbrev, concat those words into one element
                if len(abbrev_ranges[iline]) > 0:
                    offset = 0  # account for the fact that you're losing elements in the array as you merge them
                    abbrev_ranges[iline].sort(key=lambda x: x[0])
                    for ar in abbrev_ranges[iline]:
                        if ar[1] - ar[0] <= 0:
                            continue  # TODO: the abbrev matcher sometimes returns zero-length ranges; skip them for now

                        #redefine ar by how many actual words are in the range, not just how many elements
                        start_ar = ar[0]
                        i_abbrev = start_ar
                        num_words = 0
                        while i_abbrev < len(curr_cal_line):
                            temp_w = curr_cal_line[i_abbrev]
                            num_words += len(re.split(ur'\s+', temp_w))
                            if num_words >= (ar[1] - ar[0] + 1):
                                break
                            i_abbrev += 1
                        end_ar = i_abbrev

                        ar = (start_ar, end_ar)
                        if len(curr_cal_line[ar[0] - offset:ar[1] + 1 - offset]
                               ) != len(
                                   word_obj_list[ar[0] - offset +
                                                 len(cal_words):ar[1] + 1 -
                                                 offset + len(cal_words)]):
                            #something's wrong. not sure what, but best to ignore this
                            continue
                        print u"ABBREV RANGE {} --- OFFSET {}".format(
                            ar, offset)
                        print u"CURR CAL LINE BEFORE {}".format(u','.join(
                            curr_cal_line[ar[0] - offset:ar[1] + 1 - offset]))
                        curr_cal_line[ar[0] - offset:ar[1] + 1 - offset] = [
                            u' '.join(curr_cal_line[ar[0] - offset:ar[1] + 1 -
                                                    offset])
                        ]
                        print u"CURR CAL LINE AFTER {}".format(
                            curr_cal_line[ar[0] - offset])
                        print u"WORD OBJ LIST BEFORE {}".format(u','.join([
                            u'({})'.format(obj['word'])
                            for obj in merge_cal_word_objs(
                                ar[0] - offset + len(cal_words), ar[1] + 1 -
                                offset + len(cal_words), word_obj_list)
                        ]))
                        word_obj_list[ar[0] - offset + len(cal_words):ar[1] +
                                      1 - offset +
                                      len(cal_words)] = merge_cal_word_objs(
                                          ar[0] - offset + len(cal_words),
                                          ar[1] + 1 - offset + len(cal_words),
                                          word_obj_list)
                        print u"WORD OBJ LIST AFTER {}".format(
                            word_obj_list[ar[0] - offset +
                                          len(cal_words)]['word'])
                        offset += ar[1] - ar[0]
                        global_offset += offset

                cal_words += curr_cal_line
                if se[0] == -1:
                    word_for_word_se += [(-1, -1)
                                         for i in range(len(curr_cal_line))]
                    continue
                # matched_cal_objs_indexes = language_tools.match_segments_without_order(lines[iline],bas_word_list[se[0]:se[1]+1],2.0)
                curr_bas_line = bas_word_list[se[0]:se[1] + 1]
                #print u'base line',u' '.join(curr_bas_line)
                matched_obj_words_base = dibur_hamatchil_matcher.match_text(
                    curr_bas_line,
                    curr_cal_line,
                    char_threshold=0.35,
                    verbose=False,
                    with_num_abbrevs=False)
                matched_words_base = matched_obj_words_base["matches"]
                word_for_word_se += [(tse[0] + se[0],
                                      tse[1] + se[0]) if tse[0] != -1 else tse
                                     for tse in matched_words_base]

            matched_word_for_word_obj = dibur_hamatchil_matcher.match_text(
                bas_word_list,
                cal_words,
                char_threshold=0.35,
                prev_matched_results=word_for_word_se,
                boundaryFlexibility=2,
                with_num_abbrevs=False)
            matched_word_for_word = matched_word_for_word_obj["matches"]
            cal_len = len(matched_word_for_word)
            bad_word_offset = 0
            for ical_word, temp_se in enumerate(matched_word_for_word):
                if temp_se[0] == -1:
                    missed_words.append({
                        "word":
                        word_obj_list[ical_word]["word"],
                        "index":
                        ical_word
                    })
                    continue

                #dictionary juggling...
                for i in xrange(temp_se[0], temp_se[1] + 1):
                    # in case cal_words and word_obj_list aren't the same length because a word got split up
                    """
                    if cal_words[ical_word] != word_obj_list[ical_word-bad_word_offset]["word"]:
                        if ical_word+1 < len(cal_words) and cal_words[ical_word+1] != word_obj_list[ical_word-bad_word_offset+1]["word"]:
                            bad_word_offset += 1
                        continue
                    """
                    cal_word_obj = word_obj_list[ical_word].copy()
                    cal_word_obj["cal_word"] = cal_word_obj["word"]
                    temp_sef_word = temp_out[i]["word"]
                    temp_out[i] = cal_word_obj
                    temp_out[i]["class"] = "talmud"
                    temp_out[i]["word"] = temp_sef_word

            print u"\n-----\nFOUND {}/{} ({}%)".format(
                cal_len - len(missed_words), cal_len,
                (1 - round(1.0 * len(missed_words) / cal_len, 4)) * 100)
            #print u"MISSED: {}".format(u" ,".join([u"{}:{}".format(wo["word"], wo["index"]) for wo in missed_words]))
            ical += 1
            num_cal_words += cal_len
            num_words_matched += (cal_len - len(missed_words))
        """
        #tag 1 pos words if still untagged
        for iwo,word_obj in enumerate(temp_out):
            word = word_obj["word"]
            if word in cal_pos_hashtable:
                if len(cal_pos_hashtable[word]) == 1:
                    temp_out[iwo] = {"word":word,"cal_word":word,"class":"talmud","POS":cal_pos_hashtable[word][0]}
        """

        num_sef_words += len(temp_out)

        out += temp_out

        sef_daf = curr_sef_ref.__str__().replace("{} ".format(mesechta),
                                                 "").encode('utf8')
        doc = {"words": out, "missed_words": missed_words}
        fp = codecs.open(
            "cal_matcher_output/{}/lang_naive_talmud/lang_naive_talmud_{}.json"
            .format(mesechta, sef_daf),
            "w",
            encoding='utf-8')
        json.dump(doc, fp, indent=4, encoding='utf-8', ensure_ascii=False)
        fp.close()

    return num_sef_words, num_cal_words, num_words_matched
Example #7
def match_cal_segments(mesechta):
    def tokenize_words(s):
        s = s.replace(u"־", " ")
        s = re.sub(r"</?.+>", "", s)  # get rid of html tags
        s = re.sub(r"\([^\(\)]+\)", "", s)  # get rid of refs
        s = s.replace("'", '"')
        word_list = filter(bool, re.split(r"[\s\:\-\,\.\;\(\)\[\]\{\}]", s))
        return word_list

    def merge_cal_word_objs(s,e,word_obj_list):
        obj_list = word_obj_list[s:e]
        m_word = u" ".join([o["word"] for o in obj_list])
        m_head_word = u" ".join([o["head_word"] for o in obj_list])
        m_pos_list = [o["POS"] for o in obj_list]
        m_pos = max(set(m_pos_list), key=m_pos_list.count)
        new_obj = obj_list[0].copy()
        new_obj["word"] = m_word
        new_obj["head_word"] = m_head_word
        new_obj["POS"] = m_pos
        return [new_obj] #returns a single element array which will replace a range s:e in the original array

    cal_lines = json.load(open("cal_lines_{}.json".format(mesechta), "r"), encoding="utf8")
    cal_pos_hashtable = json.load(open("cal_pos_hashtable.json","r"),encoding='utf8')
    dafs = cal_lines["dafs"]
    lines_by_daf = cal_lines["lines"]

    super_base_ref = Ref(mesechta)
    subrefs = super_base_ref.all_subrefs()
    ical = 0

    for curr_sef_ref in subrefs:
        if curr_sef_ref.is_empty(): continue
        if ical >= len(dafs): break


        daf = dafs[ical]
        print "-----{} DAF {}  ({}/{})-----".format(mesechta,daf,ical,len(dafs))


        base_tc = TextChunk(curr_sef_ref, "he")
        bas_word_list = []  # re.split(r"\s+"," ".join(base_text.text))
        for segment in base_tc.text:
            bas_word_list += tokenize_words(segment)

        temp_out = [{"word": w, "class": "unknown"} for w in bas_word_list]



        lines = [[word_obj["word"] for word_obj in temp_line] for temp_line in lines_by_daf[ical]]
        word_obj_list = [word_obj for temp_line in lines_by_daf[ical] for word_obj in temp_line]
        lines_by_str = [u' '.join(line_array) for line_array in lines]

        curr_cal_ref = Ref("{} {}".format(mesechta, daf))

        out = []
        word_for_word_se = []
        cal_words = []
        missed_words = []

        global_offset = 0
        if curr_sef_ref == curr_cal_ref:
            start_end_map,abbrev_matches = dibur_hamatchil_matcher.match_text(bas_word_list, lines_by_str, verbose=True, word_threshold=0.27, char_threshold=1.5,with_abbrev_matches=True)
            abbrev_ranges = [[am.rashiRange for am in am_list] for am_list in abbrev_matches ]
            print u' --- '.join([unicode(am) for am_list in abbrev_matches for am in am_list])
            abbrev_count = 0
            for ar in abbrev_ranges:
                abbrev_count += len(ar)
            #if abbrev_count > 0:
            #    print "GRESATLJL THNA DZEOR", abbrev_ranges
            for iline,se in enumerate(start_end_map):

                curr_cal_line = lines[iline]
                # if there is an expanded abbrev, concat those words into one element
                if len(abbrev_ranges[iline]) > 0:
                    offset = 0 # account for the fact that you're losing elements in the array as you merge them
                    abbrev_ranges[iline].sort(key=lambda x: x[0])
                    for ar in abbrev_ranges[iline]:
                        if ar[1] - ar[0] <= 0:
                            continue  # TODO: the abbrev matcher sometimes returns zero-length ranges; skip them for now

                        #redefine ar by how many actual words are in the range, not just how many elements
                        start_ar = ar[0]
                        i_abbrev = start_ar
                        num_words = 0
                        while i_abbrev < len(curr_cal_line):
                            temp_w = curr_cal_line[i_abbrev]
                            num_words += len(re.split(ur'\s+',temp_w))
                            if num_words >= (ar[1]-ar[0]+1):
                                break
                            i_abbrev += 1
                        end_ar = i_abbrev

                        ar = (start_ar,end_ar)
                        if len(curr_cal_line[ar[0]-offset:ar[1]+1-offset]) != len( word_obj_list[ar[0]-offset+len(cal_words):ar[1]+1-offset+len(cal_words)]):
                            #something's wrong. not sure what, but best to ignore this
                            continue
                        print u"ABBREV RANGE {} --- OFFSET {}".format(ar,offset)
                        print u"CURR CAL LINE BEFORE {}".format(u','.join(curr_cal_line[ar[0]-offset:ar[1]+1-offset]))
                        curr_cal_line[ar[0]-offset:ar[1]+1-offset] = [u' '.join(curr_cal_line[ar[0]-offset:ar[1]+1-offset])]
                        print u"CURR CAL LINE AFTER {}".format(curr_cal_line[ar[0]-offset])
                        print u"WORD OBJ LIST BEFORE {}".format(u','.join([u'({})'.format(obj['word']) for obj in merge_cal_word_objs(ar[0]-offset+len(cal_words),ar[1]+1-offset+len(cal_words),word_obj_list)]))
                        word_obj_list[ar[0]-offset+len(cal_words):ar[1]+1-offset+len(cal_words)] = merge_cal_word_objs(ar[0]-offset+len(cal_words),ar[1]+1-offset+len(cal_words),word_obj_list)
                        print u"WORD OBJ LIST AFTER {}".format(word_obj_list[ar[0]-offset+len(cal_words)]['word'])
                        offset += ar[1]-ar[0]
                        global_offset += offset

                cal_words += curr_cal_line
                if se[0] == -1:
                    word_for_word_se += [(-1,-1) for i in range(len(curr_cal_line))]
                    continue
                # matched_cal_objs_indexes = language_tools.match_segments_without_order(lines[iline],bas_word_list[se[0]:se[1]+1],2.0)
                curr_bas_line = bas_word_list[se[0]:se[1]+1]
                #print u'base line',u' '.join(curr_bas_line)
                matched_words_base = dibur_hamatchil_matcher.match_text(curr_bas_line, curr_cal_line, char_threshold=0.3,verbose=False)
                word_for_word_se += [(tse[0]+se[0],tse[1]+se[0]) if tse[0] != -1 else tse for tse in matched_words_base]

            matched_word_for_word = dibur_hamatchil_matcher.match_text(bas_word_list, cal_words, char_threshold=0.3, prev_matched_results=word_for_word_se,boundaryFlexibility=2)

            bad_word_offset = 0
            for ical_word,temp_se in enumerate(matched_word_for_word):
                if temp_se[0] == -1:
                    missed_words.append({"word":word_obj_list[ical_word]["word"],"index":ical_word})
                    continue

                #dictionary juggling...
                for i in xrange(temp_se[0],temp_se[1]+1):
                    # in case cal_words and word_obj_list aren't the same length because a word got split up
                    """
                    if cal_words[ical_word] != word_obj_list[ical_word-bad_word_offset]["word"]:
                        if ical_word+1 < len(cal_words) and cal_words[ical_word+1] != word_obj_list[ical_word-bad_word_offset+1]["word"]:
                            bad_word_offset += 1
                        continue
                    """
                    cal_word_obj = word_obj_list[ical_word].copy()
                    cal_word_obj["cal_word"] = cal_word_obj["word"]
                    temp_sef_word = temp_out[i]["word"]
                    temp_out[i] = cal_word_obj
                    temp_out[i]["class"] = "talmud"
                    temp_out[i]["word"] = temp_sef_word

            cal_len = len(matched_word_for_word)
            print u"\n-----\nFOUND {}/{} ({}%)".format(cal_len - len(missed_words), cal_len, (1 - round(1.0 * len(missed_words) / cal_len, 4)) * 100)
            #print u"MISSED: {}".format(u" ,".join([u"{}:{}".format(wo["word"], wo["index"]) for wo in missed_words]))
            ical += 1
        """
        #tag 1 pos words if still untagged
        for iwo,word_obj in enumerate(temp_out):
            word = word_obj["word"]
            if word in cal_pos_hashtable:
                if len(cal_pos_hashtable[word]) == 1:
                    temp_out[iwo] = {"word":word,"cal_word":word,"class":"talmud","POS":cal_pos_hashtable[word][0]}
        """
        out += temp_out

        sef_daf = curr_sef_ref.__str__().replace("{} ".format(mesechta),"").encode('utf8')
        doc = {"words": out,"missed_words":missed_words}
        fp = codecs.open("cal_matcher_output/{}/lang_naive_talmud/lang_naive_talmud_{}.json".format(mesechta,sef_daf), "w", encoding='utf-8')
        json.dump(doc, fp, indent=4, encoding='utf-8', ensure_ascii=False)
        fp.close()
def find_changes_between_wiki_and_will():
    import math, bisect
    from data_utilities import dibur_hamatchil_matcher
    from tqdm import tqdm

    out = {}
    total = 0
    missed = 0
    for title in tqdm(library.get_indexes_in_category("Bavli")):
        wiki = Version().load({
            "title": title,
            "versionTitle": "Wikisource Talmud Bavli",
            "language": "he"
        })
        will = Version().load({
            "title": title,
            "versionTitle": "William Davidson Edition - Aramaic",
            "language": "he"
        })
        for isec, (wiki_section,
                   will_section) in enumerate(zip(wiki.chapter, will.chapter)):
            for iseg, (wiki_segment, will_segment) in enumerate(
                    zip(wiki_section, will_section)):
                daf = math.ceil((isec + 1) / 2)
                amud = 'b' if (isec + 1) % 2 == 0 else 'a'
                tref = f"{title} {daf}{amud}:{iseg+1}"
                wiki_removed_indices, wiki_original_words, wiki_removed_chars = get_words_removed(
                    wiki_segment, will_segment)
                wiki_tokenized_words = base_tokenizer(wiki_segment,
                                                      will_segment)
                will_removed_indices, will_original_words, will_removed_chars = get_words_removed(
                    will_segment, will_segment)
                will_tokenized_words = base_tokenizer(will_segment,
                                                      will_segment)

                matched = dibur_hamatchil_matcher.match_text(
                    wiki_tokenized_words, [" ".join(will_tokenized_words)],
                    verbose=False,
                    strict_boundaries=True,
                    place_all=True,
                    with_abbrev_matches=True)
                total += 1
                if matched['matches'][0][0] == -1:
                    # no match
                    missed += 1
                    continue
                for abbrev_list in matched['abbrevs']:
                    for abbrev in abbrev_list:
                        # print('orig', abbrev.gemaraRange, abbrev.rashiRange)
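                        # bisect_right(removed_indices, x) counts the removed words at or
                        # before x, shifting the matched indices from the tokenized text
                        # back onto the original word lists used below.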
                        wikiWordRange = [
                            x + bisect.bisect_right(wiki_removed_indices, x)
                            for x in abbrev.gemaraRange
                        ]
                        willWordRange = [
                            x + bisect.bisect_right(will_removed_indices, x)
                            for x in abbrev.rashiRange
                        ]
                        wiki_start_char = len(" ".join(
                            wiki_original_words[:wikiWordRange[0]]))
                        if wiki_start_char > 0:
                            # account for space after initial words
                            wiki_start_char += 1
                        wiki_end_char = len(" ".join(
                            wiki_original_words[:wikiWordRange[1] + 1]))
                        wikiCharRange = [wiki_start_char, wiki_end_char]
                        will_start_char = len(" ".join(
                            will_original_words[:willWordRange[0]]))
                        if will_start_char > 0:
                            will_start_char += 1
                        will_end_char = len(" ".join(
                            will_original_words[:willWordRange[1] + 1]))
                        willCharRange = [will_start_char, will_end_char]
                        wiki_removed_chars += [
                            (tuple(wikiCharRange),
                             will_segment[willCharRange[0]:willCharRange[1]])
                        ]
                        # print(f"~{wiki_segment[wikiCharRange[0]:wikiCharRange[1]]}~")
                        # print(f"~{will_segment[willCharRange[0]:willCharRange[1]]}~")
                        # print('after', wikiWordRange, willWordRange)
                        # print(wiki_original_words[wikiWordRange[0]:wikiWordRange[1]+1])
                        # print(will_original_words[willWordRange[0]:willWordRange[1]+1])
                out[tref] = {
                    'wiki': wiki_removed_chars,
                    'will': will_removed_chars
                }
    print('Total', total)
    print('Missed', missed)
    print('Perc', missed / total)
    with open(f"{DATASET_LOC}/wiki_will_changes.json", 'w') as fout:
        json.dump(out, fout)
Example #10
def match_cal_segments(mesechta):
    def tokenize_words(s):
        s = s.replace(u"־", " ")
        s = re.sub(r"</?[a-z]+>", "", s)  # get rid of html tags
        s = re.sub(r"\([^\(\)]+\)", "", s)  # get rid of refs
        s = s.replace("'", '"')
        word_list = filter(bool, re.split(r"[\s\:\-\,\.\;\(\)\[\]\{\}]", s))
        return word_list

    def merge_cal_word_objs(s, e, word_obj_list):
        obj_list = word_obj_list[s:e]
        m_word = u" ".join([o["word"] for o in obj_list])
        m_head_word = u" ".join([o["head_word"] for o in obj_list])
        m_pos_list = [o["POS"] for o in obj_list]
        m_pos = max(set(m_pos_list), key=m_pos_list.count)

        new_obj = obj_list[0].copy()
        new_obj["word"] = m_word
        new_obj["head_word"] = m_head_word
        new_obj["POS"] = m_pos
        return [
            new_obj
        ]  #returns a single element array which will replace a range s:e in the original array

    cal_lines = json.load(open("cal_lines_{}.json".format(mesechta), "r"),
                          encoding="utf8")
    dafs = cal_lines["dafs"]
    lines_by_daf = cal_lines["lines"]

    super_base_ref = Ref(mesechta)
    subrefs = super_base_ref.all_subrefs()
    ical = 0

    for curr_sef_ref in subrefs:
        if curr_sef_ref.is_empty(): continue
        if ical >= len(dafs): break

        daf = dafs[ical]
        print "----- DAF {}  ({}/{})-----".format(daf, ical, len(dafs))

        base_tc = TextChunk(curr_sef_ref, "he")
        bas_word_list = []  # re.split(r"\s+"," ".join(base_text.text))
        for segment in base_tc.text:
            bas_word_list += tokenize_words(segment)

        temp_out = [{"word": w, "class": "unknown"} for w in bas_word_list]

        lines = [[word_obj["word"] for word_obj in temp_line]
                 for temp_line in lines_by_daf[ical]]
        word_obj_list = [
            word_obj for temp_line in lines_by_daf[ical]
            for word_obj in temp_line
        ]
        lines_by_str = [u' '.join(line_array) for line_array in lines]

        curr_cal_ref = Ref("{} {}".format(mesechta, daf))

        out = []
        word_for_word_se = []
        cal_words = []
        missed_words = []

        global_offset = 0
        if curr_sef_ref == curr_cal_ref:
            start_end_map, abbrev_ranges = dibur_hamatchil_matcher.match_text(
                bas_word_list,
                lines_by_str,
                verbose=True,
                word_threshold=0.5,
                with_abbrev_ranges=True)
            for iline, se in enumerate(start_end_map):

                curr_cal_line = lines[iline]

                # if there is an expanded abbrev, concat those words into one element
                if len(abbrev_ranges[iline]) > 0:
                    offset = 0  # account for the fact that you're losing elements in the array as you merge them
                    for ar in abbrev_ranges[iline]:
                        if ar[1] - ar[0] <= 0:
                            continue  # TODO: the abbrev matcher sometimes returns zero-length ranges; skip them for now
                        curr_cal_line[ar[0] - offset:ar[1] + 1 - offset] = [
                            u' '.join(curr_cal_line[ar[0] - offset:ar[1] + 1 -
                                                    offset])
                        ]
                        word_obj_list[ar[0] - offset + len(cal_words):ar[1] +
                                      1 - offset +
                                      len(cal_words)] = merge_cal_word_objs(
                                          ar[0] - offset + len(cal_words),
                                          ar[1] + 1 - offset + len(cal_words),
                                          word_obj_list)
                        offset += ar[1] - ar[0]
                        global_offset += offset
                        print offset

                cal_words += curr_cal_line
                if se[0] == -1:
                    word_for_word_se += [(-1, -1)
                                         for i in range(len(curr_cal_line))]
                    continue
                # matched_cal_objs_indexes = language_tools.match_segments_without_order(lines[iline],bas_word_list[se[0]:se[1]+1],2.0)
                curr_bas_line = bas_word_list[se[0]:se[1] + 1]

                matched_words_base = dibur_hamatchil_matcher.match_text(
                    curr_bas_line, curr_cal_line, char_threshold=0.4)
                word_for_word_se += [(tse[0] + se[0],
                                      tse[1] + se[0]) if tse[0] != -1 else tse
                                     for tse in matched_words_base]

            matched_word_for_word = dibur_hamatchil_matcher.match_text(
                bas_word_list,
                cal_words,
                char_threshold=0.4,
                prev_matched_results=word_for_word_se)

            for ical_word, temp_se in enumerate(matched_word_for_word):
                if temp_se[0] == -1:
                    missed_words.append({
                        "word":
                        word_obj_list[ical_word]["word"],
                        "index":
                        ical_word
                    })
                    continue

                #dictionary juggling...
                for i in xrange(temp_se[0], temp_se[1] + 1):
                    cal_word_obj = word_obj_list[ical_word].copy()
                    cal_word_obj["cal_word"] = cal_word_obj["word"]
                    temp_sef_word = temp_out[i]["word"]
                    temp_out[i] = cal_word_obj
                    temp_out[i]["class"] = "talmud"
                    temp_out[i]["word"] = temp_sef_word

            cal_len = len(matched_word_for_word)
            print u"\n-----\nFOUND {}/{} ({}%)".format(
                cal_len - len(missed_words), cal_len,
                (1 - round(1.0 * len(missed_words) / cal_len, 4)) * 100)
            print u"MISSED: {}".format(u" ,".join([
                u"{}:{}".format(wo["word"], wo["index"]) for wo in missed_words
            ]))
            ical += 1
        out += temp_out

        sef_daf = curr_sef_ref.__str__().replace("{} ".format(mesechta),
                                                 "").encode('utf8')
        doc = {"words": out, "missed_words": missed_words}
        fp = codecs.open(
            "cal_matcher_output/{}/lang_naive_talmud/lang_naive_talmud_{}.json"
            .format(mesechta, sef_daf),
            "w",
            encoding='utf-8')
        json.dump(doc, fp, indent=4, encoding='utf-8', ensure_ascii=False)
        fp.close()
def test_duplicate_comment(self):
    daftext = 'אע״ג שאמרו ככה בלה בלה בלה'.split()
    rashi = ['בלה', 'בלה']
    matched = dhm.match_text(daftext, rashi, verbose=True)
Example #12
        base_chapter = base_text[chapter_index - 1]
        print 'fixing chapter {}'.format(chapter_index)

        book_text = [
            bleach.clean(segment, tags=[], strip=True)
            for segment in base_chapter
        ]
        seg_indices = first_word_indices(book_text)
        word_list = u' '.join(book_text).split()
        dh_list = [
            re.sub(ur' (\.|:)', ur'\1', p.dh.get_valueOf_())
            for p in comment_chapter.get_phrase()
        ]
        matches = match_text(word_list,
                             dh_list,
                             dh_extract_method=cleaner,
                             place_all=False,
                             strict_boundaries=True,
                             char_threshold=0.4)
        locations.append([
            bisect.bisect_right(seg_indices, match[0])
            if match[0] >= 0 else match[0] for match in matches['matches']
        ])

    commentary.set_verses(locations)
    commentary.correct_phrase_verses()

    if overwrite:
        outfile = filename
    else:
        outfile = '{}_test'.format(filename)
    # with codecs.open('XML/{}.xml'.format(outfile), 'w', 'utf-8') as out:
Example #13
    return indices


root = DCXMLsubs.parse("XML/tractate-avot_drabi_natan-xml.xml", silence=True)
base_text = root.getBaseTextArray()[0]
base_text = [bleach.clean(segment, tags=[], strip=True) for segment in base_text]
seg_indices = first_word_indices(base_text)
word_list = u' '.join(base_text).split()
c = root.body.commentaries.commentary[6].chapter[0]
dh_list = [p.dh.get_valueOf_() for p in c.get_phrase()]


def cleaner(input_string):
    assert isinstance(input_string, basestring)
    pattern = u'\u05d5?(\u05db|\u05d2)\u05d5\u05f3?'
    match = re.search(pattern, input_string)
    if match is None:
        return input_string
    if match.start() > 6 and (match.start() > len(input_string) / 2):
        return re.sub(u'\u05d5?(\u05db|\u05d2)\u05d5\u05f3?.*', u'', input_string)
    elif match.start() > 6 and (match.start() < len(input_string) / 2):
        return re.sub(u'.*?{}'.format(pattern), u'', input_string)
    else:
        return re.sub(pattern, u'', input_string)

matches = match_text(word_list, dh_list, dh_extract_method=cleaner, place_all=False)
locations = [bisect.bisect_right(seg_indices, match[0]) if match[0] >= 0 else match[0] for match in matches['matches']]
c.set_verses(locations)
with codecs.open('text.xml', 'w', 'utf-8') as outfile:
    c.export(outfile, level=1)
def convert_mentions_for_alt_version(nikkud_vtitle, mentions_output, manual_changes_file=None, limit=None):
    import json
    from research.knowledge_graph.named_entity_recognition.ner_tagger import Mention
    from data_utilities.dibur_hamatchil_matcher import match_text
    from sefaria.helper.normalization import FunctionNormalizer
    if manual_changes_file is not None:
        changes = srsly.read_json(manual_changes_file)
    with open("sperling_mentions.json", "r") as fin:
        j = json.load(fin)
    # add mentions in db b/c sperling_mentions only includes bonayich-only mentions
    for tl in RefTopicLinkSet({"class": "refTopic", "linkType": "mention", "charLevelData.versionTitle": "William Davidson Edition - Aramaic"}):
        j += [{
            "start": tl.charLevelData['startChar'],
            "end": tl.charLevelData['endChar'],
            "ref": tl.ref,
            "mention": tl.charLevelData['text'],
            "id_matches": [tl.toTopic]
        }]
    mentions_by_seg = defaultdict(list)
    print("TOTAL MENTIONS", len(j))
    for mention in j:
        mentions_by_seg[mention['ref']] += [Mention().add_metadata(**mention)]
    indexes = library.get_indexes_in_category("Bavli") if limit is None else limit

    def get_norm_pos(start, end, s):
        num_to_remove = s.count(':', 0, start)
        return start - num_to_remove, end - num_to_remove
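    # Characters stripped when aligning the nikkud version with the plain
    # Davidson text: Hebrew points/cantillation (the U+0591–U+05C7 ranges),
    # punctuation, gershayim, and spaced dashes/ellipses; the *_parens variant
    # also drops parentheses.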
    replace_reg_parens = r"(?:[\u0591-\u05bd\u05bf-\u05c5\u05c7,.:!?״()]+|\s—|\s…)"
    replace_reg = r"(?:[\u0591-\u05bd\u05bf-\u05c5\u05c7,.:!?״]+|\s—|\s…)"
    def get_find_text_to_remove(remove_parens=True):
        return lambda s: [(m, '') for m in re.finditer(replace_reg_parens if remove_parens else replace_reg, s)]

    new_mentions = []
    num_failed = 0
    for mas in tqdm(indexes):
        if Version().load({"title": mas, "versionTitle": nikkud_vtitle, "language": "he"}) is None:
            continue
        index = library.get_index(mas)
        for seg in index.all_segment_refs():
            temp_mentions = mentions_by_seg[seg.normal()]
            if len(temp_mentions) == 0:
                continue
            text = TextChunk(seg, lang='he', vtitle='William Davidson Edition - Aramaic').text
            norm_text = re.sub(':', '', text)
            text_nikkud = TextChunk(seg, lang='he', vtitle=nikkud_vtitle).text
            remove_parens = True
            if re.sub(replace_reg_parens, '', text_nikkud) != text:
                remove_parens = False
            normalizer = FunctionNormalizer(get_find_text_to_remove(remove_parens))
            norm_text_nikkud = re.sub(replace_reg_parens if remove_parens else replace_reg, '', text_nikkud)

            if len(text_nikkud) == 0:
                continue
            mention_indices = [get_norm_pos(mention.start, mention.end, text) for mention in temp_mentions]
            if manual_changes_file is None:
                norm_map = normalizer.get_mapping_after_normalization(text_nikkud)
            else:
                temp_wiki_changes = changes.get(seg.normal(), {}).get('wiki', [])

                temp_will_changes = changes.get(seg.normal(), {}).get('will', [])
                temp_wiki_changes = list(filter(lambda x: x not in temp_will_changes, temp_wiki_changes))
                temp_wiki_changes.sort(key=lambda x: x[0][0])
                for tc in temp_wiki_changes:
                    tc[0][0] += 1
                    tc[0][1] += 1
                norm_map = normalizer.get_mapping_after_normalization(text_nikkud, removal_list=temp_wiki_changes)
  
            mention_indices = normalizer.convert_normalized_indices_to_unnormalized_indices(mention_indices, norm_map)
            temp_new_mentions = []
            for mention, (unnorm_start, unnorm_end) in zip(temp_mentions, mention_indices):
                if manual_changes_file is None:
                    new_mention = re.sub(replace_reg_parens if remove_parens else replace_reg, '', text_nikkud[unnorm_start:unnorm_end])
                else:
                    new_mention = text_nikkud[unnorm_start:unnorm_end]
                try:
                    if len(new_mention) == 0:
                        print("ZERO LENGTH MENTION", mention.mention, seg.normal())
                    assert len(new_mention) > 0
                    if manual_changes_file is None:
                        assert new_mention == mention.mention, f"'{new_mention} != {mention.mention}' {unnorm_start} {unnorm_end}"
                    else:
                        for offset in [0, -1, 1, -2, 2]:
                            new_mention = text_nikkud[unnorm_start+offset:unnorm_end+offset]
                            # likely to be abbreviations in new_mention. use dh matcher to see if they're 'equivalent'
                            old_mention_comparison = mention.mention
                            if new_mention.startswith('א"ר'):
                                old_mention_comparison = "אמר " + old_mention_comparison
                            if new_mention.startswith('"'):
                                # middle of abbrev
                                new_mention_comparison = new_mention[1:2] + "'" + new_mention[2:]
                            else:
                                new_mention_comparison = new_mention
                            new_words = new_mention_comparison.split()
                            matched = match_text(new_words, [old_mention_comparison], with_abbrev_matches=True, daf_skips=0, rashi_skips=0, overall=0)
                            if matched['matches'][0][0] != -1:
                                # need look at actual match and figure out if any words are missing
                                # recalculate unnorm_start and unnorm_end to leave out these words. Test case: Arakhin 5a:18
                                istart_word, iend_word = matched['matches'][0]
                                start_text = " ".join(new_words[:istart_word])
                                start_offset = len(start_text) + (1 if len(start_text) > 0 else 0)  # add 1 to account for space right after start_text
                                end_text = " ".join(new_words[iend_word+1:])
                                end_offset = len(end_text) + (1 if len(end_text) > 0 else 0)
                                unnorm_start += offset + start_offset
                                unnorm_end += offset - end_offset
                                break
                        # move unnorm_start and end to nearest word break
                        if unnorm_end == len(text_nikkud) + 1:
                            # one too big
                            unnorm_end -= 1
                        if unnorm_end > len(text_nikkud):
                            # too big give up
                            # print("UPDATE END TOO BIG. GIVE UP...", mention.mention, seg.normal())
                            assert False
                        if text_nikkud[unnorm_start] in {' ', ':'}:
                            # move forward by one
                            unnorm_start += 1
                        if text_nikkud[unnorm_end-1] in {' ', ':'}:
                            unnorm_end -= 1
                        start_nearest_break = max(text_nikkud.rfind(' ', 0, unnorm_start), text_nikkud.rfind(':', 0, unnorm_start))
                        end_nearest_break_match = re.search(r'[\s:]', text_nikkud[unnorm_end:])
                        end_nearest_break = (end_nearest_break_match.start() + unnorm_end) if end_nearest_break_match is not None else -1
                        if start_nearest_break != -1:
                            unnorm_start = start_nearest_break + 1
                        elif unnorm_start != 0:
                            # if couldn't find space before, must be at beginning
                            # print("UPDATE START", mention.mention, seg.normal())
                            unnorm_start = 0
                        if end_nearest_break != -1:
                            unnorm_end = end_nearest_break
                        elif unnorm_end != len(text_nikkud):
                            # print("UPDATE END", mention.mention, seg.normal())
                            unnorm_end = len(text_nikkud)
                        assert matched['matches'][0][0] != -1
                    mention.add_metadata(start=unnorm_start, end=unnorm_end, mention=text_nikkud[unnorm_start:unnorm_end])
                    temp_new_mentions += [mention]
                except AssertionError:
                    norm_start, norm_end = get_norm_pos(mention.start, mention.end, text)
                    snip_size = 10
                    start_snip_naive = norm_start - snip_size if norm_start >= snip_size else 0
                    start_snip = norm_text.rfind(" ", 0, start_snip_naive)
                    if start_snip == -1:
                        start_snip = start_snip_naive
                    end_snip_naive = norm_end + snip_size if norm_end + snip_size <= len(norm_text) else len(norm_text)
                    end_snip = norm_text.find(" ", end_snip_naive)
                    if end_snip == -1:
                        end_snip = end_snip_naive
                    snippet = f"{norm_text[start_snip:norm_start]}~{norm_text[norm_start:norm_end]}~{norm_text[norm_end:end_snip]}"

                    new_norm_start, new_norm_end = get_rabbi_char_loc_list([snippet], norm_text_nikkud)[0]
                    if new_norm_start is None:
                        # print("new_norm_start is None")
                        num_failed += 1
                        continue
                    new_start, new_end = normalizer.convert_normalized_indices_to_unnormalized_indices([(new_norm_start, new_norm_end)], norm_map)[0]
                    new_mention = re.sub(replace_reg_parens if remove_parens else replace_reg, '', text_nikkud[new_start:new_end])
                    try:
                        assert new_mention == mention.mention, f"'{new_mention} != {mention.mention}' {unnorm_start} {unnorm_end}"
                        mention.add_metadata(start=new_start, end=new_end, mention=text_nikkud[new_start:new_end])
                        temp_new_mentions += [mention]
                    except AssertionError:
                        num_failed += 1
                    # get_rabbi_char_pos using context and text_nikkud
                    # get_unnormalized pos
            new_mentions += temp_new_mentions
    out = [m.serialize(delete_keys=['versionTitle', 'language']) for m in new_mentions]
    with open(f"{mentions_output}", "w") as fout:
        json.dump(out, fout, ensure_ascii=False, indent=2)
    print("NUM FAILED", num_failed)