Example #1
def match():
    link_list = []
    num_matched = 0
    num_searched = 0

    rashba = Ref("Rashi on Berakhot")
    gemara = library.get_index("Berakhot")

    rashba_ref_list = [ref for ref in rashba.all_subrefs() if ref.text('he').text != []]
    gemara_ref_list = gemara.all_section_refs()

    gemara_ind = 0
    for rashba_ref in rashba_ref_list:
        while gemara_ind < len(gemara_ref_list) and gemara_ref_list[gemara_ind].normal_last_section() != rashba_ref.normal_last_section():
            gemara_ind += 1
        gemara_ref = gemara_ref_list[gemara_ind]

        rashba_tc = TextChunk(rashba_ref,"he")
        gemara_tc = TextChunk(gemara_ref,"he")

        ref_map_with_abbrevs = dibur_hamatchil_matcher.match_ref(rashba_tc, gemara_tc,base_tokenizer=tokenize_words,
                                                                 dh_extract_method=dh_extraction_method, verbose=True,
                                                                 with_abbrev_matches=True)
        ref_map = [(tup[0], tup[1]) for tup in ref_map_with_abbrevs]

        temp_link_list = [l for l in ref_map if not l[0] is None and not l[1] is None]
        link_list += temp_link_list
        unlink_list = [ul[1] for ul in ref_map if ul[0] is None or ul[1] is None]
        for r in ref_map:
            if not r[0] is None: num_matched += 1

        num_searched += len(ref_map)

        print "MATCHES - {}".format(ref_map)
        print "ACCURACY - {}%".format(round(1.0 * num_matched / num_searched, 5) * 100)
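Example #1 calls match_ref with a tokenize_words base tokenizer and a dh_extraction_method that are defined elsewhere in its script. A minimal sketch of what such helpers might look like, modeled on the base_tokenizer and dh_extraction_method defined in Examples #18 and #21 below; the exact regexes used by the original script are an assumption:

# Hypothetical helpers assumed by Example #1 (not taken from its original script);
# modeled on the tokenizer and DH extractor defined in Examples #18 and #21.
import re

def tokenize_words(s):
    s = re.sub(r"\([^\(\)]+\)", "", s)            # drop parenthesized glosses
    return [w for w in re.split(r"\s+", s) if w]  # split on whitespace, drop empties

def dh_extraction_method(s):
    # take the text before the first dash as the dibur hamatchil; "" if none found
    m = re.match(r"([^\.]+\.\s)?([^–]+)\s–", s)
    return m.group(2) if m else ""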
Example #2
def find_subref(sheet_text, ref, lang, vtitle=None, tried_adding_refs_at_end_of_section=False, **kwargs):
    try:
        tc = TextChunk(ref, lang, vtitle=vtitle)
        matches = match_ref(tc, [sheet_text], tokenizer, dh_extract_method=clean, with_num_abbrevs=False, lang=lang, rashi_skips=2, dh_split=lambda dh: re.split(ur"\s*\.\.\.\s*", dh), **kwargs)
    except IndexError:
        # thrown if base text is empty
        matches = {"matches": []}
    except ValueError:
        matches = {"matches": []}
    found_ref = None
    for r in matches["matches"]:
        if r is not None:
            found_ref = r
            break
    if found_ref is None:
        if ref.primary_category == "Tanakh" and lang == "en" and vtitle is None:
            return find_subref(sheet_text, ref, lang, "The Holy Scriptures: A New Translation (JPS 1917)")
        elif ref.primary_category == "Talmud" and vtitle is None:
            if lang == "he":
                return find_subref(sheet_text, ref, lang, "Wikisource Talmud Bavli")
            else:
                return find_subref(sheet_text, ref, lang, "Sefaria Community Translation")
        elif ref.primary_category == "Talmud" and ref.is_section_level() and not tried_adding_refs_at_end_of_section:
            # you tried wiki and it didn't work
            # you're running out of options, what do you do?
            # add first and last seg from prev and next daf!!!
            prev_daf = ref.prev_section_ref()
            next_daf = ref.next_section_ref()
            start_ref = prev_daf.all_segment_refs()[-1] if prev_daf is not None else ref
            end_ref = next_daf.all_segment_refs()[0] if next_daf is not None else ref
            if end_ref.is_range():
                end_ref = end_ref.ending_ref()
            new_ref = start_ref.to(end_ref)
            return find_subref(sheet_text, new_ref, lang, tried_adding_refs_at_end_of_section=True)
    return found_ref
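A hypothetical invocation of find_subref, assuming a configured Sefaria environment with the tokenizer and clean helpers it references; the sheet text is a placeholder:

# Hypothetical usage sketch; the quoted sheet text and the Ref are placeholders.
found = find_subref(u"some text quoted on a source sheet", Ref("Berakhot 2a"), "he")
# `found` is a Ref to the matched segment (possibly located via one of the fallback
# versions or the extended Talmud range above), or None if nothing matched.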
Example #3
def find_subref(sheet_text, ref, lang, vtitle=None, tried_adding_refs_at_end_of_section=False, **kwargs):
    try:
        tc = TextChunk(ref, lang, vtitle=vtitle)
        matches = match_ref(tc, [sheet_text], tokenizer, dh_extract_method=clean, with_num_abbrevs=False, lang=lang, rashi_skips=2, dh_split=lambda dh: re.split(r"\s*\.\.\.\s*", dh), **kwargs)
    except IndexError:
        # thrown if base text is empty
        matches = {"matches": []}
    except ValueError:
        matches = {"matches": []}
    found_ref = None
    for r in matches["matches"]:
        if r is not None:
            found_ref = r
            break
    if found_ref is None:
        if ref.primary_category == "Tanakh" and lang == "en" and vtitle is None:
            return find_subref(sheet_text, ref, lang, "The Holy Scriptures: A New Translation (JPS 1917)")
        elif ref.primary_category == "Talmud" and vtitle is None:
            if lang == "he":
                return find_subref(sheet_text, ref, lang, "Wikisource Talmud Bavli")
            else:
                return find_subref(sheet_text, ref, lang, "Sefaria Community Translation")
        elif ref.primary_category == "Talmud" and ref.is_section_level() and not tried_adding_refs_at_end_of_section:
            # you tried wiki and it didn't work
            # you're running out of options, what do you do?
            # add first and last seg from prev and next daf!!!
            prev_daf = ref.prev_section_ref()
            next_daf = ref.next_section_ref()
            start_ref = prev_daf.all_segment_refs()[-1] if prev_daf is not None else ref
            end_ref = next_daf.all_segment_refs()[0] if next_daf is not None else ref
            if end_ref.is_range():
                end_ref = end_ref.ending_ref()
            new_ref = start_ref.to(end_ref)
            return find_subref(sheet_text, new_ref, lang, tried_adding_refs_at_end_of_section=True)
    return found_ref
Example #4
def get_rabbi_mention_segments(rows_by_mas, limit=None):
    total = 0
    missed = 0
    new_rows = []

    indexes = library.get_indexes_in_category("Bavli") if limit is None else limit
    for mas in tqdm(indexes):
        for i, amud in enumerate(rows_by_mas[mas]):
            curr_amud = amud[0][' Amud'].lower()
            tc = Ref(f"{mas} {curr_amud}").text("he", vtitle="William Davidson Edition - Aramaic")
            matches = match_ref(tc, [r[' Snippet'] for r in amud], base_tokenizer, dh_extract_method=dh_extract_method, with_num_abbrevs=False, place_all=True, place_consecutively=True, verbose=False)
            total+=len(matches['matches'])
            rabbi_match_segs = []
            for j, m in enumerate(matches['matches']):
                snippet = amud[j][' Snippet']
                rabbi_match_segs += [get_rabbi_seg(m, snippet)]
                if m is None:
                    missed += 1

            for j, r in enumerate(amud):
                seg, context = rabbi_match_segs[j]
                new_rows += [{
                    "Segment": None if seg is None else seg.normal(),
                    "Context": context,
                    "Book": mas,
                    "Bonayich ID": r[" Rabbi ID after Link"]
                }]
    print(missed, total)
    return new_rows
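get_rabbi_mention_segments reads its input through the keys ' Amud', ' Snippet' and ' Rabbi ID after Link', which implies roughly the shape sketched below; the concrete values are placeholders, and get_rabbi_seg, base_tokenizer and dh_extract_method are assumed to be defined elsewhere in the script:

# Hypothetical input shape for get_rabbi_mention_segments, inferred from the keys
# accessed above; every value is a placeholder.
rows_by_mas = {
    "Berakhot": [            # one list per amud, in order
        [                    # rows for the first amud
            {
                " Amud": "2a",
                " Snippet": "<snippet of the amud mentioning the rabbi>",
                " Rabbi ID after Link": 12345,
            },
        ],
    ],
}
new_rows = get_rabbi_mention_segments(rows_by_mas, limit=["Berakhot"])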
Example #5
def match():
    mb = library.get_index("Mishnah Berurah")
    oc = library.get_index("Shulchan Arukh, Orach Chayim")

    mbRefList = mb.all_section_refs()
    ocRefList = oc.all_section_refs()
    mbInd = 0

    num_matched = 0
    num_searched = 0

    link_list = []
    log = open("mishnah_berurah.log","w")
    rt_log = open("rashei_tevot.csv","w")
    rt_log_csv = unicodecsv.DictWriter(rt_log, fieldnames=["abbrev","expanded","context_before","context_after"])
    rt_log_csv.writeheader()
    for ocRef in ocRefList:
        ocSiman = getSimanNum(ocRef)
        while getSimanNum(mbRefList[mbInd]) != ocSiman:
            mbInd += 1
        mbRef = mbRefList[mbInd]
        mbSiman = getSimanNum(mbRef)
        print "----- SIMAN {} -----".format(ocSiman)
        log.write("----- SIMAN {} -----\n".format(ocSiman))
        octc = TextChunk(ocRef,"he")
        mbtc = TextChunk(mbRef,"he")
        try:
            matched = dibur_hamatchil_matcher.match_ref(octc,mbtc,base_tokenizer=base_tokenizer,dh_extract_method=dh_extraction_method,verbose=True,with_abbrev_matches=True,rashi_filter=rashi_filter)
        except ValueError:
            continue
        if 'comment_refs' not in matched:
            continue

        ref_map = [(base,comment) for base,comment in zip(matched['matches'],matched['comment_refs'])]
        abbrevs = [am for seg in matched['abbrevs'] for am in seg]
        for am in abbrevs:
            rt_log_csv.writerow({'abbrev':dibur_hamatchil_matcher.cleanAbbrev(am.abbrev), 'expanded':u' '.join(am.expanded), 'context_before':u' '.join(am.contextBefore), 'context_after':u' '.join(am.contextAfter)})

        temp_link_list = [l for l in ref_map if not l[0] is None and not l[1] is None]
        link_list += temp_link_list
        unlink_list = [ul[1] for ul in ref_map if ul[0] is None or ul[1] is None]
        for r in ref_map:
            if not r[0] is None: num_matched+=1

        num_searched += len(ref_map)

        print "MATCHES - {}".format(ref_map)
        print "ACCURACY - {}%".format(round(1.0*num_matched/num_searched,5)*100)
        log.write("MATCHES - {}\n".format(temp_link_list))
        log.write("NOT FOUND - {}\n".format(unlink_list))
        log.write("ACCURACY - {}%\n".format(round(1.0*num_matched/num_searched,5)*100))


    doc = {"link_list":[[link[0].normal(),link[1].normal()] for link in link_list]}
    fp = codecs.open("mishnah_berurah_links.json", "w",encoding='utf-8')
    json.dump(doc, fp, indent=4, encoding='utf-8', ensure_ascii=False)
    fp.close()
    log.close()
    rt_log.close()
Example #6
def match():
    mb = library.get_index("Mishnah Berurah")
    oc = library.get_index("Shulchan Arukh, Orach Chayim")

    mbRefList = mb.all_section_refs()
    ocRefList = oc.all_section_refs()
    mbInd = 0

    num_matched = 0
    num_searched = 0

    link_list = []
    log = open("mishnah_berurah.log", "w")
    for ocRef in ocRefList:
        ocSiman = getSimanNum(ocRef)
        while getSimanNum(mbRefList[mbInd]) != ocSiman:
            mbInd += 1
        mbRef = mbRefList[mbInd]
        mbSiman = getSimanNum(mbRef)
        print "----- SIMAN {} -----".format(ocSiman)
        log.write("----- SIMAN {} -----\n".format(ocSiman))
        octc = TextChunk(ocRef, "he")
        mbtc = TextChunk(mbRef, "he")

        ref_map = dibur_hamatchil_matcher.match_ref(
            octc,
            mbtc,
            base_tokenizer=base_tokenizer,
            dh_extract_method=dh_extraction_method,
            verbose=True)
        temp_link_list = [
            l for l in ref_map if not l[0] is None and not l[1] is None
        ]
        link_list += temp_link_list
        unlink_list = [
            ul[1] for ul in ref_map if ul[0] is None or ul[1] is None
        ]
        for r in ref_map:
            if not r[0] is None: num_matched += 1

        num_searched += len(ref_map)

        print "MATCHES - {}".format(ref_map)
        print "ACCURACY - {}%".format(
            round(1.0 * num_matched / num_searched, 5) * 100)
        log.write("MATCHES - {}\n".format(temp_link_list))
        log.write("NOT FOUND - {}\n".format(unlink_list))
        log.write("ACCURACY - {}%\n".format(
            round(1.0 * num_matched / num_searched, 5) * 100))

    doc = {
        "link_list":
        [[link[0].normal(), link[1].normal()] for link in link_list]
    }
    fp = codecs.open("mishnah_berurah_links.json", "w", encoding='utf-8')
    json.dump(doc, fp, indent=4, encoding='utf-8', ensure_ascii=False)
    fp.close()
    log.close()
Example #7
 def test_dh_matcher(self):
     from data_utilities.dibur_hamatchil_matcher import match_ref
     base_tokenizer = lambda x: x.split()
     base_text = Ref(u'גיטין ז').text('he')
     n = 100  # len(''.join(base_text.text).split())
     comments = [u'צריך @33 למימרינהו בניחותא.', u'מי שיש לו צעקת לגימא על חבירו ודומם שוכן בסנה עושה לו דין. ', u'@11הדבר @33 יצא מפי רבי אלעזר ונתנוהו לגניבה בקולר. ', u'א“ל @33 האלהים מדרבנן אלא חסדא שמך וחסדאין מילך. ', u'מאי @33 זאת לא זאת. ', u'''דרש @33 רב עוירא וכו' מ“ד כה אמר ה' אם שלמים וכן רבים וכו' אם רואה אדם שמזונותיו מצומצמין יעשה מהן צדקה וכ“ש כשהן מרובין וכו‘.''']
     results = match_ref(base_text, comments, base_tokenizer, prev_matched_results=None, dh_extract_method=lambda x: x,verbose=False, word_threshold=0.27,char_threshold=0.2,
           with_abbrev_matches=False,with_num_abbrevs=True,boundaryFlexibility=n,dh_split=None, rashi_filter=None, strict_boundaries=None, place_all=False,
           create_ranges=False, place_consecutively=False, daf_skips=2, rashi_skips=1, overall=2, lang="he")
     print results['matches']
Example #8
def match():
    link_list = []
    num_matched = 0
    num_searched = 0

    rashba = library.get_index("Rashba on Berakhot")
    gemara = library.get_index("Berakhot")

    rashba_ref_list =  rashba.all_section_refs()
    gemara_ref_list = gemara.all_section_refs()

    gemara_ind = 0
    for rashba_ref in rashba_ref_list:
        while gemara_ref_list[gemara_ind].normal_last_section() != rashba_ref.normal_last_section():
            gemara_ind += 1
        gemara_ref = gemara_ref_list[gemara_ind]

        rashba_tc = TextChunk(rashba_ref,"he")

        # let's extend the range of gemara_tc to account for weird rashba stuff
        num_refs_to_expand = 2

        gemara_ref_before = gemara_ref.prev_section_ref()
        gemara_ref_after = gemara_ref.next_section_ref()
        if gemara_ref_before and len(gemara_ref_before.all_subrefs()) >= num_refs_to_expand:
            gemara_ref = gemara_ref_before.all_subrefs()[-num_refs_to_expand].to(gemara_ref)
        if gemara_ref_after and len(gemara_ref_after.all_subrefs()) >= num_refs_to_expand:
            gemara_ref = gemara_ref.to(gemara_ref_after.all_subrefs()[num_refs_to_expand - 1])

        gemara_tc = TextChunk(gemara_ref,"he")





        ref_map_with_abbrevs = dibur_hamatchil_matcher.match_ref(gemara_tc, rashba_tc, base_tokenizer=tokenize_words,
                                                                 dh_extract_method=dh_extraction_method, verbose=True,
                                                                 with_abbrev_matches=True,dh_split=dh_split,
                                                                 boundaryFlexibility=10000,
                                                                 rashi_filter=rashi_filter)
        ref_map = [(tup[0], tup[1]) for tup in ref_map_with_abbrevs]

        temp_link_list = [l for l in ref_map if not l[0] is None and not l[1] is None]
        link_list += temp_link_list
        unlink_list = [ul[1] for ul in ref_map if ul[0] is None or ul[1] is None]
        for r in ref_map:
            if not r[0] is None: num_matched += 1

        num_searched += len(ref_map)

        print "MATCHES - {}".format(ref_map)
        print "ACCURACY - {}%".format(round(1.0 * num_matched / num_searched, 5) * 100)
Example #9
 def test_mb(self):
     simanim = [1, 51, 202]
     all_matched = []
     for sim in simanim:
         ocRef = Ref('Shulchan Arukh, Orach Chayim {}'.format(sim))
         mbRef = Ref('Mishnah Berurah {}'.format(sim))
         octc = TextChunk(ocRef,"he")
         mbtc = TextChunk(mbRef,"he")
         matched = dhm.match_ref(octc, mbtc, base_tokenizer=mb_base_tokenizer, dh_extract_method=mb_dh_extraction_method, with_abbrev_matches=True)
         matched['abbrevs'] = [[unicode(am) for am in seg] for seg in matched['abbrevs']]
         all_matched.append(matched)
     #pickle.dump(all_matched, open('mb_matched.pkl','wb'))
     comparison = pickle.load(open('mb_matched.pkl', 'rb'))
     #comparison = [comparison[1]]
     assert comparison == all_matched
Example #10
 def test_mb(self):
     simanim = [1, 51, 202]
     all_matched = []
     for sim in simanim:
         ocRef = Ref('Shulchan Arukh, Orach Chayim {}'.format(sim))
         mbRef = Ref('Mishnah Berurah {}'.format(sim))
         octc = TextChunk(ocRef,"he")
         mbtc = TextChunk(mbRef,"he")
         matched = dhm.match_ref(octc, mbtc, base_tokenizer=mb_base_tokenizer, dh_extract_method=mb_dh_extraction_method, with_abbrev_matches=True)
         matched['abbrevs'] = [[unicode(am) for am in seg] for seg in matched['abbrevs']]
         all_matched.append(matched)
     #pickle.dump(all_matched, open('mb_matched.pkl','wb'))
     comparison = pickle.load(open('mb_matched.pkl', 'rb'))
     #comparison = [comparison[1]]
     assert comparison == all_matched
Example #11
    def match(self, base_ref, comment_ref, verbose=False):
        assert isinstance(base_ref, Ref)
        assert isinstance(comment_ref, Ref)

        base_version = "Tanach with Text Only"
        mei_version = 'Divrei Emet, Zalkowa 1801'
        word_count = base_ref.text('he', base_version).word_count()

        return match_ref(base_ref.text('he', base_version),
                         comment_ref.text('he', mei_version),
                         self._base_tokenizer,
                         dh_extract_method=self._dh_extract_method,
                         verbose=verbose,
                         rashi_filter=self._filter,
                         char_threshold=0.4,
                         boundaryFlexibility=word_count)
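The dict returned here comes straight from match_ref; assuming the dict-returning version of match_ref used in Examples #5 and #13, the result can be consumed like this (linker, base_ref and comment_ref are assumed names):

# Hypothetical consumption of the dict returned by match(); 'matches' aligns
# index-by-index with 'comment_refs', as in Examples #5 and #13.
result = linker.match(base_ref, comment_ref)
for base, comment in zip(result['matches'], result['comment_refs']):
    if base is not None:
        link = [base.normal(), comment.normal()]  # one matched (base, comment) pair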
Example #12
def match():
    link_list = []
    num_matched = 0
    num_searched = 0

    rashba = Ref("Rashi on Berakhot")
    gemara = library.get_index("Berakhot")

    rashba_ref_list = [ref for ref in rashba.all_subrefs() if ref.text("he").text != []]
    gemara_ref_list = gemara.all_section_refs()

    gemara_ind = 0
    for rashba_ref in rashba_ref_list:
        while (
            gemara_ind < len(gemara_ref_list)
            and gemara_ref_list[gemara_ind].normal_last_section() != rashba_ref.normal_last_section()
        ):
            gemara_ind += 1
        gemara_ref = gemara_ref_list[gemara_ind]

        rashba_tc = TextChunk(rashba_ref, "he")
        gemara_tc = TextChunk(gemara_ref, "he")

        ref_map_with_abbrevs = dibur_hamatchil_matcher.match_ref(
            rashba_tc,
            gemara_tc,
            base_tokenizer=tokenize_words,
            dh_extract_method=dh_extraction_method,
            verbose=True,
            with_abbrev_matches=True,
        )
        ref_map = [(tup[0], tup[1]) for tup in ref_map_with_abbrevs]

        temp_link_list = [l for l in ref_map if not l[0] is None and not l[1] is None]
        link_list += temp_link_list
        unlink_list = [ul[1] for ul in ref_map if ul[0] is None or ul[1] is None]
        for r in ref_map:
            if not r[0] is None:
                num_matched += 1

        num_searched += len(ref_map)

        print "MATCHES - {}".format(ref_map)
        print "ACCURACY - {}%".format(round(1.0 * num_matched / num_searched, 5) * 100)
Example #13
def link_drashot():
    matched=0.00
    total=0.00
    errored = []
    not_matched = []
    start_parsha_parse = False
    for parsha in eng_parshiot:
        if "B" in parsha:
            start_parsha_parse=True
        if start_parsha_parse and "Miketz" not in parsha and "Pekudei" not in parsha:
            parsha_chunk = TextChunk(Ref("Parashat "+parsha),"he","Tanach with Text Only")
            bih_chunk = TextChunk(Ref('Ben Ish Hai, Drashot, '+parsha),"he","NEW VERSION")
            word_count = parsha_chunk.word_count()
            bih_links = match_ref(parsha_chunk,bih_chunk,base_tokenizer,dh_extract_method=dh_extract_method,verbose=True,rashi_filter=_filter, boundaryFlexibility=word_count-1, char_threshold=1.8)
            for base, comment in zip(bih_links["matches"],bih_links["comment_refs"]):
                print "B",base,"C", comment
                print bih_links.get('refs')
                if base:
                    link = (
                            {
                            "refs": [
                                     base.normal(),
                                     comment.normal(),
                                     ],
                            "type": "commentary",
                            "auto": True,
                            "generated_by": "sterling_ben_ish_hai_linker"
                            })
                    post_link(link, weak_network=True)    
                    matched+=1
                #if there is no match and there is only one comment, default will be to link it to that comment    
                else:
                    not_matched.append(parsha)
                total+=1
    if total!=0:
        pm = matched/total
        print "Percent matched: "+str(pm)
    else:
        print "None matched :("
    print "Not Matched:"
    for nm in not_matched:
        print nm
Example #14
    def match(self, base_ref, comment_ref, verbose=False):
        assert isinstance(base_ref, Ref)
        assert isinstance(comment_ref, Ref)

        if self.version_map.get(base_ref.book) is None:
            self.set_version_by_category(
                base_ref.book
            )  # Books that can't be linked by category need to be set manually
        base_version, mei_version = self.version_map[
            base_ref.book], self.version_map[comment_ref.index.title]
        word_count = base_ref.text('he', base_version).word_count()

        return match_ref(base_ref.text('he', base_version),
                         comment_ref.text('he', mei_version),
                         self._base_tokenizer,
                         dh_extract_method=self._dh_extract_method,
                         verbose=verbose,
                         rashi_filter=self._filter,
                         char_threshold=0.4,
                         boundaryFlexibility=word_count)
Example #15
 def test_mb(self):
     simanim = [1, 51, 202]
     all_matched = []
     for sim in simanim:
         ocRef = Ref('Shulchan Arukh, Orach Chayim {}'.format(sim))
         mbRef = Ref('Mishnah Berurah {}'.format(sim))
         octc = TextChunk(ocRef,"he")
         mbtc = TextChunk(mbRef,"he")
         matched = dhm.match_ref(octc, mbtc, base_tokenizer=mb_base_tokenizer, dh_extract_method=mb_dh_extraction_method, with_abbrev_matches=True)
         # store dict in json serializable format
         matched[u'abbrevs'] = [[unicode(am) for am in seg] for seg in matched[u'abbrevs']]
         matched[u'comment_refs'] = [unicode(r.normal()) if r is not None else r for r in matched[u'comment_refs']]
         matched[u'matches'] = [r.normal() if r is not None else r for r in matched[u'matches']]
         matched[u'match_word_indices'] = [list(tup) for tup in matched[u'match_word_indices']]
         matched[u'match_text'] = [list(tup) for tup in matched[u'match_text']]
         all_matched.append(matched)
     #json.dump(all_matched, codecs.open('mb_matched.json', 'wb', encoding='utf8'))
     comparison = json.load(codecs.open('mb_matched.json', 'rb', encoding='utf8'))
     for a_siman, b_siman in zip(all_matched, comparison):
         for k, v in a_siman.items():
             assert v == b_siman[k]
Example #16
def get_matches_for_dict_and_link(dh_dict, base_text_title, commentary_title, talmud=True, lang='he', word_threshold=0.27, server="", rashi_filter=None, dh_extract_method=lambda x: x):
    def base_tokenizer(str):
        str_list = str.split(" ")
        return [str for str in str_list if len(str) > 0]


    assert len(server) > 0, "Please specify a server"
    results = {}
    links = []
    matched = 0
    total = 0
    for daf in dh_dict:
        print daf
        dhs = dh_dict[daf]
        if talmud:
            base_text_ref = "{} {}".format(base_text_title, AddressTalmud.toStr("en", daf))
            comm_ref = "{} on {} {}".format(commentary_title, base_text_title, AddressTalmud.toStr("en", daf))
        else:
            base_text_ref = "{} {}".format(base_text_title, daf)
            comm_ref = "{} on {} {}".format(commentary_title, base_text_title, daf)
        base_text = TextChunk(Ref(base_text_ref), lang=lang)
        comm_text = TextChunk(Ref(comm_ref), lang=lang)
        results[daf] = match_ref(base_text, comm_text, base_tokenizer=base_tokenizer, word_threshold=word_threshold, rashi_filter=rashi_filter, dh_extract_method=dh_extract_method)["matches"]
        for count, link in enumerate(results[daf]):
            if link:
                base_end = link.normal()
                comm_end = "{} on {} {}:{}".format(commentary_title, base_text_title, AddressTalmud.toStr("en", daf), count+1)
                links.append({
                    "refs": [base_end, comm_end],
                    "auto": True,
                    "type": "commentary",
                    "generated_by": commentary_title+base_text_title
                })
                matched += 1
            total += 1
    print "Matched: {}".format(matched)
    print "Total {}".format(total)
    post_link(links, server=server)

    return results
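A hypothetical call to get_matches_for_dict_and_link; the daf keys are the integer section numbers expected by AddressTalmud, the values are presumably the diburei hamatchil for that daf, and the titles and server URL are placeholders:

# Hypothetical call; dh_dict contents, titles and server are placeholders.
dh_dict = {
    3: [u"<first dibur hamatchil on the daf>", u"<second dibur hamatchil>"],
}
results = get_matches_for_dict_and_link(dh_dict, "Berakhot", "Rashi",
                                        talmud=True, lang="he",
                                        server="http://localhost:8000")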
Example #17
 def test_mb(self):
     simanim = [1, 51, 202]
     all_matched = []
     for sim in simanim:
         ocRef = Ref('Shulchan Arukh, Orach Chayim {}'.format(sim))
         mbRef = Ref('Mishnah Berurah {}'.format(sim))
         octc = TextChunk(ocRef, "he")
         mbtc = TextChunk(mbRef, "he")
         matched = dhm.match_ref(octc,
                                 mbtc,
                                 base_tokenizer=mb_base_tokenizer,
                                 dh_extract_method=mb_dh_extraction_method,
                                 with_abbrev_matches=True)
         # store dict in json serializable format
         matched['abbrevs'] = [[str(am) for am in seg]
                               for seg in matched['abbrevs']]
         matched['comment_refs'] = [
             str(r.normal()) if r is not None else r
             for r in matched['comment_refs']
         ]
         matched['matches'] = [
             r.normal() if r is not None else r for r in matched['matches']
         ]
         matched['match_word_indices'] = [
             list(tup) for tup in matched['match_word_indices']
         ]
         matched['match_text'] = [
             list(tup) for tup in matched['match_text']
         ]
         all_matched.append(matched)
     #json.dump(all_matched, codecs.open('mb_matched.json', 'wb', encoding='utf8'))
     comparison = json.load(
         codecs.open('mb_matched.json', 'rb', encoding='utf8'))
     for a_siman, b_siman in zip(all_matched, comparison):
         for k, v in list(a_siman.items()):
             assert v == b_siman[k]
Example #18
                    break

            def base_tokenizer(str):
                str = re.sub(r"\([^\(\)]+\)", "", str)
                word_list = re.split(r"\s+", str)
                word_list = [w for w in word_list if w]  # remove empty strings
                return word_list

            def dh_extraction_method(str):
                m = re.match(r"([^\.]+\.\s)?([^–]+)\s–", str)
                if m:
                    return m.group(2)
                else:
                    return ""

            yo = dibur_hamatchil_matcher.match_ref(
                base_tc, comment_list, base_tokenizer=base_tokenizer)
            for i, yoyo in enumerate(yo):
                if yoyo is None:
                    num_missed += 1

                if i > 0 and not yo[i - 1] is None and not yoyo is None:
                    prange = yo[i - 1].range_list()
                    nrange = yoyo.range_list()

                    if prange[-1] == nrange[0]:
                        print("{} is split".format(nrange[0]))
                        num_split += 1

                total_sef += len(yoyo.range_list()) if yoyo else 0
                total_koren += 1
Example #19
def match():
    mes = "Sanhedrin"
    yr  = library.get_index("Yad Ramah on {}".format(mes))
    gem = library.get_index("{}".format(mes))

    yrRefList  = yr.all_section_refs()[:5]
    gemRefList = gem.all_section_refs()
    gemInd = 0

    num_matched = 0
    num_searched = 0

    link_list = []
    log = open("yad_ramah.log","w")
    rt_log = open("yad_ramah_rashei_tevot.csv","w")
    rt_log_csv = unicodecsv.DictWriter(rt_log, fieldnames=["abbrev","expanded","context_before","context_after"])
    rt_log_csv.writeheader()
    for yrRef in yrRefList:
        while gemRefList[gemInd].sections[0] != yrRef.sections[0]:
            gemInd += 1
        gemRef = gemRefList[gemInd]
        print("----- {} -----".format(gemRef))
        log.write("----- {} -----\n".format(gemRef))

        yrtc = TextChunk(yrRef,'he')

        # let's extend the range of gemara_tc to account for weird rashba stuff
        num_refs_to_expand = 2

        gemara_ref_before = gemRef.prev_section_ref()
        gemara_ref_after = gemRef.next_section_ref()
        if gemara_ref_before and len(gemara_ref_before.all_subrefs()) >= num_refs_to_expand:
            gemRef = gemara_ref_before.all_subrefs()[-num_refs_to_expand].to(gemRef)
        if gemara_ref_after and len(gemara_ref_after.all_subrefs()) >= num_refs_to_expand:
            gemRef = gemRef.to(gemara_ref_after.all_subrefs()[num_refs_to_expand - 1])

        gemtc = TextChunk(gemRef,'he')

        def base_tokenizer(string):
            return dhm.get_maximum_dh(gemtc,string,max_dh_len=6)


        matched = dhm.match_ref(gemtc, yrtc, base_tokenizer=base_tokenizer,
                                                    dh_extract_method=dh_extraction_method, verbose=True,
                                                    with_abbrev_matches=True, rashi_filter=rashi_filter)

        ref_map = [(base, comment) for base, comment in zip(matched['matches'], matched['comment_refs'])]
        abbrevs = [am for seg in matched['abbrevs'] for am in seg]
        for am in abbrevs:
            rt_log_csv.writerow(
                {'abbrev': dhm.cleanAbbrev(am.abbrev), 'expanded': ' '.join(am.expanded),
                 'context_before': ' '.join(am.contextBefore), 'context_after': ' '.join(am.contextAfter)})

        temp_link_list = [l for l in ref_map if not l[0] is None and not l[1] is None]
        link_list += temp_link_list
        unlink_list = [ul[1] for ul in ref_map if ul[0] is None or ul[1] is None]
        for r in ref_map:
            if not r[0] is None: num_matched += 1

        num_searched += len(ref_map)

        print("MATCHES - {}".format(ref_map))
        print("ACCURACY - {}%".format(round(1.0 * num_matched / num_searched, 5) * 100))
        log.write("MATCHES - {}\n".format(temp_link_list))
        log.write("NOT FOUND - {}\n".format(unlink_list))
        log.write("ACCURACY - {}%\n".format(round(1.0 * num_matched / num_searched, 5) * 100))
    doc = {"link_list": [[link[0].normal(), link[1].normal()] for link in link_list]}
    fp = codecs.open("yad_ramah_links.json", "w", encoding='utf-8')
    json.dump(doc, fp, indent=4, encoding='utf-8', ensure_ascii=False)
    fp.close()
    log.close()
    rt_log.close()
Example #20
    i_rif = 0
    i_gem = 0
    last_update_was_zero = False
    while i_rif < len(rif_segs) - rif_window and i_gem < len(
            gem_segs) - gem_window:
        temp_rif_tc = rif_segs[i_rif].to(rif_segs[i_rif +
                                                  rif_window]).text("he")
        temp_gem_tc = gem_segs[i_gem].to(gem_segs[i_gem + gem_window]).text(
            lang="he", vtitle=vtitle)
        print("{}, {}, {}".format(temp_rif_tc, temp_gem_tc, len(links)))

        matched = dibur_hamatchil_matcher.match_ref(
            temp_gem_tc,
            temp_rif_tc,
            base_tokenizer=tokenize_words,
            dh_extract_method=dh_extraction_method,
            verbose=False,
            with_abbrev_matches=True)

        first_matches = matched['matches']
        match_indices = matched['match_word_indices']

        for i in range(1, 5):

            # let's try again, but with shorter dhs and imposing order
            start_pos = i * 2

            def dh_extraction_method_short(s):
                dh = dh_extraction_method(s)
                dh_split = re.split(r'\s+', dh)
Example #21
            def base_tokenizer(str):
                str = re.sub(ur"\([^\(\)]+\)", u"", str)
                word_list = re.split(ur"\s+", str)
                word_list = [w for w in word_list if w]  # remove empty strings
                return word_list


            def dh_extraction_method(str):
                m = re.match(ur"([^\.]+\.\s)?([^–]+)\s–", str)
                if m:
                    return m.group(2)
                else:
                    return ""

            yo = dibur_hamatchil_matcher.match_ref(base_tc, comment_list, base_tokenizer=base_tokenizer)
            for i,yoyo in enumerate(yo):
                if yoyo is None:
                    num_missed += 1

                if i > 0 and not yo[i-1] is None and not yoyo is None:
                    prange = yo[i-1].range_list()
                    nrange = yoyo.range_list()


                    if prange[-1] == nrange[0]:
                        print "{} is split".format(nrange[0])
                        num_split += 1

                total_sef += len(yoyo.range_list()) if yoyo else 0
                total_koren += 1
Example #22
    rif = Ref("Rif {}".format(mes))
    gem = Ref("{}".format(mes))
    rif_segs = rif.text("he").nonempty_subrefs()
    vtitle = 'William Davidson Edition - Aramaic' if mes in all_william else None
    gem_segs = gem.text(lang="he",vtitle=vtitle).nonempty_subrefs()

    i_rif = 0
    i_gem = 0
    last_update_was_zero = False
    while i_rif < len(rif_segs) - rif_window and i_gem < len(gem_segs) - gem_window:
        temp_rif_tc = rif_segs[i_rif].to(rif_segs[i_rif + rif_window]).text("he")
        temp_gem_tc = gem_segs[i_gem].to(gem_segs[i_gem + gem_window]).text(lang="he", vtitle=vtitle)
        print "{}, {}, {}".format(temp_rif_tc, temp_gem_tc, len(links))

        matched = dibur_hamatchil_matcher.match_ref(temp_gem_tc, temp_rif_tc, base_tokenizer=tokenize_words,
                                                    dh_extract_method=dh_extraction_method, verbose=False,
                                                    with_abbrev_matches=True)

        first_matches = matched['matches']
        match_indices = matched['match_word_indices']

        for i in range(1, 5):

            # let's try again, but with shorter dhs and imposing order
            start_pos = i * 2


            def dh_extraction_method_short(s):
                dh = dh_extraction_method(s)
                dh_split = re.split(ur'\s+', dh)
                if len(dh_split) > start_pos + 4:
Example #23
    def match_multiple(self, base_patterns, split_into_base_texts, rules, dh_extraction_methods, base_tokenizers, rashi_filters, matched_dir, not_matched_dir):
        """
        This function is used when a commentary matches to multiple base texts. e.g. Maharam Shif sometimes links to Rashi, sometimes Gemara


        :param list base_patterns: list of base text patterns, e.g. for Ritva the pattern would be "Ritva on"; for Gemara, the pattern would be "" because the mesechta is appended automatically. len() == len(rules)
        :param function split_into_base_texts: f(list[str], TextChunk) -> list, list function that takes rules and outputs which refs in the commentary should be matched to which base text. For an example implementation, look at Sefaria-Data/research/dibur_hamatchil/dh_source_scripts/gemara_commentaries/maharam_shif/maharam_shif.py
        :param list rules: list of regex to discriminate into different base texts
        :param list dh_extraction_methods: list of dh_extraction_methods. len() == len(rules)
        :param list base_tokenizers: list of base_tokenizers. len() == len(rules)
        :param list rashi_filters: ditto
        :param str matched_dir:
        :param str not_matched_dir:
        :return: None
        """

        num_matched = 0
        num_searched = 0
        for mesechta in self.mes_list:
            link_list = []
            unlink_list = []
            comment = library.get_index("{} {}".format(self.commentary_pattern, mesechta))
            comment_ref_list = comment.all_section_refs()

            for icomment, comment_ref in enumerate(comment_ref_list):
                daf = comment_ref.normal_last_section()
                print u'-----{} {} Start ({}/{})-----'.format(mesechta, daf, icomment, len(comment_ref_list))
                comment_tc = TextChunk(comment_ref, "he")


                splitted, oto_dibur = split_into_base_texts(rules, comment_tc)

                for (temp_comment_refs, temp_comment_texts), temp_dh_extract, temp_base_tokenizer, temp_rashi_filter, base_pattern in zip(splitted, dh_extraction_methods, base_tokenizers, rashi_filters, base_patterns):
                    print u"--- DOING {} {} ---".format(base_pattern, mesechta)
                    temp_base_ref = Ref("{} {} {}".format(base_pattern, mesechta, daf))

                    num_refs_to_expand = 2

                    gemara_ref_before = temp_base_ref.prev_section_ref()
                    gemara_ref_after = temp_base_ref.next_section_ref()
                    if gemara_ref_before:
                        try:
                            if len(gemara_ref_before.all_subrefs()) >= num_refs_to_expand:
                                temp_base_ref = gemara_ref_before.all_subrefs()[-num_refs_to_expand].to(temp_base_ref)
                            else:
                                temp_base_ref = gemara_ref_before.all_subrefs()[0].to(temp_base_ref)
                        except InputError:
                            pass # there was a problem extending. ignore

                    if gemara_ref_after:
                        try:
                            if len(gemara_ref_after.all_subrefs()) >= num_refs_to_expand:
                                temp_base_ref = temp_base_ref.to(gemara_ref_after.all_subrefs()[num_refs_to_expand - 1])
                            else:
                                temp_base_ref = temp_base_ref.to(gemara_ref_after.all_subrefs()[-1])
                        except InputError:
                            pass

                    temp_base_tc = temp_base_ref.text("he")
                    try:
                        matched = dibur_hamatchil_matcher.match_ref(temp_base_tc, temp_comment_texts, temp_base_tokenizer,
                                                                    dh_extract_method=temp_dh_extract, verbose=False,
                                                                    with_abbrev_matches=True,
                                                                    boundaryFlexibility=10000,
                                                                    char_threshold=0.4,
                                                                    rashi_filter=temp_rashi_filter)
                    except IndexError as e:
                        print e
                        continue
                    first_matches = matched['matches']
                    match_indices = matched['match_word_indices']

                    for i in range(1, 5):

                        # let's try again, but with shorter dhs and imposing order
                        start_pos = i * 2

                        def dh_extraction_method_short(s):
                            dh = temp_dh_extract(s)
                            dh_split = re.split(ur'\s+', dh)
                            if len(dh_split) > start_pos + 4:
                                dh = u' '.join(dh_split[start_pos:start_pos + 4])

                            return dh

                        matched = dibur_hamatchil_matcher.match_ref(temp_base_tc, temp_comment_texts, temp_base_tokenizer,
                                                                    dh_extract_method=dh_extraction_method_short,
                                                                    verbose=False,
                                                                    with_abbrev_matches=True,
                                                                    boundaryFlexibility=4,
                                                                    prev_matched_results=match_indices,
                                                                    rashi_filter=temp_rashi_filter)

                        match_indices = matched['match_word_indices']

                    matches = matched['matches']

                    ref_map = zip(matches, temp_comment_refs) # assumption that rashi_filter doesn't do anything

                    # add oto_diburs to ref_map
                    for br, cr in reversed(ref_map):
                        if str(cr) in oto_dibur:
                            oto_dibured = oto_dibur[str(cr)]
                            for od in oto_dibured:
                                ref_map += [(br, od)]

                    #TODO add super-base link if this is a super-commentary

                    temp_link_list = [[str(l[0]), str(l[1])] for l in ref_map if not l[0] is None and not l[1] is None]
                    link_list += temp_link_list
                    temp_unlink_list = [str(ul[1]) for ul in ref_map if ul[0] is None or ul[1] is None]
                    unlink_list += temp_unlink_list
                    for r in ref_map:
                        if not r[0] is None: num_matched += 1

                    num_searched += len(ref_map)

                    print "MATCHES - {}".format(ref_map)
                    for first, second in zip(first_matches, matches):
                        if first is None and not second is None:
                            print u"GOT {}".format(second)

                acc = round(1.0 * num_matched / num_searched, 5) * 100 if num_searched > 0 else 0.0
                print "ACCURACY - {}%".format(acc)
                # log.write("MATCHES - {}\n".format(temp_link_list))
                # log.write("NOT FOUND - {}\n".format(unlink_list))
                # log.write("ACCURACY - {}%\n".format(round(1.0 * num_matched / num_searched, 5) * 100))
                print u'----- {} {} End -----'.format(mesechta, daf)

            with open('{}/{}.json'.format(matched_dir, mesechta), 'wb') as out:
                json.dump(link_list, out, indent=4)

            with open('{}/{}.json'.format(not_matched_dir, mesechta), 'wb') as out:
                json.dump(unlink_list, out, indent=4)
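A hypothetical sketch of driving match_multiple, with every pattern, rule and helper below a placeholder rather than the repository's actual configuration (the maharam_shif script mentioned in the docstring holds a real one); the match method defined in the next example takes the scalar counterparts of the same arguments:

# Hypothetical driver for match_multiple; `matcher` is an instance of this class, and
# split_comments, gemara_dh, rashi_dh and tokenizer are assumed helper functions.
matcher.match_multiple(
    base_patterns=["", "Rashi on"],          # "" -> the Gemara itself, "Rashi on" -> Rashi
    split_into_base_texts=split_comments,    # f(rules, TextChunk) -> (splitted, oto_dibur)
    rules=[u"^גמ'", u'^רש"י'],               # regexes deciding which base text a comment targets
    dh_extraction_methods=[gemara_dh, rashi_dh],
    base_tokenizers=[tokenizer, tokenizer],
    rashi_filters=[None, None],
    matched_dir="matched",
    not_matched_dir="not_matched")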
Example #24
    def match(self, dh_extraction_method, base_tokenizer, rashi_filter, matched_dir, not_matched_dir):
        """
        This function matches a whole commentary against each of the mesechtot it comments on,
        e.g. all of Rashba against the mesechtot Rashba comments on.
        It outputs JSON files with the links in the specified directories.
        :param dh_extraction_method: see dibur_hamatchil_matcher.match_ref()
        :param base_tokenizer: see dibur_hamatchil_matcher.match_ref()
        :param rashi_filter: see dibur_hamatchil_matcher.match_ref()
        :param matched_dir: directory where output of matched links will be saved
        :param not_matched_dir: directory where output of not_matched links will be saved
        :return: None
        """
        num_matched = 0
        num_searched = 0
        for mesechta in self.mes_list:
            link_list = []
            unlink_list = []
            comment = library.get_index("{} {}".format(self.commentary_pattern, mesechta))
            gemara = library.get_index("{}".format(mesechta))

            comment_ref_list = comment.all_section_refs()
            gemara_ref_list = gemara.all_section_refs()

            gemara_ind = 0
            for icomment, comment_ref in enumerate(comment_ref_list):
                while gemara_ref_list[gemara_ind].normal_last_section() != comment_ref.normal_last_section():
                    gemara_ind += 1
                gemara_ref = gemara_ref_list[gemara_ind]
                orig_gemara_ref = gemara_ref
                print u'----- {} Start ({}/{})-----'.format(orig_gemara_ref, icomment, len(comment_ref_list))
                comment_tc = TextChunk(comment_ref, "he")

                # let's extend the range of gemara_tc to account for weird rashba stuff
                num_refs_to_expand = 2

                gemara_ref_before = gemara_ref.prev_section_ref()
                gemara_ref_after = gemara_ref.next_section_ref()
                if gemara_ref_before and len(gemara_ref_before.all_subrefs()) >= num_refs_to_expand:
                    gemara_ref_expanded = gemara_ref_before.all_subrefs()[-num_refs_to_expand].to(gemara_ref)
                if gemara_ref_after and len(gemara_ref_after.all_subrefs()) >= num_refs_to_expand:
                    gemara_ref_expanded = gemara_ref_expanded.to(gemara_ref_after.all_subrefs()[num_refs_to_expand - 1])

                vtitle = 'William Davidson Edition - Aramaic' if mesechta in self.all_william else None
                try:
                    gemara_tc = TextChunk(gemara_ref_expanded, lang='he', vtitle=vtitle)
                except Exception:
                    gemara_tc = TextChunk(gemara_ref, lang='he', vtitle=vtitle)

                matched = dibur_hamatchil_matcher.match_ref(gemara_tc, comment_tc, base_tokenizer=base_tokenizer,
                                                            dh_extract_method=dh_extraction_method, verbose=False,
                                                            with_abbrev_matches=True,
                                                            boundaryFlexibility=10000,
                                                            char_threshold=0.4,
                                                            rashi_filter=rashi_filter)

                first_matches = matched['matches']
                match_indices = matched['match_word_indices']

                for i in range(1, 5):

                    # let's try again, but with shorter dhs and imposing order
                    start_pos = i * 2

                    def dh_extraction_method_short(s):
                        dh = dh_extraction_method(s)
                        dh_split = re.split(ur'\s+', dh)
                        if len(dh_split) > start_pos + 4:
                            dh = u' '.join(dh_split[start_pos:start_pos + 4])

                        return dh

                    matched = dibur_hamatchil_matcher.match_ref(gemara_tc, comment_tc, base_tokenizer=base_tokenizer,
                                                                dh_extract_method=dh_extraction_method_short,
                                                                verbose=False,
                                                                with_abbrev_matches=True,
                                                                boundaryFlexibility=4,
                                                                prev_matched_results=match_indices,
                                                                rashi_filter=rashi_filter)

                    match_indices = matched['match_word_indices']

                if 'comment_refs' not in matched:
                    print 'NO COMMENTS'
                    continue
                matches = matched['matches']
                comment_refs = matched['comment_refs']

                ref_map = zip(matches, comment_refs)

                temp_link_list = [[str(l[0]), str(l[1])] for l in ref_map if not l[0] is None and not l[1] is None]
                link_list += temp_link_list
                temp_unlink_list = [str(ul[1]) for ul in ref_map if ul[0] is None or ul[1] is None]
                unlink_list += temp_unlink_list
                for r in ref_map:
                    if not r[0] is None: num_matched += 1

                num_searched += len(ref_map)

                print "MATCHES - {}".format(ref_map)
                for first, second in zip(first_matches, matches):
                    if first is None and not second is None:
                        print u"GOT {}".format(second)

                print "ACCURACY - {}%".format(round(1.0 * num_matched / num_searched, 5) * 100)
                # log.write("MATCHES - {}\n".format(temp_link_list))
                # log.write("NOT FOUND - {}\n".format(unlink_list))
                # log.write("ACCURACY - {}%\n".format(round(1.0 * num_matched / num_searched, 5) * 100))
                print u'----- {} End -----'.format(orig_gemara_ref)

            with open('{}/{}.json'.format(matched_dir, mesechta), 'wb') as out:
                json.dump(link_list, out, indent=4)

            with open('{}/{}.json'.format(not_matched_dir, mesechta), 'wb') as out:
                json.dump(unlink_list, out, indent=4)
Example #25
        curr_super_word = 0
        while curr_word < len(words_scraped):
            possible_range = range(curr_word, curr_word+chunk_size)
            if curr_super_word < len(super_word_list) and super_word_list[curr_super_word] - 1 in possible_range:
                comments_scraped += [u" ".join(words_scraped[curr_word:super_word_list[curr_super_word]])]
                curr_word = super_word_list[curr_super_word]
            elif curr_super_word < len(super_word_list) and super_word_list[curr_super_word] in possible_range:
                temp_chunk = min(super_word_list[curr_super_word+1] - curr_word, super_chunk_size) if curr_super_word + 1 < len(super_word_list) else super_chunk_size
                comments_scraped += [u" ".join(words_scraped[curr_word:curr_word + temp_chunk])]
                super_comment_list += [len(comments_scraped)-1]
                curr_word += temp_chunk
                curr_super_word += 1
            else:
                comments_scraped += [u" ".join(words_scraped[curr_word:curr_word + chunk_size])]
                curr_word += chunk_size
        matched = dibur_hamatchil_matcher.match_ref(daf_ref.text("he"), comments_scraped, base_tokenizer,  with_abbrev_matches=True, with_num_abbrevs=False, place_consecutively=False)

        try:
            super_sefaria_list = [matched["matches"][sc].normal() for sc in super_comment_list]
        except AttributeError:
            print "OH NO!", daf_ref, matched["matches"]
            super_sefaria_list = [matched["matches"][sc].normal() if matched["matches"][sc] is not None else None for sc in super_comment_list]

        wiki_snippets = [u" <{}> ".format(bleach.clean(sim, strip=True, tags=[])).join([comments_scraped[sc-1],
                         comments_scraped[sc]]) if sc != 0 else u" <{}> {}".format(bleach.clean(sim, strip=True, tags=[]
                         ), comments_scraped[sc]) for sim, sc in zip(daf_scraped["super_simanim"], super_comment_list)]
        out_rows += [
            {"Index": ind,
             "Daf": daf_ref.normal(),
             "Siman": bleach.clean(sim, strip=True, tags=[]),
             "Siman Ref": Ref(sim_ref).starting_ref().normal() if sim_ref is not None else u"N/A",
Example #26
    def match(self, dh_extraction_method, base_tokenizer, rashi_filter, matched_dir, not_matched_dir):
        """
        This function matches a whole commentary against each of the mesechtot it comments on,
        e.g. all of Rashba against the mesechtot Rashba comments on.
        It outputs JSON files with the links in the specified directories.
        :param dh_extraction_method: see dibur_hamatchil_matcher.match_ref()
        :param base_tokenizer: see dibur_hamatchil_matcher.match_ref()
        :param rashi_filter: see dibur_hamatchil_matcher.match_ref()
        :param matched_dir: directory where output of matched links will be saved
        :param not_matched_dir: directory where output of not_matched links will be saved
        :return: None
        """
        num_matched = 0
        num_searched = 0
        for mesechta in self.mes_list:
            link_list = []
            unlink_list = []
            comment = library.get_index("{} {}".format(self.commentary_pattern, mesechta))
            gemara = library.get_index("{}".format(mesechta))

            comment_ref_list = comment.all_section_refs()
            gemara_ref_list = gemara.all_section_refs()

            gemara_ind = 0
            for icomment, comment_ref in enumerate(comment_ref_list):

                # set gemara ref to the same daf as our comment_ref
                while gemara_ref_list[gemara_ind].normal_last_section() != comment_ref.normal_last_section():
                    gemara_ind += 1
                gemara_ref = gemara_ref_list[gemara_ind]

                orig_gemara_ref = gemara_ref
                print('----- {} Start ({}/{})-----'.format(orig_gemara_ref, icomment, len(comment_ref_list)))
                comment_tc = TextChunk(comment_ref, "he")

                # let's extend the range of gemara_tc to account for weird rashba stuff
                num_refs_to_expand = 2

                gemara_ref_before = gemara_ref.prev_section_ref()
                gemara_ref_after = gemara_ref.next_section_ref()
                gemara_ref_expanded = gemara_ref

                if gemara_ref_before and len(gemara_ref_before.all_subrefs()) >= num_refs_to_expand:
                    gemara_ref_expanded = gemara_ref_before.all_subrefs()[-num_refs_to_expand].to(gemara_ref)
                if gemara_ref_after and len(gemara_ref_after.all_subrefs()) >= num_refs_to_expand:
                    gemara_ref_expanded = gemara_ref_expanded.to(gemara_ref_after.all_subrefs()[num_refs_to_expand - 1])

                vtitle = 'William Davidson Edition - Aramaic' if mesechta in self.all_william else None
                gemara_tc = TextChunk(gemara_ref_expanded, lang='he', vtitle=vtitle)

                matched = dibur_hamatchil_matcher.match_ref(gemara_tc, comment_tc, base_tokenizer=base_tokenizer,
                                                            dh_extract_method=dh_extraction_method, verbose=False,
                                                            with_abbrev_matches=True,
                                                            boundaryFlexibility=10000,
                                                            char_threshold=0.4,
                                                            rashi_filter=rashi_filter)

                first_matches = matched['matches']
                match_indices = matched['match_word_indices']

                for i in range(1, 5):

                    # let's try again, but with shorter dhs and imposing order
                    start_pos = i * 2

                    def dh_extraction_method_short(s):
                        dh = dh_extraction_method(s)
                        dh_split = re.split(r'\s+', dh)
                        if len(dh_split) > start_pos + 4:
                            dh = ' '.join(dh_split[start_pos:start_pos + 4])

                        return dh

                    matched = dibur_hamatchil_matcher.match_ref(gemara_tc, comment_tc, base_tokenizer=base_tokenizer,
                                                                dh_extract_method=dh_extraction_method_short,
                                                                verbose=False,
                                                                with_abbrev_matches=True,
                                                                boundaryFlexibility=4,
                                                                prev_matched_results=match_indices,
                                                                rashi_filter=rashi_filter)

                    match_indices = matched['match_word_indices']

                if 'comment_refs' not in matched:
                    print('NO COMMENTS')
                    continue
                matches = matched['matches']
                comment_refs = matched['comment_refs']

                ref_map = list(zip(matches, comment_refs))

                temp_link_list = [[str(l[0]), str(l[1])] for l in ref_map if not l[0] is None and not l[1] is None]
                link_list += temp_link_list
                temp_unlink_list = [str(ul[1]) for ul in ref_map if ul[0] is None or ul[1] is None]
                unlink_list += temp_unlink_list
                for r in ref_map:
                    if not r[0] is None: num_matched += 1

                num_searched += len(ref_map)

                print("MATCHES - {}".format(ref_map))
                for first, second in zip(first_matches, matches):
                    if first is None and not second is None:
                        print("GOT {}".format(second))

                print("ACCURACY - {}%".format(round(1.0 * num_matched / num_searched, 5) * 100))
                # log.write("MATCHES - {}\n".format(temp_link_list))
                # log.write("NOT FOUND - {}\n".format(unlink_list))
                # log.write("ACCURACY - {}%\n".format(round(1.0 * num_matched / num_searched, 5) * 100))
                print('----- {} End -----'.format(orig_gemara_ref))

            with open('{}/{}.json'.format(matched_dir, mesechta), 'w') as out:
                json.dump(link_list, out, indent=4)

            with open('{}/{}.json'.format(not_matched_dir, mesechta), 'w') as out:
                json.dump(unlink_list, out, indent=4)
Example #27
    def match_multiple(self, base_patterns, split_into_base_texts, rules, dh_extraction_methods, base_tokenizers, rashi_filters, matched_dir, not_matched_dir):
        """
        This function is used when a commentary matches against multiple base texts, e.g. Maharam Shif sometimes links to Rashi and sometimes to the Gemara.

        :param list base_patterns: list of base text patterns. e.g. for Ritva the pattern would be "Ritva on"; for Gemara, the pattern would be "" because the mesechta is appended automatically. len() == len(rules)
        :param function split_into_base_texts: f(list[str], TextChunk) -> (list, list). Function that takes the rules and the commentary TextChunk and outputs which refs in the commentary should be matched to which base text, plus a dict of "oto dibur" refs. For an example implementation, look at Sefaria-Data/research/dibur_hamatchil/dh_source_scripts/gemara_commentaries/maharam_shif/maharam_shif.py; a rough sketch of the expected contract also follows this example.
        :param list rules: list of regexes used to discriminate between the different base texts
        :param list dh_extraction_methods: list of dh_extraction_methods. len() == len(rules)
        :param list base_tokenizers: list of base_tokenizers. len() == len(rules)
        :param list rashi_filters: list of rashi_filters. len() == len(rules)
        :param str matched_dir: directory to write matched links to
        :param str not_matched_dir: directory to write unmatched refs to
        :return: None
        """

        num_matched = 0
        num_searched = 0
        for mesechta in self.mes_list:
            link_list = []
            unlink_list = []
            comment = library.get_index("{} {}".format(self.commentary_pattern, mesechta))
            comment_ref_list = comment.all_section_refs()

            for icomment, comment_ref in enumerate(comment_ref_list):
                daf = comment_ref.normal_last_section()
                print('-----{} {} Start ({}/{})-----'.format(mesechta, daf, icomment, len(comment_ref_list)))
                comment_tc = TextChunk(comment_ref, "he")


                splitted, oto_dibur = split_into_base_texts(rules, comment_tc)

                for (temp_comment_refs, temp_comment_texts), temp_dh_extract, temp_base_tokenizer, temp_rashi_filter, base_pattern in zip(splitted, dh_extraction_methods, base_tokenizers, rashi_filters, base_patterns):
                    print("--- DOING {} {} ---".format(base_pattern, mesechta))
                    temp_base_ref = Ref("{} {} {}".format(base_pattern, mesechta, daf))

                    num_refs_to_expand = 2
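                    # widen the base ref by up to two segments into the previous and next sections,
                    # presumably so a comment whose dh sits just over the daf boundary can still match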

                    gemara_ref_before = temp_base_ref.prev_section_ref()
                    gemara_ref_after = temp_base_ref.next_section_ref()
                    if gemara_ref_before:
                        try:
                            if len(gemara_ref_before.all_subrefs()) >= num_refs_to_expand:
                                temp_base_ref = gemara_ref_before.all_subrefs()[-num_refs_to_expand].to(temp_base_ref)
                            else:
                                temp_base_ref = gemara_ref_before.all_subrefs()[0].to(temp_base_ref)
                        except InputError:
                            pass # there was a problem extending. ignore

                    if gemara_ref_after:
                        try:
                            if len(gemara_ref_after.all_subrefs()) >= num_refs_to_expand:
                                temp_base_ref = temp_base_ref.to(gemara_ref_after.all_subrefs()[num_refs_to_expand - 1])
                            else:
                                temp_base_ref = temp_base_ref.to(gemara_ref_after.all_subrefs()[-1])
                        except InputError:
                            pass

                    temp_base_tc = temp_base_ref.text("he")
                    try:
                        matched = dibur_hamatchil_matcher.match_ref(temp_base_tc, temp_comment_texts, temp_base_tokenizer,
                                                                    dh_extract_method=temp_dh_extract, verbose=False,
                                                                    with_abbrev_matches=True,
                                                                    boundaryFlexibility=10000,
                                                                    char_threshold=0.4,
                                                                    rashi_filter=temp_rashi_filter)
                    except IndexError as e:
                        print(e)
                        continue
                    first_matches = matched['matches']
                    match_indices = matched['match_word_indices']

                    for i in range(1, 5):

                        # let's try again, but with shorter dhs and imposing order
                        start_pos = i * 2

                        def dh_extraction_method_short(s):
                            dh = temp_dh_extract(s)
                            dh_split = re.split(r'\s+', dh)
                            if len(dh_split) > start_pos + 4:
                                dh = ' '.join(dh_split[start_pos:start_pos + 4])

                            return dh

                        matched = dibur_hamatchil_matcher.match_ref(temp_base_tc, temp_comment_texts, temp_base_tokenizer,
                                                                    dh_extract_method=dh_extraction_method_short,
                                                                    verbose=False,
                                                                    with_abbrev_matches=True,
                                                                    boundaryFlexibility=4,
                                                                    prev_matched_results=match_indices,
                                                                    rashi_filter=temp_rashi_filter)

                        match_indices = matched['match_word_indices']

                    matches = matched['matches']

                    ref_map = list(zip(matches, temp_comment_refs)) # assumption that rashi_filter doesn't do anything

                    # add oto_diburs to ref_map
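                    # oto_dibur apparently maps a matched comment ref (as a string) to other comments
                    # that share its dibur hamatchil, so they get linked to the same base ref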
                    for br, cr in reversed(ref_map):
                        if str(cr) in oto_dibur:
                            oto_dibured = oto_dibur[str(cr)]
                            for od in oto_dibured:
                                ref_map += [(br, od)]

                    #TODO add super-base link if this is a super-commentary

                    temp_link_list = [[str(l[0]), str(l[1])] for l in ref_map if l[0] is not None and l[1] is not None]
                    link_list += temp_link_list
                    temp_unlink_list = [str(ul[1]) for ul in ref_map if ul[0] is None or ul[1] is None]
                    unlink_list += temp_unlink_list
                    for r in ref_map:
                        if r[0] is not None:
                            num_matched += 1

                    num_searched += len(ref_map)

                    print("MATCHES - {}".format(ref_map))
                    for first, second in zip(first_matches, matches):
                        if first is None and second is not None:
                            print("GOT {}".format(second))

                acc = round(1.0 * num_matched / num_searched, 5) * 100 if num_searched > 0 else 0.0
                print("ACCURACY - {}%".format(acc))
                # log.write("MATCHES - {}\n".format(temp_link_list))
                # log.write("NOT FOUND - {}\n".format(unlink_list))
                # log.write("ACCURACY - {}%\n".format(round(1.0 * num_matched / num_searched, 5) * 100))
                print('----- {} {} End -----'.format(mesechta, daf))

            with open('{}/{}.json'.format(matched_dir, mesechta), 'w') as out:
                json.dump(link_list, out, indent=4)

            with open('{}/{}.json'.format(not_matched_dir, mesechta), 'w') as out:
                json.dump(unlink_list, out, indent=4)
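
The docstring in this example leaves the shape of split_into_base_texts's return value implicit, so here is a minimal sketch of the contract the calling code appears to expect: a list of (segment_refs, segment_texts) pairs aligned with rules, plus an oto_dibur dict keyed by the string of a matched comment ref and holding refs that should inherit its base match. Everything below is inferred from how the return values are consumed above, not taken from the real implementation in maharam_shif.py; the rule-matching logic and the use of TextChunk._oref are assumptions.

import re

def split_into_base_texts_sketch(rules, comment_tc):
    # comment_tc.text is assumed to hold the section's segment strings; the underlying Ref
    # is reached through the private _oref attribute (an assumption, not public API)
    seg_texts = comment_tc.text
    seg_refs = comment_tc._oref.all_subrefs()
    splitted = [([], []) for _ in rules]  # one (refs, texts) bucket per rule, in rule order
    oto_dibur = {}
    last_ref = None
    for seg_ref, seg_text in zip(seg_refs, seg_texts):
        for ibucket, rule in enumerate(rules):
            if re.search(rule, seg_text):  # note: an empty-string rule matches every segment
                splitted[ibucket][0].append(seg_ref)
                splitted[ibucket][1].append(seg_text)
                last_ref = seg_ref
                break
        else:
            # guess: a segment matching no rule continues the previous segment's dibur
            # hamatchil ("oto dibur"), so it should simply inherit that segment's base match
            if last_ref is not None:
                oto_dibur.setdefault(str(last_ref), []).append(seg_ref)
    return splitted, oto_dibur

Because the caller zips splitted against dh_extraction_methods, base_tokenizers and rashi_filters, the buckets must stay in the same order as rules.
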
Example #28
0
                    curr_word, super_chunk_size) if curr_super_word + 1 < len(
                        super_word_list) else super_chunk_size
                comments_scraped += [
                    u" ".join(words_scraped[curr_word:curr_word + temp_chunk])
                ]
                super_comment_list += [len(comments_scraped) - 1]
                curr_word += temp_chunk
                curr_super_word += 1
            else:
                comments_scraped += [
                    u" ".join(words_scraped[curr_word:curr_word + chunk_size])
                ]
                curr_word += chunk_size
        matched = dibur_hamatchil_matcher.match_ref(daf_ref.text("he"),
                                                    comments_scraped,
                                                    base_tokenizer,
                                                    with_abbrev_matches=True,
                                                    with_num_abbrevs=False,
                                                    place_consecutively=False)

        try:
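            # .normal() raises AttributeError when a match is None; the except below rebuilds
            # the list with an explicit None check instead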
            super_sefaria_list = [
                matched["matches"][sc].normal() for sc in super_comment_list
            ]
        except AttributeError:
            print "OH NO!", daf_ref, matched["matches"]
            super_sefaria_list = [
                matched["matches"][sc].normal()
                if matched["matches"][sc] is not None else None
                for sc in super_comment_list
            ]