def match():
    """
    Align each non-empty section of "Rashi on Berakhot" with the Berakhot
    section on the same daf, run the dibur-hamatchil matcher on each pair,
    and accumulate (base, comment) link pairs, printing a running accuracy.

    Python 2 era script; prints use the parenthesized form, which behaves
    identically under both Python 2 and 3 for a single argument.
    """
    link_list = []
    num_matched = 0
    num_searched = 0
    rashba = Ref("Rashi on Berakhot")
    gemara = library.get_index("Berakhot")
    # Skip commentary sections that contain no Hebrew text at all.
    rashba_ref_list = [ref for ref in rashba.all_subrefs() if ref.text('he').text != []]
    gemara_ref_list = gemara.all_section_refs()
    gemara_ind = 0
    for rashba_ref in rashba_ref_list:
        # Advance the base-text cursor until its daf matches the commentary's daf.
        while gemara_ind < len(gemara_ref_list) and gemara_ref_list[gemara_ind].normal_last_section() != rashba_ref.normal_last_section():
            gemara_ind += 1
        if gemara_ind >= len(gemara_ref_list):
            # Ran off the end of the base text: no daf lines up with the
            # remaining commentary sections. Stop instead of raising
            # IndexError on the lookup below.
            break
        # FIX: the aligned base ref must come from gemara_ref_list; the
        # original indexed rashba_ref_list here, pairing the commentary
        # against itself instead of against the gemara.
        gemara_ref = gemara_ref_list[gemara_ind]
        rashba_tc = TextChunk(rashba_ref, "he")
        gemara_tc = TextChunk(gemara_ref, "he")
        ref_map_with_abbrevs = dibur_hamatchil_matcher.match_ref(
            rashba_tc, gemara_tc,
            base_tokenizer=tokenize_words,
            dh_extract_method=dh_extraction_method,
            verbose=True,
            with_abbrev_matches=True)
        ref_map = [(tup[0], tup[1]) for tup in ref_map_with_abbrevs]
        # Keep only fully resolved (base, comment) pairs.
        temp_link_list = [l for l in ref_map if l[0] is not None and l[1] is not None]
        link_list += temp_link_list
        num_matched += sum(1 for r in ref_map if r[0] is not None)
        num_searched += len(ref_map)
        print("MATCHES - {}".format(ref_map))
        if num_searched:  # guard against division by zero when nothing was searched yet
            print("ACCURACY - {}%".format(round(1.0 * num_matched / num_searched, 5) * 100))
def find_subref(sheet_text, ref, lang, vtitle=None, tried_adding_refs_at_end_of_section=False, **kwargs):
    """
    Locate the sub-ref within `ref` whose text best matches `sheet_text`.

    Runs the dibur-hamatchil matcher against the text of `ref`; on failure,
    recurses with fallback version titles (JPS for Tanakh, Wikisource /
    Community Translation for Talmud) and, as a last resort for section-level
    Talmud refs, with a range widened into the neighboring dapim.

    :param sheet_text: snippet of source-sheet text to locate
    :param ref: Ref to search within
    :param lang: 'he' or 'en' (as used by the category checks below)
    :param vtitle: version title to load; None means the default version
    :param tried_adding_refs_at_end_of_section: recursion guard for the
        widened-range fallback, so it is attempted at most once
    :return: matched Ref, or None if no match was found
    """
    try:
        tc = TextChunk(ref, lang, vtitle=vtitle)
        matches = match_ref(tc, [sheet_text], tokenizer, dh_extract_method=clean, with_num_abbrevs=False, lang=lang, rashi_skips=2, dh_split=lambda dh: re.split(ur"\s*\.\.\.\s*", dh), **kwargs)
    except IndexError:
        # thrown if base text is empty
        matches = {"matches": []}
    except ValueError:
        # matcher rejected the input; treat the same as "no matches"
        matches = {"matches": []}
    # First non-None match wins.
    found_ref = None
    for r in matches["matches"]:
        if r is not None:
            found_ref = r
            break
    if found_ref is None:
        if ref.primary_category == "Tanakh" and lang == "en" and vtitle is None:
            # Retry English Tanakh against the JPS 1917 version.
            return find_subref(sheet_text, ref, lang, "The Holy Scriptures: A New Translation (JPS 1917)")
        elif ref.primary_category == "Talmud" and vtitle is None:
            # Retry Talmud against a language-appropriate fallback version.
            if lang == "he":
                return find_subref(sheet_text, ref, lang, "Wikisource Talmud Bavli")
            else:
                return find_subref(sheet_text, ref, lang, "Sefaria Community Translation")
        elif ref.primary_category == "Talmud" and ref.is_section_level() and not tried_adding_refs_at_end_of_section:
            # you tried wiki and it didn't work
            # you're running out of options, what do you do?
            # add first and last seg from prev and next daf!!!
            prev_daf = ref.prev_section_ref()
            next_daf = ref.next_section_ref()
            start_ref = prev_daf.all_segment_refs()[-1] if prev_daf is not None else ref
            end_ref = next_daf.all_segment_refs()[0] if next_daf is not None else ref
            if end_ref.is_range():
                end_ref = end_ref.ending_ref()
            new_ref = start_ref.to(end_ref)
            return find_subref(sheet_text, new_ref, lang, tried_adding_refs_at_end_of_section=True)
    return found_ref
def find_subref(sheet_text, ref, lang, vtitle=None, tried_adding_refs_at_end_of_section=False, **kwargs):
    """
    Find the sub-ref inside `ref` whose text matches `sheet_text`.

    Tries the dibur-hamatchil matcher once; when nothing matches, recurses
    with fallback version titles (JPS for English Tanakh, Wikisource /
    Community Translation for Talmud) and finally, for section-level Talmud
    refs, with a range widened into the adjacent dapim (at most once, guarded
    by `tried_adding_refs_at_end_of_section`).

    :return: the matched Ref, or None when every strategy fails.
    """
    try:
        base_chunk = TextChunk(ref, lang, vtitle=vtitle)
        result = match_ref(
            base_chunk, [sheet_text], tokenizer,
            dh_extract_method=clean, with_num_abbrevs=False, lang=lang,
            rashi_skips=2, dh_split=lambda dh: re.split(r"\s*\.\.\.\s*", dh),
            **kwargs
        )
    except (IndexError, ValueError):
        # IndexError: base text is empty; ValueError: matcher rejected input.
        result = {"matches": []}

    hit = next((m for m in result["matches"] if m is not None), None)
    if hit is not None:
        return hit

    # No match yet: fall back to alternate versions / a widened range.
    if ref.primary_category == "Tanakh" and lang == "en" and vtitle is None:
        return find_subref(sheet_text, ref, lang, "The Holy Scriptures: A New Translation (JPS 1917)")
    if ref.primary_category == "Talmud" and vtitle is None:
        fallback = "Wikisource Talmud Bavli" if lang == "he" else "Sefaria Community Translation"
        return find_subref(sheet_text, ref, lang, fallback)
    if ref.primary_category == "Talmud" and ref.is_section_level() and not tried_adding_refs_at_end_of_section:
        # Last resort: extend the search range to include the final segment of
        # the previous daf and the first segment of the next daf.
        prev_daf = ref.prev_section_ref()
        next_daf = ref.next_section_ref()
        start_ref = prev_daf.all_segment_refs()[-1] if prev_daf is not None else ref
        end_ref = next_daf.all_segment_refs()[0] if next_daf is not None else ref
        if end_ref.is_range():
            end_ref = end_ref.ending_ref()
        widened = start_ref.to(end_ref)
        return find_subref(sheet_text, widened, lang, tried_adding_refs_at_end_of_section=True)
    return hit
def get_rabbi_mention_segments(rows_by_mas, limit=None):
    """
    Resolve rabbi-mention snippets to segment-level refs, one amud at a time.

    :param rows_by_mas: mapping of masechet title -> list of amud groups,
        where each group is a list of row dicts carrying ' Amud', ' Snippet'
        and ' Rabbi ID after Link' keys (note the leading spaces in the keys).
    :param limit: when not None, used directly as the iterable of masechtot
        to process instead of every index in the "Bavli" category.
    :return: list of dicts with "Segment", "Context", "Book" and
        "Bonayich ID" keys; "Segment" is None when no match was found.
    """
    total = 0
    missed = 0
    new_rows = []
    # NOTE(review): `limit` is treated as a replacement list of indexes, not a
    # numeric cap, despite the name — confirm with callers.
    indexes = library.get_indexes_in_category("Bavli") if limit is None else limit
    for mas in tqdm(indexes):
        for i, amud in enumerate(rows_by_mas[mas]):
            # All rows in a group share the same amud; take it from the first row.
            curr_amud = amud[0][' Amud'].lower()
            tc = Ref(f"{mas} {curr_amud}").text("he", vtitle="William Davidson Edition - Aramaic")
            # Match every snippet of this amud against the base text in one call.
            matches = match_ref(tc, [r[' Snippet'] for r in amud], base_tokenizer, dh_extract_method=dh_extract_method, with_num_abbrevs=False, place_all=True, place_consecutively=True, verbose=False)
            total += len(matches['matches'])
            rabbi_match_segs = []
            for j, m in enumerate(matches['matches']):
                snippet = amud[j][' Snippet']
                rabbi_match_segs += [get_rabbi_seg(m, snippet)]
                if m is None:
                    missed += 1
            # Emit one output row per input row, in the same order.
            for j, r in enumerate(amud):
                seg, context = rabbi_match_segs[j]
                new_rows += [{
                    "Segment": None if seg is None else seg.normal(),
                    "Context": context,
                    "Book": mas,
                    "Bonayich ID": r[" Rabbi ID after Link"]
                }]
    print(missed, total)
    return new_rows
def match():
    """
    Link Mishnah Berurah sections to Shulchan Arukh, Orach Chayim by siman.

    For each siman: runs the dibur-hamatchil matcher, logs matches/misses to
    "mishnah_berurah.log", logs every expanded abbreviation to
    "rashei_tevot.csv", and finally dumps all links to
    "mishnah_berurah_links.json". Python 2 era script (print statements,
    `encoding=` kwarg to json.dump).
    """
    mb = library.get_index("Mishnah Berurah")
    oc = library.get_index("Shulchan Arukh, Orach Chayim")
    mbRefList = mb.all_section_refs()
    ocRefList = oc.all_section_refs()
    mbInd = 0
    num_matched = 0
    num_searched = 0
    link_list = []
    log = open("mishnah_berurah.log","w")
    rt_log = open("rashei_tevot.csv","w")
    rt_log_csv = unicodecsv.DictWriter(rt_log, fieldnames=["abbrev","expanded","context_before","context_after"])
    rt_log_csv.writeheader()
    for ocRef in ocRefList:
        ocSiman = getSimanNum(ocRef)
        # Advance the MB cursor to the section with the same siman number.
        while getSimanNum(mbRefList[mbInd]) != ocSiman:
            mbInd += 1
        mbRef = mbRefList[mbInd]
        mbSiman = getSimanNum(mbRef)
        print "----- SIMAN {} -----".format(ocSiman)
        log.write("----- SIMAN {} -----\n".format(ocSiman))
        octc = TextChunk(ocRef,"he")
        mbtc = TextChunk(mbRef,"he")
        try:
            matched = dibur_hamatchil_matcher.match_ref(octc,mbtc,base_tokenizer=base_tokenizer,dh_extract_method=dh_extraction_method,verbose=True,with_abbrev_matches=True,rashi_filter=rashi_filter)
        except ValueError:
            # Matcher rejected this siman; skip it entirely.
            continue
        if 'comment_refs' not in matched:
            continue
        # Pair each base match with its comment ref, positionally.
        ref_map = [(base,comment) for base,comment in zip(matched['matches'],matched['comment_refs'])]
        # Flatten per-segment abbreviation matches and log each expansion.
        abbrevs = [am for seg in matched['abbrevs'] for am in seg]
        for am in abbrevs:
            rt_log_csv.writerow({'abbrev':dibur_hamatchil_matcher.cleanAbbrev(am.abbrev), 'expanded':u' '.join(am.expanded), 'context_before':u' '.join(am.contextBefore), 'context_after':u' '.join(am.contextAfter)})
        # Keep only fully resolved (base, comment) pairs.
        temp_link_list = [l for l in ref_map if not l[0] is None and not l[1] is None]
        link_list += temp_link_list
        unlink_list = [ul[1] for ul in ref_map if ul[0] is None or ul[1] is None]
        for r in ref_map:
            if not r[0] is None:
                num_matched+=1
        num_searched += len(ref_map)
        print "MATCHES - {}".format(ref_map)
        print "ACCURACY - {}%".format(round(1.0*num_matched/num_searched,5)*100)
        log.write("MATCHES - {}\n".format(temp_link_list))
        log.write("NOT FOUND - {}\n".format(unlink_list))
        log.write("ACCURACY - {}%\n".format(round(1.0*num_matched/num_searched,5)*100))
    # Dump every accumulated link as normalized ref pairs.
    doc = {"link_list":[[link[0].normal(),link[1].normal()] for link in link_list]}
    fp = codecs.open("mishnah_berurah_links.json", "w",encoding='utf-8')
    json.dump(doc, fp, indent=4, encoding='utf-8', ensure_ascii=False)
    fp.close()
    log.close()
    rt_log.close()
def match():
    """
    Link Mishnah Berurah sections to Shulchan Arukh, Orach Chayim by siman.

    Simpler variant without abbreviation logging: runs the matcher per siman,
    logs to "mishnah_berurah.log" and dumps all links to
    "mishnah_berurah_links.json". Python 2 era script.

    NOTE(review): this version iterates match_ref's return value directly as
    (base, comment) tuples — presumably an older list-returning matcher API;
    confirm against the dibur_hamatchil_matcher version in use.
    """
    mb = library.get_index("Mishnah Berurah")
    oc = library.get_index("Shulchan Arukh, Orach Chayim")
    mbRefList = mb.all_section_refs()
    ocRefList = oc.all_section_refs()
    mbInd = 0
    num_matched = 0
    num_searched = 0
    link_list = []
    log = open("mishnah_berurah.log", "w")
    for ocRef in ocRefList:
        ocSiman = getSimanNum(ocRef)
        # Advance the MB cursor to the section with the same siman number.
        while getSimanNum(mbRefList[mbInd]) != ocSiman:
            mbInd += 1
        mbRef = mbRefList[mbInd]
        mbSiman = getSimanNum(mbRef)
        print "----- SIMAN {} -----".format(ocSiman)
        log.write("----- SIMAN {} -----\n".format(ocSiman))
        octc = TextChunk(ocRef, "he")
        mbtc = TextChunk(mbRef, "he")
        ref_map = dibur_hamatchil_matcher.match_ref(
            octc, mbtc,
            base_tokenizer=base_tokenizer,
            dh_extract_method=dh_extraction_method,
            verbose=True)
        # Keep only fully resolved (base, comment) pairs.
        temp_link_list = [
            l for l in ref_map if not l[0] is None and not l[1] is None
        ]
        link_list += temp_link_list
        unlink_list = [
            ul[1] for ul in ref_map if ul[0] is None or ul[1] is None
        ]
        for r in ref_map:
            if not r[0] is None:
                num_matched += 1
        num_searched += len(ref_map)
        print "MATCHES - {}".format(ref_map)
        print "ACCURACY - {}%".format(
            round(1.0 * num_matched / num_searched, 5) * 100)
        log.write("MATCHES - {}\n".format(temp_link_list))
        log.write("NOT FOUND - {}\n".format(unlink_list))
        log.write("ACCURACY - {}%\n".format(
            round(1.0 * num_matched / num_searched, 5) * 100))
    # Dump every accumulated link as normalized ref pairs.
    doc = {
        "link_list":
        [[link[0].normal(), link[1].normal()] for link in link_list]
    }
    fp = codecs.open("mishnah_berurah_links.json", "w", encoding='utf-8')
    json.dump(doc, fp, indent=4, encoding='utf-8', ensure_ascii=False)
    fp.close()
    log.close()
def test_dh_matcher(self):
    """
    Smoke test: run match_ref over a fixed daf (Gittin 7) with a handful of
    hand-picked Hebrew comments and print the resulting matches.

    Exercises essentially every keyword argument of match_ref with explicit
    values. Python 2 era test (u-literals, print statement); the `@NN` tokens
    inside the comments are markers consumed by the matcher pipeline —
    presumably footnote/abbreviation placeholders, confirm with the matcher.
    """
    from data_utilities.dibur_hamatchil_matcher import match_ref
    base_tokenizer = lambda x: x.split()
    base_text = Ref(u'גיטין ז').text('he')
    # Very large boundary flexibility so matches may land anywhere in the daf.
    n = 100  # len(''.join(base_text.text).split())
    comments = [u'צריך @33 למימרינהו בניחותא.',
                u'מי שיש לו צעקת לגימא על חבירו ודומם שוכן בסנה עושה לו דין. ',
                u'@11הדבר @33 יצא מפי רבי אלעזר ונתנוהו לגניבה בקולר. ',
                u'א“ל @33 האלהים מדרבנן אלא חסדא שמך וחסדאין מילך. ',
                u'מאי @33 זאת לא זאת. ',
                u'''דרש @33 רב עוירא וכו' מ“ד כה אמר ה' אם שלמים וכן רבים וכו' אם רואה אדם שמזונותיו מצומצמין יעשה מהן צדקה וכ“ש כשהן מרובין וכו‘.''']
    results = match_ref(base_text, comments, base_tokenizer,
                        prev_matched_results=None, dh_extract_method=lambda x: x, verbose=False,
                        word_threshold=0.27, char_threshold=0.2,
                        with_abbrev_matches=False, with_num_abbrevs=True,
                        boundaryFlexibility=n, dh_split=None,
                        rashi_filter=None, strict_boundaries=None, place_all=False,
                        create_ranges=False, place_consecutively=False,
                        daf_skips=2, rashi_skips=1, overall=2, lang="he")
    print results['matches']
def match():
    """
    Link "Rashba on Berakhot" sections to Berakhot sections on the same daf.

    For each commentary section, finds the gemara section with the same daf,
    widens the gemara range into the neighboring dapim (the Rashba often
    comments across daf boundaries), then runs the dibur-hamatchil matcher and
    prints matches plus a running accuracy. Python 2 era script.
    """
    link_list = []
    num_matched = 0
    num_searched = 0
    rashba = library.get_index("Rashba on Berakhot")
    gemara = library.get_index("Berakhot")
    rashba_ref_list = rashba.all_section_refs()
    gemara_ref_list = gemara.all_section_refs()
    gemara_ind = 0
    for rashba_ref in rashba_ref_list:
        # Advance the gemara cursor until its daf matches the commentary's daf.
        while gemara_ref_list[gemara_ind].normal_last_section() != rashba_ref.normal_last_section():
            gemara_ind += 1
        gemara_ref = gemara_ref_list[gemara_ind]
        rashba_tc = TextChunk(rashba_ref,"he")
        # let's extend the range of gemara_tc to account for weird rashba stuff
        num_refs_to_expand = 2
        gemara_ref_before = gemara_ref.prev_section_ref()
        gemara_ref_after = gemara_ref.next_section_ref()
        if gemara_ref_before and len(gemara_ref_before.all_subrefs()) >= num_refs_to_expand:
            gemara_ref = gemara_ref_before.all_subrefs()[-num_refs_to_expand].to(gemara_ref)
        if gemara_ref_after and len(gemara_ref_after.all_subrefs()) >= num_refs_to_expand:
            gemara_ref = gemara_ref.to(gemara_ref_after.all_subrefs()[num_refs_to_expand - 1])
        gemara_tc = TextChunk(gemara_ref,"he")
        # Huge boundaryFlexibility effectively disables ordering constraints.
        ref_map_with_abbrevs = dibur_hamatchil_matcher.match_ref(gemara_tc, rashba_tc, base_tokenizer=tokenize_words, dh_extract_method=dh_extraction_method, verbose=True, with_abbrev_matches=True,dh_split=dh_split, boundaryFlexibility=10000, rashi_filter=rashi_filter)
        ref_map = [(tup[0], tup[1]) for tup in ref_map_with_abbrevs]
        # Keep only fully resolved (base, comment) pairs.
        temp_link_list = [l for l in ref_map if not l[0] is None and not l[1] is None]
        link_list += temp_link_list
        unlink_list = [ul[1] for ul in ref_map if ul[0] is None or ul[1] is None]
        for r in ref_map:
            if not r[0] is None:
                num_matched += 1
        num_searched += len(ref_map)
        print "MATCHES - {}".format(ref_map)
        print "ACCURACY - {}%".format(round(1.0 * num_matched / num_searched, 5) * 100)
def test_mb(self):
    """
    Regression test: match three Mishnah Berurah simanim against Shulchan
    Arukh, Orach Chayim and compare the full matcher output to a pickled
    fixture ('mb_matched.pkl'). Python 2 era test (uses `unicode`).
    """
    simanim = [1, 51, 202]
    all_matched = []
    for sim in simanim:
        ocRef = Ref('Shulchan Arukh, Orach Chayim {}'.format(sim))
        mbRef = Ref('Mishnah Berurah {}'.format(sim))
        octc = TextChunk(ocRef,"he")
        mbtc = TextChunk(mbRef,"he")
        matched = dhm.match_ref(octc, mbtc, base_tokenizer=mb_base_tokenizer, dh_extract_method=mb_dh_extraction_method, with_abbrev_matches=True)
        # Abbreviation objects don't compare/pickle stably; stringify them.
        matched['abbrevs'] = [[unicode(am) for am in seg] for seg in matched['abbrevs']]
        all_matched.append(matched)
    # Uncomment to regenerate the fixture after an intentional matcher change:
    #pickle.dump(all_matched, open('mb_matched.pkl','wb'))
    comparison = pickle.load(open('mb_matched.pkl', 'rb'))
    #comparison = [comparison[1]]
    assert comparison == all_matched
def match(self, base_ref, comment_ref, verbose=False):
    """
    Run the dibur-hamatchil matcher between a base-text ref and a commentary
    ref, using fixed version titles and the instance's tokenizer/extractor.

    The boundary flexibility is set to the base text's full word count, so a
    comment may match anywhere within the base ref.

    :param base_ref: Ref of the base text (must be a Ref instance)
    :param comment_ref: Ref of the commentary (must be a Ref instance)
    :param verbose: passed through to match_ref
    :return: whatever match_ref returns for this pair
    """
    for r in (base_ref, comment_ref):
        assert isinstance(r, Ref)
    base_version = "Tanach with Text Only"
    mei_version = 'Divrei Emet, Zalkowa 1801'
    flexibility = base_ref.text('he', base_version).word_count()
    return match_ref(
        base_ref.text('he', base_version),
        comment_ref.text('he', mei_version),
        self._base_tokenizer,
        dh_extract_method=self._dh_extract_method,
        verbose=verbose,
        rashi_filter=self._filter,
        char_threshold=0.4,
        boundaryFlexibility=flexibility,
    )
def match():
    """
    Align each non-empty section of "Rashi on Berakhot" with the Berakhot
    section on the same daf, run the dibur-hamatchil matcher on each pair,
    and accumulate (base, comment) link pairs, printing a running accuracy.

    Python 2 era script; prints use the parenthesized form, which behaves
    identically under both Python 2 and 3 for a single argument.
    """
    link_list = []
    num_matched = 0
    num_searched = 0
    rashba = Ref("Rashi on Berakhot")
    gemara = library.get_index("Berakhot")
    # Skip commentary sections that contain no Hebrew text at all.
    rashba_ref_list = [ref for ref in rashba.all_subrefs() if ref.text("he").text != []]
    gemara_ref_list = gemara.all_section_refs()
    gemara_ind = 0
    for rashba_ref in rashba_ref_list:
        # Advance the base-text cursor until its daf matches the commentary's daf.
        while (
            gemara_ind < len(gemara_ref_list)
            and gemara_ref_list[gemara_ind].normal_last_section() != rashba_ref.normal_last_section()
        ):
            gemara_ind += 1
        if gemara_ind >= len(gemara_ref_list):
            # Ran off the end of the base text: nothing lines up with the
            # remaining commentary sections. Stop instead of raising
            # IndexError on the lookup below.
            break
        # FIX: the aligned base ref must come from gemara_ref_list; the
        # original indexed rashba_ref_list here, pairing the commentary
        # against itself instead of against the gemara.
        gemara_ref = gemara_ref_list[gemara_ind]
        rashba_tc = TextChunk(rashba_ref, "he")
        gemara_tc = TextChunk(gemara_ref, "he")
        ref_map_with_abbrevs = dibur_hamatchil_matcher.match_ref(
            rashba_tc,
            gemara_tc,
            base_tokenizer=tokenize_words,
            dh_extract_method=dh_extraction_method,
            verbose=True,
            with_abbrev_matches=True,
        )
        ref_map = [(tup[0], tup[1]) for tup in ref_map_with_abbrevs]
        # Keep only fully resolved (base, comment) pairs.
        temp_link_list = [l for l in ref_map if l[0] is not None and l[1] is not None]
        link_list += temp_link_list
        num_matched += sum(1 for r in ref_map if r[0] is not None)
        num_searched += len(ref_map)
        print("MATCHES - {}".format(ref_map))
        if num_searched:  # guard against division by zero when nothing was searched yet
            print("ACCURACY - {}%".format(round(1.0 * num_matched / num_searched, 5) * 100))
def link_drashot():
    """
    Link Ben Ish Hai drashot to their parshiot and post the links.

    Iterates the parsha list, matching each drasha against the parsha's text
    and posting a link per matched comment; prints the matched fraction at
    the end. Python 2 era script (print statements).

    NOTE(review): the `"B" in parsha` trigger presumably starts processing at
    the first parsha containing a "B" (Bereshit?) — confirm intent; Miketz and
    Pekudei are explicitly skipped.
    """
    matched=0.00
    total=0.00
    errored = []
    not_machted = []
    start_parsha_parse = False
    for parsha in eng_parshiot:
        if "B" in parsha:
            start_parsha_parse=True
        if start_parsha_parse and "Miketz" not in parsha and "Pekudei" not in parsha:
            parsha_chunk = TextChunk(Ref("Parashat "+parsha),"he","Tanach with Text Only")
            bih_chunk = TextChunk(Ref('Ben Ish Hai, Drashot, '+parsha),"he","NEW VERSION")
            word_count = parsha_chunk.word_count()
            # boundaryFlexibility spans the whole parsha, so a drasha may
            # match anywhere within it.
            bih_links = match_ref(parsha_chunk,bih_chunk,base_tokenizer,dh_extract_method=dh_extract_method,verbose=True,rashi_filter=_filter, boundaryFlexibility=word_count-1, char_threshold=1.8)
            for base, comment in zip(bih_links["matches"],bih_links["comment_refs"]):
                print "B",base,"C", comment
                print bih_links.get('refs')
                if base:
                    link = (
                        {
                            "refs": [
                                base.normal(),
                                comment.normal(),
                            ],
                            "type": "commentary",
                            "auto": True,
                            "generated_by": "sterling_ben_ish_hai_linker"
                        })
                    post_link(link, weak_network=True)
                    matched+=1
                #if there is no match and there is only one comment, default will be to link it to that comment
                else:
                    not_machted.append(parsha)
                total+=1
    if total!=0:
        pm = matched/total
        print "Percent matched: "+str(pm)
    else:
        print "None matched :("
    print "Not Matched:"
    for nm in not_machted:
        print nm
def match(self, base_ref, comment_ref, verbose=False):
    """
    Run the dibur-hamatchil matcher between a base-text ref and a commentary
    ref, resolving version titles from the instance's version_map (falling
    back to a category-based lookup when the book is not yet mapped).

    Boundary flexibility is the base text's full word count, so a comment may
    match anywhere within the base ref.

    :param base_ref: Ref of the base text (must be a Ref instance)
    :param comment_ref: Ref of the commentary (must be a Ref instance)
    :param verbose: passed through to match_ref
    :return: whatever match_ref returns for this pair
    """
    assert isinstance(base_ref, Ref)
    assert isinstance(comment_ref, Ref)
    if self.version_map.get(base_ref.book) is None:
        self.set_version_by_category(
            base_ref.book
        )  # Books that can't be linked by category need to be set manually
    base_version, mei_version = self.version_map[
        base_ref.book], self.version_map[comment_ref.index.title]
    word_count = base_ref.text('he', base_version).word_count()
    return match_ref(base_ref.text('he', base_version),
                     comment_ref.text('he', mei_version),
                     self._base_tokenizer,
                     dh_extract_method=self._dh_extract_method,
                     verbose=verbose,
                     rashi_filter=self._filter,
                     char_threshold=0.4,
                     boundaryFlexibility=word_count)
def test_mb(self):
    """
    Regression test: match three Mishnah Berurah simanim against Shulchan
    Arukh, Orach Chayim, normalize the matcher output to JSON-serializable
    values, and compare against the stored fixture 'mb_matched.json'.
    Python 2 era test (u-literals, `unicode`).
    """
    simanim = [1, 51, 202]
    all_matched = []
    for sim in simanim:
        ocRef = Ref('Shulchan Arukh, Orach Chayim {}'.format(sim))
        mbRef = Ref('Mishnah Berurah {}'.format(sim))
        octc = TextChunk(ocRef,"he")
        mbtc = TextChunk(mbRef,"he")
        matched = dhm.match_ref(octc, mbtc, base_tokenizer=mb_base_tokenizer, dh_extract_method=mb_dh_extraction_method, with_abbrev_matches=True)
        # store dict in json serializable format
        matched[u'abbrevs'] = [[unicode(am) for am in seg] for seg in matched[u'abbrevs']]
        matched[u'comment_refs'] = [unicode(r.normal()) if r is not None else r for r in matched[u'comment_refs']]
        matched[u'matches'] = [r.normal() if r is not None else r for r in matched[u'matches']]
        matched[u'match_word_indices'] = [list(tup) for tup in matched[u'match_word_indices']]
        matched[u'match_text'] = [list(tup) for tup in matched[u'match_text']]
        all_matched.append(matched)
    # Uncomment to regenerate the fixture after an intentional matcher change:
    #json.dump(all_matched, codecs.open('mb_matched.json', 'wb', encoding='utf8'))
    comparison = json.load(codecs.open('mb_matched.json', 'rb', encoding='utf8'))
    # Compare key-by-key so a failure points at the differing field.
    for a_siman, b_siman in zip(all_matched, comparison):
        for k, v in a_siman.items():
            assert v == b_siman[k]
def get_matches_for_dict_and_link(dh_dict, base_text_title, commentary_title, talmud=True, lang='he', word_threshold=0.27, server="", rashi_filter=None, dh_extract_method=lambda x: x):
    """
    Match dibur-hamatchil entries per daf/section and post commentary links.

    :param dh_dict: mapping of daf (or section id) -> dh entries for that daf
    :param base_text_title: title of the base text, e.g. a masechet
    :param commentary_title: title of the commentary ("X on <base>")
    :param talmud: when True, daf keys are rendered via AddressTalmud.toStr
    :param lang: language of the text chunks to load
    :param word_threshold: passed through to match_ref
    :param server: target server for post_link; must be non-empty
    :param rashi_filter: passed through to match_ref
    :param dh_extract_method: passed through to match_ref
    :return: dict of daf -> match_ref(...)["matches"] lists
    """
    # Whitespace tokenizer that drops empty tokens. (Shadows builtin `str`;
    # left as-is.)
    def base_tokenizer(str):
        str_list = str.split(" ")
        return [str for str in str_list if len(str) > 0]
    assert len(server) > 0, "Please specify a server"
    results = {}
    links = []
    matched = 0
    total = 0
    for daf in dh_dict:
        print daf
        dhs = dh_dict[daf]
        # Build base and commentary refs; Talmud dapim need address formatting.
        if talmud:
            base_text_ref = "{} {}".format(base_text_title, AddressTalmud.toStr("en", daf))
            comm_ref = "{} on {} {}".format(commentary_title, base_text_title, AddressTalmud.toStr("en", daf))
        else:
            base_text_ref = "{} {}".format(base_text_title, daf)
            comm_ref = "{} on {} {}".format(commentary_title, base_text_title, daf)
        base_text = TextChunk(Ref(base_text_ref), lang=lang)
        comm_text = TextChunk(Ref(comm_ref), lang=lang)
        results[daf] = match_ref(base_text, comm_text, base_tokenizer=base_tokenizer, word_threshold=word_threshold, rashi_filter=rashi_filter, dh_extract_method=dh_extract_method)["matches"]
        # Each non-None match becomes a link to the positionally numbered comment.
        for count, link in enumerate(results[daf]):
            if link:
                base_end = link.normal()
                comm_end = "{} on {} {}:{}".format(commentary_title, base_text_title, AddressTalmud.toStr("en", daf), count+1)
                links.append({
                    "refs": [base_end, comm_end],
                    "auto": True,
                    "type": "commentary",
                    "generated_by": commentary_title+base_text_title
                })
                matched += 1
            total += 1
    print "Matched: {}".format(matched)
    print "Total {}".format(total)
    post_link(links, server=server)
    return results
def test_mb(self):
    """
    Regression test: match three Mishnah Berurah simanim against Shulchan
    Arukh, Orach Chayim, normalize the matcher output to JSON-serializable
    values, and compare it against the stored fixture 'mb_matched.json'.
    """
    all_matched = []
    for siman in (1, 51, 202):
        oc_tc = TextChunk(Ref('Shulchan Arukh, Orach Chayim {}'.format(siman)), "he")
        mb_tc = TextChunk(Ref('Mishnah Berurah {}'.format(siman)), "he")
        result = dhm.match_ref(oc_tc, mb_tc,
                               base_tokenizer=mb_base_tokenizer,
                               dh_extract_method=mb_dh_extraction_method,
                               with_abbrev_matches=True)
        # Normalize every field to plain JSON-serializable values so the dict
        # can be compared against the stored fixture.
        result['abbrevs'] = [[str(am) for am in seg] for seg in result['abbrevs']]
        result['comment_refs'] = [str(r.normal()) if r is not None else r for r in result['comment_refs']]
        result['matches'] = [r.normal() if r is not None else r for r in result['matches']]
        result['match_word_indices'] = [list(tup) for tup in result['match_word_indices']]
        result['match_text'] = [list(tup) for tup in result['match_text']]
        all_matched.append(result)
    # To regenerate the fixture after an intentional matcher change:
    #json.dump(all_matched, codecs.open('mb_matched.json', 'wb', encoding='utf8'))
    comparison = json.load(codecs.open('mb_matched.json', 'rb', encoding='utf8'))
    # Compare key-by-key so a failure pinpoints the differing field.
    for actual, expected in zip(all_matched, comparison):
        for key, value in actual.items():
            assert value == expected[key]
break def base_tokenizer(str): str = re.sub(r"\([^\(\)]+\)", "", str) word_list = re.split(r"\s+", str) word_list = [w for w in word_list if w] # remove empty strings return word_list def dh_extraction_method(str): m = re.match(r"([^\.]+\.\s)?([^–]+)\s–", str) if m: return m.group(2) else: return "" yo = dibur_hamatchil_matcher.match_ref( base_tc, comment_list, base_tokenizer=base_tokenizer) for i, yoyo in enumerate(yo): if yoyo is None: num_missed += 1 if i > 0 and not yo[i - 1] is None and not yoyo is None: prange = yo[i - 1].range_list() nrange = yoyo.range_list() if prange[-1] == nrange[0]: print("{} is split".format(nrange[0])) num_split += 1 total_sef += len(yoyo.range_list()) if yoyo else 0 total_koren += 1
def match():
    """
    Link "Yad Ramah on Sanhedrin" sections to Sanhedrin sections on the same
    daf, logging matches to "yad_ramah.log", abbreviation expansions to
    "yad_ramah_rashei_tevot.csv", and dumping links to "yad_ramah_links.json".

    Currently limited to the first 5 Yad Ramah sections (see the [:5] slice —
    presumably a debugging cap left in; confirm before a full run).
    """
    mes = "Sanhedrin"
    yr = library.get_index("Yad Ramah on {}".format(mes))
    gem = library.get_index("{}".format(mes))
    yrRefList = yr.all_section_refs()[:5]
    gemRefList = gem.all_section_refs()
    gemInd = 0
    num_matched = 0
    num_searched = 0
    link_list = []
    log = open("yad_ramah.log","w")
    rt_log = open("yad_ramah_rashei_tevot.csv","w")
    rt_log_csv = unicodecsv.DictWriter(rt_log, fieldnames=["abbrev","expanded","context_before","context_after"])
    rt_log_csv.writeheader()
    for yrRef in yrRefList:
        # Advance the gemara cursor until the first section number (the daf) matches.
        while gemRefList[gemInd].sections[0] != yrRef.sections[0]:
            gemInd += 1
        gemRef = gemRefList[gemInd]
        print("----- {} -----".format(gemRef))
        log.write("----- {} -----\n".format(gemRef))
        yrtc = TextChunk(yrRef,'he')
        # let's extend the range of gemara_tc to account for weird rashba stuff
        num_refs_to_expand = 2
        gemara_ref_before = gemRef.prev_section_ref()
        gemara_ref_after = gemRef.next_section_ref()
        if gemara_ref_before and len(gemara_ref_before.all_subrefs()) >= num_refs_to_expand:
            gemRef = gemara_ref_before.all_subrefs()[-num_refs_to_expand].to(gemRef)
        if gemara_ref_after and len(gemara_ref_after.all_subrefs()) >= num_refs_to_expand:
            gemRef = gemRef.to(gemara_ref_after.all_subrefs()[num_refs_to_expand - 1])
        gemtc = TextChunk(gemRef,'he')
        # Closure over the current gemara chunk: tokenizes by finding the
        # longest dibur-hamatchil within it (redefined each iteration).
        def base_tokenizer(string):
            return dhm.get_maximum_dh(gemtc,string,max_dh_len=6)
        matched = dhm.match_ref(gemtc, yrtc, base_tokenizer=base_tokenizer, dh_extract_method=dh_extraction_method, verbose=True, with_abbrev_matches=True, rashi_filter=rashi_filter)
        # Pair each base match with its comment ref, positionally.
        ref_map = [(base, comment) for base, comment in zip(matched['matches'], matched['comment_refs'])]
        # Flatten per-segment abbreviation matches and log each expansion.
        abbrevs = [am for seg in matched['abbrevs'] for am in seg]
        for am in abbrevs:
            rt_log_csv.writerow(
                {'abbrev': dhm.cleanAbbrev(am.abbrev),
                 'expanded': ' '.join(am.expanded),
                 'context_before': ' '.join(am.contextBefore),
                 'context_after': ' '.join(am.contextAfter)})
        # Keep only fully resolved (base, comment) pairs.
        temp_link_list = [l for l in ref_map if not l[0] is None and not l[1] is None]
        link_list += temp_link_list
        unlink_list = [ul[1] for ul in ref_map if ul[0] is None or ul[1] is None]
        for r in ref_map:
            if not r[0] is None:
                num_matched += 1
        num_searched += len(ref_map)
        print("MATCHES - {}".format(ref_map))
        print("ACCURACY - {}%".format(round(1.0 * num_matched / num_searched, 5) * 100))
        log.write("MATCHES - {}\n".format(temp_link_list))
        log.write("NOT FOUND - {}\n".format(unlink_list))
        log.write("ACCURACY - {}%\n".format(round(1.0 * num_matched / num_searched, 5) * 100))
    # Dump every accumulated link as normalized ref pairs.
    doc = {"link_list": [[link[0].normal(), link[1].normal()] for link in link_list]}
    fp = codecs.open("yad_ramah_links.json", "w", encoding='utf-8')
    json.dump(doc, fp, indent=4, encoding='utf-8', ensure_ascii=False)
    fp.close()
    log.close()
    rt_log.close()
i_rif = 0 i_gem = 0 last_update_was_zero = False while i_rif < len(rif_segs) - rif_window and i_gem < len( gem_segs) - gem_window: temp_rif_tc = rif_segs[i_rif].to(rif_segs[i_rif + rif_window]).text("he") temp_gem_tc = gem_segs[i_gem].to(gem_segs[i_gem + gem_window]).text( lang="he", vtitle=vtitle) print("{}, {}, {}".format(temp_rif_tc, temp_gem_tc, len(links))) matched = dibur_hamatchil_matcher.match_ref( temp_gem_tc, temp_rif_tc, base_tokenizer=tokenize_words, dh_extract_method=dh_extraction_method, verbose=False, with_abbrev_matches=True) first_matches = matched['matches'] match_indices = matched['match_word_indices'] for i in range(1, 5): # let's try again, but with shorter dhs and imposing order start_pos = i * 2 def dh_extraction_method_short(s): dh = dh_extraction_method(s) dh_split = re.split(r'\s+', dh)
# Tokenizer for the base text: strips parenthesized spans, splits on
# whitespace, and drops empty tokens. (Parameter shadows builtin `str`;
# left as-is.) Python 2 era code (ur-literals, print statement).
def base_tokenizer(str):
    str = re.sub(ur"\([^\(\)]+\)", u"", str)
    word_list = re.split(ur"\s+", str)
    word_list = [w for w in word_list if w]  # remove empty strings
    return word_list

# Extracts the dibur hamatchil: the text between an optional leading
# "….  " prefix and the first en-dash, or "" when the pattern is absent.
def dh_extraction_method(str):
    m = re.match(ur"([^\.]+\.\s)?([^–]+)\s–", str)
    if m:
        return m.group(2)
    else:
        return ""

# Match the comment list against the base text, then count misses and
# detect comments whose matched range continues the previous comment's
# range (a "split" across two comments).
# NOTE(review): base_tc, comment_list and the num_*/total_* counters are
# defined elsewhere in the original script — this is a fragment.
yo = dibur_hamatchil_matcher.match_ref(base_tc, comment_list, base_tokenizer=base_tokenizer)
for i,yoyo in enumerate(yo):
    if yoyo is None:
        num_missed += 1
    # A comment is "split" when its first segment equals the previous
    # comment's last segment.
    if i > 0 and not yo[i-1] is None and not yoyo is None:
        prange = yo[i-1].range_list()
        nrange = yoyo.range_list()
        if prange[-1] == nrange[0]:
            print "{} is split".format(nrange[0])
            num_split += 1
    total_sef += len(yoyo.range_list()) if yoyo else 0
    total_koren += 1
rif = Ref("Rif {}".format(mes)) gem = Ref("{}".format(mes)) rif_segs = rif.text("he").nonempty_subrefs() vtitle = 'William Davidson Edition - Aramaic' if mes in all_william else None gem_segs = gem.text(lang="he",vtitle=vtitle).nonempty_subrefs() i_rif = 0 i_gem = 0 last_update_was_zero = False while i_rif < len(rif_segs) - rif_window and i_gem < len(gem_segs) - gem_window: temp_rif_tc = rif_segs[i_rif].to(rif_segs[i_rif + rif_window]).text("he") temp_gem_tc = gem_segs[i_gem].to(gem_segs[i_gem + gem_window]).text(lang="he", vtitle=vtitle) print "{}, {}, {}".format(temp_rif_tc, temp_gem_tc, len(links)) matched = dibur_hamatchil_matcher.match_ref(temp_gem_tc, temp_rif_tc, base_tokenizer=tokenize_words, dh_extract_method=dh_extraction_method, verbose=False, with_abbrev_matches=True) first_matches = matched['matches'] match_indices = matched['match_word_indices'] for i in range(1, 5): # let's try again, but with shorter dhs and imposing order start_pos = i * 2 def dh_extraction_method_short(s): dh = dh_extraction_method(s) dh_split = re.split(ur'\s+', dh) if len(dh_split) > start_pos + 4:
def match_multiple(self, base_patterns, split_into_base_texts, rules, dh_extraction_methods, base_tokenizers, rashi_filters, matched_dir, not_matched_dir):
    """
    This function is used when a commentary matches to multiple base texts. e.g. Maharam Shif sometimes links to Rashi, sometimes Gemara
    :param list: list of base text patterns. e.g. for Ritva the pattern would be "Ritva on". for Gemara, the battern would be "" because the mesechta is appended automatically. len() == len(rules)
    :param function split_into_base_texts: f(list[str], TextChunk) -> list, list function that takes rules and outputs which refs in the commentary should be matched to which base text. For an example implementation, look at Sefaria-Data/research/dibur_hamatchil/dh_source_scripts/gemara_commentaries/maharam_shif/maharam_shif.py
    :param list rules: list of regex to discriminate into different base texts
    :param list dh_extraction_methods: list of dh_extraction_methods. len() == len(rules
    :param list base_tokenizers: list of base_tokenizers. len() == len(rules)
    :param list rashi_filters: ditto
    :param str matched_dir:
    :param str not_matched_dir:
    :return: None
    """
    num_matched = 0
    num_searched = 0
    for mesechta in self.mes_list:
        link_list = []
        unlink_list = []
        comment = library.get_index("{} {}".format(self.commentary_pattern, mesechta))
        comment_ref_list = comment.all_section_refs()
        for icomment, comment_ref in enumerate(comment_ref_list):
            daf = comment_ref.normal_last_section()
            print u'-----{} {} Start ({}/{})-----'.format(mesechta, daf, icomment, len(comment_ref_list))
            comment_tc = TextChunk(comment_ref, "he")
            # Partition this section's comments among the base texts.
            splitted, oto_dibur = split_into_base_texts(rules, comment_tc)
            for (temp_comment_refs, temp_comment_texts), temp_dh_extract, temp_base_tokenizer, temp_rashi_filter, base_pattern in zip(splitted, dh_extraction_methods, base_tokenizers, rashi_filters, base_patterns):
                print u"--- DOING {} {} ---".format(base_pattern, mesechta)
                temp_base_ref = Ref("{} {} {}".format(base_pattern, mesechta, daf))
                # Widen the base range into the neighboring dapim so comments
                # spilling over a daf boundary can still match.
                num_refs_to_expand = 2
                gemara_ref_before = temp_base_ref.prev_section_ref()
                gemara_ref_after = temp_base_ref.next_section_ref()
                if gemara_ref_before:
                    try:
                        if len(gemara_ref_before.all_subrefs()) >= num_refs_to_expand:
                            temp_base_ref = gemara_ref_before.all_subrefs()[-num_refs_to_expand].to(temp_base_ref)
                        else:
                            temp_base_ref = gemara_ref_before.all_subrefs()[0].to(temp_base_ref)
                    except InputError:
                        pass  # there was a problem extending. ignore
                if gemara_ref_after:
                    try:
                        if len(gemara_ref_after.all_subrefs()) >= num_refs_to_expand:
                            temp_base_ref = temp_base_ref.to(gemara_ref_after.all_subrefs()[num_refs_to_expand - 1])
                        else:
                            temp_base_ref = temp_base_ref.to(gemara_ref_after.all_subrefs()[-1])
                    except InputError:
                        pass
                temp_base_tc = temp_base_ref.text("he")
                try:
                    # First pass: full dhs, effectively unordered (huge boundaryFlexibility).
                    matched = dibur_hamatchil_matcher.match_ref(temp_base_tc, temp_comment_texts, temp_base_tokenizer, dh_extract_method=temp_dh_extract, verbose=False, with_abbrev_matches=True, boundaryFlexibility=10000, char_threshold=0.4, rashi_filter=temp_rashi_filter)
                except IndexError as e:
                    print e
                    continue
                first_matches = matched['matches']
                match_indices = matched['match_word_indices']
                for i in range(1, 5):
                    # let's try again, but with shorter dhs and imposing order
                    start_pos = i * 2
                    # Closure over start_pos: trims the dh to a 4-word window
                    # starting further in on each retry.
                    def dh_extraction_method_short(s):
                        dh = temp_dh_extract(s)
                        dh_split = re.split(ur'\s+', dh)
                        if len(dh_split) > start_pos + 4:
                            dh = u' '.join(dh_split[start_pos:start_pos + 4])
                        return dh
                    matched = dibur_hamatchil_matcher.match_ref(temp_base_tc, temp_comment_texts, temp_base_tokenizer, dh_extract_method=dh_extraction_method_short, verbose=False, with_abbrev_matches=True, boundaryFlexibility=4, prev_matched_results=match_indices, rashi_filter=temp_rashi_filter)
                    match_indices = matched['match_word_indices']
                matches = matched['matches']
                ref_map = zip(matches, temp_comment_refs)  # assumption that rashi_filter doesn't do anything
                # add oto_diburs to ref_map
                for br, cr in reversed(ref_map):
                    if str(cr) in oto_dibur:
                        oto_dibured = oto_dibur[str(cr)]
                        for od in oto_dibured:
                            ref_map += [(br, od)]
                #TODO add super-base link if this is a super-commentary
                # Keep only fully resolved (base, comment) pairs, stringified.
                temp_link_list = [[str(l[0]), str(l[1])] for l in ref_map if not l[0] is None and not l[1] is None]
                link_list += temp_link_list
                temp_unlink_list = [str(ul[1]) for ul in ref_map if ul[0] is None or ul[1] is None]
                unlink_list += temp_unlink_list
                for r in ref_map:
                    if not r[0] is None:
                        num_matched += 1
                num_searched += len(ref_map)
                print "MATCHES - {}".format(ref_map)
                # Report comments that only the shortened-dh retries recovered.
                for first, second in zip(first_matches, matches):
                    if first is None and not second is None:
                        print u"GOT {}".format(second)
                acc = round(1.0 * num_matched / num_searched, 5) * 100 if num_searched > 0 else 0.0
                print "ACCURACY - {}%".format(acc)
                # log.write("MATCHES - {}\n".format(temp_link_list))
                # log.write("NOT FOUND - {}\n".format(unlink_list))
                # log.write("ACCURACY - {}%\n".format(round(1.0 * num_matched / num_searched, 5) * 100))
            print u'----- {} {} End -----'.format(mesechta, daf)
        # One output file per mesechta for matched and unmatched links.
        with open('{}/{}.json'.format(matched_dir, mesechta), 'wb') as out:
            json.dump(link_list, out, indent=4)
        with open('{}/{}.json'.format(not_matched_dir, mesechta), 'wb') as out:
            json.dump(unlink_list, out, indent=4)
def match(self, dh_extraction_method, base_tokenizer, rashi_filter, matched_dir, not_matched_dir): """ This function matches between a whole commentary and every one of the mesechtot it comments on e.g. all of Rashba against the mesechtot Rashba comments on It outputs json in the specified directories with the links :param dh_extraction_method: see dibur_hamatchil_matcher.match_ref() :param base_tokenizer: see dibur_hamatchil_matcher.match_ref() :param rashi_filter: see dibur_hamatchil_matcher.match_ref() :param matched_dir: directory where output of matched links will be saved :param not_matched_dir: directory where output of not_matched links will be saved :return: None """ num_matched = 0 num_searched = 0 for mesechta in self.mes_list: link_list = [] unlink_list = [] comment = library.get_index("{} {}".format(self.commentary_pattern, mesechta)) gemara = library.get_index("{}".format(mesechta)) comment_ref_list = comment.all_section_refs() gemara_ref_list = gemara.all_section_refs() gemara_ind = 0 for icomment, comment_ref in enumerate(comment_ref_list): while gemara_ref_list[gemara_ind].normal_last_section() != comment_ref.normal_last_section(): gemara_ind += 1 gemara_ref = gemara_ref_list[gemara_ind] orig_gemara_ref = gemara_ref print u'----- {} Start ({}/{})-----'.format(orig_gemara_ref, icomment, len(comment_ref_list)) comment_tc = TextChunk(comment_ref, "he") # let's extend the range of gemara_tc to account for weird rashba stuff num_refs_to_expand = 2 gemara_ref_before = gemara_ref.prev_section_ref() gemara_ref_after = gemara_ref.next_section_ref() if gemara_ref_before and len(gemara_ref_before.all_subrefs()) >= num_refs_to_expand: gemara_ref_expanded = gemara_ref_before.all_subrefs()[-num_refs_to_expand].to(gemara_ref) if gemara_ref_after and len(gemara_ref_after.all_subrefs()) >= num_refs_to_expand: gemara_ref_expanded = gemara_ref_expanded.to(gemara_ref_after.all_subrefs()[num_refs_to_expand - 1]) vtitle = 'William Davidson Edition - Aramaic' if mesechta 
in self.all_william else None try: gemara_tc = TextChunk(gemara_ref_expanded, lang='he', vtitle=vtitle) except Exception: gemara_tc = TextChunk(gemara_ref, lang='he', vtitle=vtitle) matched = dibur_hamatchil_matcher.match_ref(gemara_tc, comment_tc, base_tokenizer=base_tokenizer, dh_extract_method=dh_extraction_method, verbose=False, with_abbrev_matches=True, boundaryFlexibility=10000, char_threshold=0.4, rashi_filter=rashi_filter) first_matches = matched['matches'] match_indices = matched['match_word_indices'] for i in range(1, 5): # let's try again, but with shorter dhs and imposing order start_pos = i * 2 def dh_extraction_method_short(s): dh = dh_extraction_method(s) dh_split = re.split(ur'\s+', dh) if len(dh_split) > start_pos + 4: dh = u' '.join(dh_split[start_pos:start_pos + 4]) return dh matched = dibur_hamatchil_matcher.match_ref(gemara_tc, comment_tc, base_tokenizer=base_tokenizer, dh_extract_method=dh_extraction_method_short, verbose=False, with_abbrev_matches=True, boundaryFlexibility=4, prev_matched_results=match_indices, rashi_filter=rashi_filter) match_indices = matched['match_word_indices'] if 'comment_refs' not in matched: print 'NO COMMENTS' continue matches = matched['matches'] comment_refs = matched['comment_refs'] ref_map = zip(matches, comment_refs) temp_link_list = [[str(l[0]), str(l[1])] for l in ref_map if not l[0] is None and not l[1] is None] link_list += temp_link_list temp_unlink_list = [str(ul[1]) for ul in ref_map if ul[0] is None or ul[1] is None] unlink_list += temp_unlink_list for r in ref_map: if not r[0] is None: num_matched += 1 num_searched += len(ref_map) print "MATCHES - {}".format(ref_map) for first, second in zip(first_matches, matches): if first is None and not second is None: print u"GOT {}".format(second) print "ACCURACY - {}%".format(round(1.0 * num_matched / num_searched, 5) * 100) # log.write("MATCHES - {}\n".format(temp_link_list)) # log.write("NOT FOUND - {}\n".format(unlink_list)) # log.write("ACCURACY - 
{}%\n".format(round(1.0 * num_matched / num_searched, 5) * 100)) print u'----- {} End -----'.format(orig_gemara_ref) with open('{}/{}.json'.format(matched_dir, mesechta), 'wb') as out: json.dump(link_list, out, indent=4) with open('{}/{}.json'.format(not_matched_dir, mesechta), 'wb') as out: json.dump(unlink_list, out, indent=4)
curr_super_word = 0 while curr_word < len(words_scraped): possible_range = range(curr_word, curr_word+chunk_size) if curr_super_word < len(super_word_list) and super_word_list[curr_super_word] - 1 in possible_range: comments_scraped += [u" ".join(words_scraped[curr_word:super_word_list[curr_super_word]])] curr_word = super_word_list[curr_super_word] elif curr_super_word < len(super_word_list) and super_word_list[curr_super_word] in possible_range: temp_chunk = min(super_word_list[curr_super_word+1] - curr_word, super_chunk_size) if curr_super_word + 1 < len(super_word_list) else super_chunk_size comments_scraped += [u" ".join(words_scraped[curr_word:curr_word + temp_chunk])] super_comment_list += [len(comments_scraped)-1] curr_word += temp_chunk curr_super_word += 1 else: comments_scraped += [u" ".join(words_scraped[curr_word:curr_word + chunk_size])] curr_word += chunk_size matched = dibur_hamatchil_matcher.match_ref(daf_ref.text("he"), comments_scraped, base_tokenizer, with_abbrev_matches=True, with_num_abbrevs=False, place_consecutively=False) try: super_sefaria_list = [matched["matches"][sc].normal() for sc in super_comment_list] except AttributeError: print "OH NO!", daf_ref, matched["matches"] super_sefaria_list = [matched["matches"][sc].normal() if matched["matches"][sc] is not None else None for sc in super_comment_list] wiki_snippets = [u" <{}> ".format(bleach.clean(sim, strip=True, tags=[])).join([comments_scraped[sc-1], comments_scraped[sc]]) if sc != 0 else u" <{}> {}".format(bleach.clean(sim, strip=True, tags=[] ), comments_scraped[sc]) for sim, sc in zip(daf_scraped["super_simanim"], super_comment_list)] out_rows += [ {"Index": ind, "Daf": daf_ref.normal(), "Siman": bleach.clean(sim, strip=True, tags=[]), "Siman Ref": Ref(sim_ref).starting_ref().normal() if sim_ref is not None else u"N/A",
def match(self, dh_extraction_method, base_tokenizer, rashi_filter, matched_dir, not_matched_dir):
    """
    This function matches between a whole commentary and every one of the mesechtot it comments on
    e.g. all of Rashba against the mesechtot Rashba comments on
    It outputs json in the specified directories with the links
    :param dh_extraction_method: see dibur_hamatchil_matcher.match_ref()
    :param base_tokenizer: see dibur_hamatchil_matcher.match_ref()
    :param rashi_filter: see dibur_hamatchil_matcher.match_ref()
    :param matched_dir: directory where output of matched links will be saved
    :param not_matched_dir: directory where output of not_matched links will be saved
    :return: None
    """
    num_matched = 0
    num_searched = 0
    for mesechta in self.mes_list:
        link_list = []
        unlink_list = []
        comment = library.get_index("{} {}".format(self.commentary_pattern, mesechta))
        gemara = library.get_index("{}".format(mesechta))
        comment_ref_list = comment.all_section_refs()
        gemara_ref_list = gemara.all_section_refs()
        gemara_ind = 0
        for icomment, comment_ref in enumerate(comment_ref_list):
            # set gemara ref to the same daf as our comment_ref
            # NOTE(review): assumes every commentary daf exists in the gemara; otherwise
            # this walks off the end of gemara_ref_list and raises IndexError.
            while gemara_ref_list[gemara_ind].normal_last_section() != comment_ref.normal_last_section():
                gemara_ind += 1
            gemara_ref = gemara_ref_list[gemara_ind]
            orig_gemara_ref = gemara_ref
            print('----- {} Start ({}/{})-----'.format(orig_gemara_ref, icomment, len(comment_ref_list)))
            comment_tc = TextChunk(comment_ref, "he")
            # let's extend the range of gemara_tc to account for weird rashba stuff
            # (comments at the top/bottom of a daf may cite text from the adjacent daf)
            num_refs_to_expand = 2
            gemara_ref_before = gemara_ref.prev_section_ref()
            gemara_ref_after = gemara_ref.next_section_ref()
            gemara_ref_expanded = gemara_ref
            if gemara_ref_before and len(gemara_ref_before.all_subrefs()) >= num_refs_to_expand:
                gemara_ref_expanded = gemara_ref_before.all_subrefs()[-num_refs_to_expand].to(gemara_ref)
            if gemara_ref_after and len(gemara_ref_after.all_subrefs()) >= num_refs_to_expand:
                gemara_ref_expanded = gemara_ref_expanded.to(gemara_ref_after.all_subrefs()[num_refs_to_expand - 1])
            vtitle = 'William Davidson Edition - Aramaic' if mesechta in self.all_william else None
            gemara_tc = TextChunk(gemara_ref_expanded, lang='he', vtitle=vtitle)
            # first pass: full dibur hamatchil, very loose ordering constraint
            matched = dibur_hamatchil_matcher.match_ref(gemara_tc, comment_tc, base_tokenizer=base_tokenizer,
                                                        dh_extract_method=dh_extraction_method, verbose=False,
                                                        with_abbrev_matches=True, boundaryFlexibility=10000,
                                                        char_threshold=0.4, rashi_filter=rashi_filter)
            first_matches = matched['matches']
            match_indices = matched['match_word_indices']
            for i in range(1, 5):
                # let's try again, but with shorter dhs and imposing order
                start_pos = i * 2

                def dh_extraction_method_short(s):
                    # use a 4-word window of the dh starting at start_pos
                    dh = dh_extraction_method(s)
                    dh_split = re.split(r'\s+', dh)
                    if len(dh_split) > start_pos + 4:
                        dh = ' '.join(dh_split[start_pos:start_pos + 4])
                    return dh

                matched = dibur_hamatchil_matcher.match_ref(gemara_tc, comment_tc, base_tokenizer=base_tokenizer,
                                                            dh_extract_method=dh_extraction_method_short, verbose=False,
                                                            with_abbrev_matches=True, boundaryFlexibility=4,
                                                            prev_matched_results=match_indices,
                                                            rashi_filter=rashi_filter)
                match_indices = matched['match_word_indices']
            if 'comment_refs' not in matched:
                print('NO COMMENTS')
                continue
            matches = matched['matches']
            comment_refs = matched['comment_refs']
            ref_map = list(zip(matches, comment_refs))
            temp_link_list = [[str(l[0]), str(l[1])] for l in ref_map if not l[0] is None and not l[1] is None]
            link_list += temp_link_list
            temp_unlink_list = [str(ul[1]) for ul in ref_map if ul[0] is None or ul[1] is None]
            unlink_list += temp_unlink_list
            for r in ref_map:
                if not r[0] is None:
                    num_matched += 1
            num_searched += len(ref_map)
            print("MATCHES - {}".format(ref_map))
            for first, second in zip(first_matches, matches):
                if first is None and not second is None:
                    print("GOT {}".format(second))
            # FIX: guard against ZeroDivisionError when nothing has been searched yet
            # (consistent with the guard already used in match_multiple)
            acc = round(1.0 * num_matched / num_searched, 5) * 100 if num_searched > 0 else 0.0
            print("ACCURACY - {}%".format(acc))
            print('----- {} End -----'.format(orig_gemara_ref))
        # FIX: json.dump writes str, so the file must be opened in text mode on
        # Python 3 ('wb' raises TypeError; 'w' also works on Python 2)
        with open('{}/{}.json'.format(matched_dir, mesechta), 'w') as out:
            json.dump(link_list, out, indent=4)
        with open('{}/{}.json'.format(not_matched_dir, mesechta), 'w') as out:
            json.dump(unlink_list, out, indent=4)
def match_multiple(self, base_patterns, split_into_base_texts, rules, dh_extraction_methods, base_tokenizers, rashi_filters, matched_dir, not_matched_dir):
    """
    This function is used when a commentary matches to multiple base texts. e.g. Maharam Shif sometimes links
    to Rashi, sometimes Gemara
    :param list base_patterns: list of base text patterns. e.g. for Ritva the pattern would be "Ritva on".
        for Gemara, the pattern would be "" because the mesechta is appended automatically. len() == len(rules)
    :param function split_into_base_texts: f(list[str], TextChunk) -> list, list
        function that takes rules and outputs which refs in the commentary should be matched to which base text.
        For an example implementation, look at
        Sefaria-Data/research/dibur_hamatchil/dh_source_scripts/gemara_commentaries/maharam_shif/maharam_shif.py
    :param list rules: list of regex to discriminate into different base texts
    :param list dh_extraction_methods: list of dh_extraction_methods. len() == len(rules)
    :param list base_tokenizers: list of base_tokenizers. len() == len(rules)
    :param list rashi_filters: ditto
    :param str matched_dir:
    :param str not_matched_dir:
    :return: None
    """
    num_matched = 0
    num_searched = 0
    for mesechta in self.mes_list:
        link_list = []
        unlink_list = []
        comment = library.get_index("{} {}".format(self.commentary_pattern, mesechta))
        comment_ref_list = comment.all_section_refs()
        for icomment, comment_ref in enumerate(comment_ref_list):
            daf = comment_ref.normal_last_section()
            print('-----{} {} Start ({}/{})-----'.format(mesechta, daf, icomment, len(comment_ref_list)))
            comment_tc = TextChunk(comment_ref, "he")
            splitted, oto_dibur = split_into_base_texts(rules, comment_tc)
            for (temp_comment_refs, temp_comment_texts), temp_dh_extract, temp_base_tokenizer, temp_rashi_filter, base_pattern in zip(splitted, dh_extraction_methods, base_tokenizers, rashi_filters, base_patterns):
                print("--- DOING {} {} ---".format(base_pattern, mesechta))
                temp_base_ref = Ref("{} {} {}".format(base_pattern, mesechta, daf))
                # extend the base ref into the neighboring dapim so comments at the
                # edges of the daf can still match
                num_refs_to_expand = 2
                gemara_ref_before = temp_base_ref.prev_section_ref()
                gemara_ref_after = temp_base_ref.next_section_ref()
                if gemara_ref_before:
                    try:
                        if len(gemara_ref_before.all_subrefs()) >= num_refs_to_expand:
                            temp_base_ref = gemara_ref_before.all_subrefs()[-num_refs_to_expand].to(temp_base_ref)
                        else:
                            temp_base_ref = gemara_ref_before.all_subrefs()[0].to(temp_base_ref)
                    except InputError:
                        pass  # there was a problem extending. ignore
                if gemara_ref_after:
                    try:
                        if len(gemara_ref_after.all_subrefs()) >= num_refs_to_expand:
                            temp_base_ref = temp_base_ref.to(gemara_ref_after.all_subrefs()[num_refs_to_expand - 1])
                        else:
                            temp_base_ref = temp_base_ref.to(gemara_ref_after.all_subrefs()[-1])
                    except InputError:
                        pass
                temp_base_tc = temp_base_ref.text("he")
                try:
                    # first pass: full dibur hamatchil, very loose ordering constraint
                    matched = dibur_hamatchil_matcher.match_ref(temp_base_tc, temp_comment_texts, temp_base_tokenizer,
                                                                dh_extract_method=temp_dh_extract, verbose=False,
                                                                with_abbrev_matches=True, boundaryFlexibility=10000,
                                                                char_threshold=0.4, rashi_filter=temp_rashi_filter)
                except IndexError as e:
                    # thrown when the base text is empty; skip this base text
                    print(e)
                    continue
                first_matches = matched['matches']
                match_indices = matched['match_word_indices']
                for i in range(1, 5):
                    # let's try again, but with shorter dhs and imposing order
                    start_pos = i * 2

                    def dh_extraction_method_short(s):
                        # use a 4-word window of the dh starting at start_pos
                        dh = temp_dh_extract(s)
                        dh_split = re.split(r'\s+', dh)
                        if len(dh_split) > start_pos + 4:
                            dh = ' '.join(dh_split[start_pos:start_pos + 4])
                        return dh

                    matched = dibur_hamatchil_matcher.match_ref(temp_base_tc, temp_comment_texts, temp_base_tokenizer,
                                                                dh_extract_method=dh_extraction_method_short,
                                                                verbose=False, with_abbrev_matches=True,
                                                                boundaryFlexibility=4,
                                                                prev_matched_results=match_indices,
                                                                rashi_filter=temp_rashi_filter)
                    match_indices = matched['match_word_indices']
                matches = matched['matches']
                ref_map = list(zip(matches, temp_comment_refs))  # assumption that rashi_filter doesn't do anything
                # add oto_diburs to ref_map
                # FIX: iterate over a snapshot instead of appending to ref_map while a
                # reversed() iterator over it is live — the old code only worked by
                # accident of CPython's reverse-iterator implementation.
                for br, cr in list(reversed(ref_map)):
                    if str(cr) in oto_dibur:
                        oto_dibured = oto_dibur[str(cr)]
                        for od in oto_dibured:
                            ref_map += [(br, od)]
                #TODO add super-base link if this is a super-commentary
                temp_link_list = [[str(l[0]), str(l[1])] for l in ref_map if not l[0] is None and not l[1] is None]
                link_list += temp_link_list
                temp_unlink_list = [str(ul[1]) for ul in ref_map if ul[0] is None or ul[1] is None]
                unlink_list += temp_unlink_list
                for r in ref_map:
                    if not r[0] is None:
                        num_matched += 1
                num_searched += len(ref_map)
                print("MATCHES - {}".format(ref_map))
                for first, second in zip(first_matches, matches):
                    if first is None and not second is None:
                        print("GOT {}".format(second))
                acc = round(1.0 * num_matched / num_searched, 5) * 100 if num_searched > 0 else 0.0
                print("ACCURACY - {}%".format(acc))
            print('----- {} {} End -----'.format(mesechta, daf))
        # FIX: json.dump writes str, so the file must be opened in text mode on
        # Python 3 ('wb' raises TypeError; 'w' also works on Python 2)
        with open('{}/{}.json'.format(matched_dir, mesechta), 'w') as out:
            json.dump(link_list, out, indent=4)
        with open('{}/{}.json'.format(not_matched_dir, mesechta), 'w') as out:
            json.dump(unlink_list, out, indent=4)
curr_word, super_chunk_size) if curr_super_word + 1 < len( super_word_list) else super_chunk_size comments_scraped += [ u" ".join(words_scraped[curr_word:curr_word + temp_chunk]) ] super_comment_list += [len(comments_scraped) - 1] curr_word += temp_chunk curr_super_word += 1 else: comments_scraped += [ u" ".join(words_scraped[curr_word:curr_word + chunk_size]) ] curr_word += chunk_size matched = dibur_hamatchil_matcher.match_ref(daf_ref.text("he"), comments_scraped, base_tokenizer, with_abbrev_matches=True, with_num_abbrevs=False, place_consecutively=False) try: super_sefaria_list = [ matched["matches"][sc].normal() for sc in super_comment_list ] except AttributeError: print "OH NO!", daf_ref, matched["matches"] super_sefaria_list = [ matched["matches"][sc].normal() if matched["matches"][sc] is not None else None for sc in super_comment_list ]