def Gemara(self, daf, gemara_in_order):
    """Link one Maharam comment to the gemara line(s) it was matched to.

    Advances the running comment counters, normalizes the match result for
    the current gemara line (strips the '0:' no-match prefix, expands an
    'a-b' line range), and appends a commentary link between the gemara ref
    and the corresponding "Maharam on <masechet>" segment.

    Relies on the module-level ``masechet`` plus the project's ``Ref`` and
    ``AddressTalmud`` helpers.
    """
    self.maharam_line += 1
    self.gemara_line += 1
    # Strip the "0:" marker the matcher prepends when part of a match failed.
    gemara_in_order[self.gemara_line] = gemara_in_order[
        self.gemara_line].replace('0:', '')
    # "a-b" means the comment spans a range of lines; otherwise it is a
    # single line and the range collapses to one endpoint.
    if gemara_in_order[self.gemara_line].find('-') >= 0:
        in_order, out_order = gemara_in_order[self.gemara_line].split('-')
    else:
        in_order = gemara_in_order[self.gemara_line]
        out_order = in_order
    masechet_daf_line_start = masechet + " " + AddressTalmud.toStr(
        "en", daf) + ":" + in_order
    masechet_daf_line_end = masechet + " " + AddressTalmud.toStr(
        "en", daf) + ":" + out_order
    try:
        masechet_daf_line = Ref(masechet_daf_line_start).to(
            Ref(masechet_daf_line_end)).normal()
    except Exception:
        # Was a bare except; narrowed so SystemExit/KeyboardInterrupt are
        # not swallowed. Fall back to the start ref if the range is invalid.
        masechet_daf_line = masechet_daf_line_start
    self.links_to_post.append({
        "refs": [
            masechet_daf_line,
            "Maharam on " + masechet + "." + AddressTalmud.toStr("en", daf)
            + "." + str(self.maharam_line)
        ],
        "type": "commentary",
        "auto": True,
        "generated_by": "Maharam on " + masechet + " linker",
    })
def post(text, dh_dict, tractate):
    """Upload the Ramban text for *tractate*, then link each comment to its gemara line."""
    post_text("Chiddushei Ramban on " + tractate, {
        "text": convertDictToArray(text),
        "versionTitle": "Ramban on Talmud",
        "versionSource": "http://www.sefaria.org",
        "language": "he"
    })
    matcher = Match(in_order=True, min_ratio=80, guess=False,
                    range=True, can_expand=False)
    tractate_pages = get_text_plus(tractate)['he']
    links = []
    for daf in sorted(dh_dict.keys()):
        daf_str = AddressTalmud.toStr("en", daf)
        matches = matcher.match_list(dh_dict[daf], tractate_pages[daf - 1],
                                     tractate + " " + daf_str)
        for comment_n, matched in matches.iteritems():
            # "0:" prefixes a partially failed match; drop it.
            matched = matched.replace("0:", "")
            links.append({
                'refs': [
                    "{}.{}.{}".format(tractate, daf_str, matched),
                    "Chiddushei_Ramban_on_{}.{}.{}".format(tractate, daf_str, comment_n)
                ],
                'type': 'commentary',
                'auto': 'True',
                'generated_by': "ramban" + tractate
            })
    post_link(links)
def post(text, dh_dict, tractate):
    """Post the Chiddushei Ramban text for *tractate*, then link comments to the gemara.

    ``match`` is expected to be a module-level Match instance; ``dh_dict``
    maps daf numbers to lists of dibur-hamatchil strings.
    """
    text_array = convertDictToArray(text)
    send_text = {
        "text": text_array,
        "versionTitle": "Chiddushei HaRamban, Jerusalem 1928-29",
        "versionSource": "http://primo.nli.org.il/primo_library/libweb/action/dlDisplay.do?vid=NLI&docId=NNL_ALEPH001294828",
        "language": "he"
    }
    post_text("Chiddushei Ramban on " + tractate, send_text)
    links_to_post = []
    for daf in sorted(dh_dict.keys()):
        dh_list = dh_dict[daf]
        daf_text = Ref(tractate + " " + AddressTalmud.toStr("en", daf)).text('he').text
        results = match.match_list(
            dh_list, daf_text, tractate + " " + AddressTalmud.toStr("en", daf))
        for key, value in results.iteritems():
            # "0:" prefixes a partially failed match; drop it before building the ref.
            value = value.replace("0:", "")
            # The original assigned talmud_end twice back-to-back; the
            # duplicate was dead code and has been removed.
            talmud_end = tractate + "." + AddressTalmud.toStr("en", daf) + "." + value
            ramban_end = "Chiddushei_Ramban_on_" + tractate + "." + \
                AddressTalmud.toStr("en", daf) + "." + str(key)
            links_to_post.append({'refs': [talmud_end, ramban_end],
                                  'type': 'commentary',
                                  'auto': 'True',
                                  'generated_by': "ramban" + tractate})
    post_link(links_to_post)
def match_and_link(dhs, masechet): def base_tokenizer(str): str = re.sub(ur"\([^\(\)]+\)", u"", str) word_list = re.split(ur"\s+", str) word_list = [w for w in word_list if w] # remove empty strings return word_list links = [] for daf in dhs: talmud_text = TextChunk(Ref(masechet + "." + AddressTalmud.toStr("en", daf)), lang="he") result = match_ref(talmud_text, dhs[daf], base_tokenizer=base_tokenizer, create_ranges=True)['matches'] if result != [None]: for count, line in enumerate(result): assert line is not None Ritva_end = "Ritva on " + masechet + "." + str( AddressTalmud.toStr("en", daf)) + "." + str(count + 1) talmud_end = line.normal() links.append({ 'refs': [Ritva_end, talmud_end], 'type': 'commentary', 'auto': 'True', 'generated_by': masechet + "Ritva" }) post_link(links)
def print_out_refs(daf, line, segment, prev_daf, prev_line, prev_segment): second = "{} {}:{}:{}".format(title, AddressTalmud.toStr("en", daf + 1), line + 1, segment + 1) first = "{} {}:{}:{}".format(title, AddressTalmud.toStr("en", prev_daf + 1), prev_line + 1, prev_segment + 1) print "First: {}".format(first) print "Second: {}\n".format(second)
def compileCommentaryIntoPage(title, daf):
    """Collect every Hebrew line of *title* on the given daf into one flat list.

    Walks section refs starting at "<title> <daf>.1" and stops once the ref
    no longer mentions the daf (i.e. we crossed onto the next daf).
    """
    page = []
    daf_str = AddressTalmud.toStr("en", daf)
    ref = Ref(title + " " + daf_str + ".1")
    while ref is not None and ref.normal().find(daf_str) >= 0:
        for line in ref.text('he').text:
            page.append(line)
        # Call next_section_ref() once; the original called it twice per
        # iteration (wasteful, and each call may hit the database).
        next_ref = ref.next_section_ref()
        ref = next_ref if next_ref != ref else None
    return page
def find_misshing_DH(max_length):
    """
    Run through Ritva Makkot, and search for lines with an unreasonable amount of words until the first period.
    :param max_length:
    :return:
    """
    text={}  # NOTE(review): never used
    count, lines = 0, 0  # count = problem lines reported; "lines" actually counts words examined
    curr_daf=0
    probs = codecs.open('probs_ritva.txt', 'w', 'utf-8')
    files = ["chiddushei one.txt","chiddushei two.txt", "chiddushei three.txt", "chiddushei four.txt", "chiddushei five.txt"]
    for file in files:
        open_file = codecs.open(file, 'r', 'utf-8')
        for line in open_file:
            line = line.replace('\n','')
            if len(line)==0:
                continue
            # A '#1 ... #2' span marks a daf header; use it to track the current amud.
            if line.find(u"#")>=0:
                start=line.find(u"#1")
                end=line.find(u"#2")
                if start>end or start==-1 or end==-1:
                    print '# error'
                daf = line[start:end]
                if daf.find(u'ע"ב')>=0:
                    # amud bet of the same daf: advance one amud
                    curr_daf += 1
                elif daf.find(u'דף')>=0:
                    # explicit daf header: gematria value -> amud-aleph index (2n-1)
                    daf = daf.split(u" ")[1]
                    poss_daf = 2*getGematria(daf)-1
                    if poss_daf < curr_daf:
                        print 'daf error'
                    curr_daf = poss_daf
                else:
                    print 'no daf'
            line = line.replace('@1','').replace('@2','')
            words = line.split()
            # Scan for the first period; too many words before it suggests a missing DH.
            for index, word in enumerate(words):
                lines += 1
                if word.find(u'.') >= 0:
                    break
                elif index > max_length:
                    probs.write('file: ' + str(file) + "\n")
                    probs.write('current daf:' + AddressTalmud.toStr('en', curr_daf) + "\n")
                    probs.write('line without DH:\t' + ' '.join(words[:max_length]) + "\n\n\n")
                    count += 1
                    break
            else:
                # for-else: the loop finished without break, i.e. a short line
                # with no period at all -- also suspicious.
                # NOTE(review): reconstructed as a for-else clause; confirm
                # against the original file's indentation.
                probs.write(u'file: ' + str(file) + u"\n")
                probs.write(u'current daf:' + AddressTalmud.toStr('en', curr_daf) + u"\n")
                probs.write(u'line without DH:\t' + u' '.join(words) + u"\n\n\n")
                count += 1
    print count, lines
def compileCommentaryIntoPage(title, daf):
    """Gather the Hebrew text of every section of *title* on this daf into one list."""
    daf_str = AddressTalmud.toStr("en", daf)
    collected = []
    section = title + " " + daf_str + ".1"
    # Follow the API's 'next' pointer until we run out of sections or leave the daf.
    while section is not None and section.find(daf_str) >= 0:
        response = get_text_plus(section)
        collected.extend(response['he'])
        section = response['next']
    return collected
def compileCommentaryIntoPage(title, daf):
    """Return all Hebrew lines of *title* for the given daf, across its sections."""
    page = []
    ref_str = "{} {}.1".format(title, AddressTalmud.toStr("en", daf))
    # Stop when the chain ends or the ref no longer names this daf.
    while ref_str is not None and ref_str.find(AddressTalmud.toStr("en", daf)) >= 0:
        fetched = get_text_plus(ref_str)
        page += fetched['he']
        ref_str = fetched['next']
    return page
def match_and_link(dhs, masechet):
    """Match each daf's Ritva DHs against the gemara text and post the links."""
    matcher = Match(in_order=True, min_ratio=80, guess=False, range=True,
                    can_expand=False)
    links = []
    for daf in dhs:
        daf_str = AddressTalmud.toStr("en", daf)
        gemara_lines = get_text_plus(masechet + "." + daf_str)['he']
        matches = matcher.match_list(dhs[daf], gemara_lines)
        for comment_n in matches:
            # "0:" prefixes a partially failed match; drop it.
            gemara_range = matches[comment_n].replace("0:", "")
            links.append({
                'refs': [
                    "Ritva on " + masechet + "." + str(daf_str) + "." + str(comment_n),
                    masechet + "." + daf_str + "." + gemara_range
                ],
                'type': 'commentary',
                'auto': 'True',
                'generated_by': masechet + "Ritva"
            })
    post_link(links)
def lookForLineInCommentary(title, daf, line_n):
    """Return the ref of the line_n-th overall line of *title* on this daf, or "".

    Counts lines across all sections of the daf; the returned string is
    "<section ref>.<line within section>".
    """
    total_count = 0
    next = title + " " + AddressTalmud.toStr("en", daf) + ":1"
    # Guard against next being None (end of book) before calling .find() --
    # the sibling get_text_plus walkers already perform this check.
    while next is not None and next.find(AddressTalmud.toStr("en", daf)) >= 0:
        text = get_text_plus(next)
        local_count = 0
        for line in text['he']:
            local_count += 1
            total_count += 1
            if total_count == line_n:
                return next + "." + str(local_count)
        next = text['next']
    return ""
def lookForLineInCommentary(title, daf, line_n):
    """Return "<section ref>.<local line>" for the line_n-th line of *title* on daf, or ""."""
    total_count = 0
    daf_str = AddressTalmud.toStr("en", daf)
    ref = Ref(title + " " + daf_str + ":1")
    while ref is not None and ref.normal().find(daf_str) >= 0:
        local_count = 0
        for line in ref.text('he').text:
            local_count += 1
            total_count += 1
            if total_count == line_n:
                return ref.normal() + "." + str(local_count)
        # Single call instead of the original's two next_section_ref() calls
        # per iteration.
        next_ref = ref.next_section_ref()
        ref = next_ref if next_ref != ref else None
    return ""
def lookForLineInCommentary(title, daf, line_n):
    """Find the ref of the line_n-th overall line of *title* on this daf.

    Returns "<section ref>.<line in section>", or "" when the daf has fewer
    than line_n lines.
    """
    total_count = 0
    next = title+" "+AddressTalmud.toStr("en", daf)+":1"
    # None-guard added: text['next'] is None at the end of the book, and the
    # original would have crashed calling .find() on it.
    while next is not None and next.find(AddressTalmud.toStr("en", daf)) >= 0:
        text = get_text_plus(next)
        local_count = 0
        for line in text['he']:
            local_count += 1
            total_count += 1
            if total_count == line_n:
                return next+"."+str(local_count)
        next = text['next']
    return ""
def postLinks(self):
    """Match each daf's Maharam DHs against Tosafot/Rashi/gemara and post the links.

    Relies on the module-level ``masechet`` and on per-instance dicts filled
    during parsing. NOTE(review): the ``daf < 179`` skip looks like a
    resume-point from an interrupted run -- confirm before reuse.
    """
    def base_tokenizer(str):
        str = re.sub(ur"\([^\(\)]+\)", u"", str)  # drop parenthetical glosses
        word_list = re.split(ur"\s+", str)
        word_list = [w for w in word_list if w]  # remove empty strings
        return word_list
    mishnah_in_order = {}
    mishnah_out_order = {}
    links_to_post = []  # NOTE(review): unused; links accumulate on self.links_to_post
    for daf in sorted(self.dh1_dict.keys()):
        if daf < 179:
            continue
        print daf
        # Reset the per-daf counters consumed by RashiOrTosafot/Gemara.
        self.maharam_line = 0
        self.rashi_line = -1
        self.tosafot_line = -1
        self.gemara_line = -1
        mishnah_line = 0
        tosafot1_arr = self.tosafot1_dict[daf]
        rashi1_arr = self.rashi1_dict[daf]
        gemara1_arr = self.gemara1_dict[daf]
        print "matching tosafot"+str(len(tosafot1_arr))
        tosafot_text = Ref("Tosafot on "+masechet+"."+AddressTalmud.toStr("en", daf)).text('he')
        tosafot1_arr = [text.decode('utf-8') for text in tosafot1_arr]
        tosafot_in_order = match_ref(tosafot_text, tosafot1_arr, base_tokenizer, self.dh_extract_method, verbose=True)
        tosafot_in_order = self.convertToOldFormat(tosafot_in_order)
        # Rashi matching is skipped for Bava Batra after daf 57 -- presumably
        # because Rashbam takes over there (TODO confirm). NOTE(review):
        # rashi_in_order is then unbound if a 'rashi' category still appears.
        if not (masechet == "Bava Batra" and daf > 57):
            print "matching rashi"+str(len(rashi1_arr))
            rashi_text = Ref("Rashi on "+masechet+"."+AddressTalmud.toStr("en", daf)).text('he')
            rashi1_arr = [text.decode('utf-8') for text in rashi1_arr]
            rashi_in_order = match_ref(rashi_text, rashi1_arr, base_tokenizer, self.dh_extract_method, verbose=True)
            rashi_in_order = self.convertToOldFormat(rashi_in_order)
        print "matching gemara"+str(len(gemara1_arr))
        gemara_text = Ref(masechet+" "+AddressTalmud.toStr("en", daf)).text('he')
        gemara1_arr = [text.decode('utf-8') for text in gemara1_arr]
        gemara_in_order = match_ref(gemara_text, gemara1_arr, base_tokenizer, self.dh_extract_method, verbose=True)
        gemara_in_order = self.convertToOldFormat(gemara_in_order)
        dh1_arr = self.dh1_dict[daf]  # NOTE(review): unused
        print "done matching"
        # Replay the comments in document order, dispatching by category.
        for category, dh in self.dh1_dict[daf]:
            print category
            if category == 'rashi' or category == 'tosafot':
                self.RashiOrTosafot(daf, category, rashi_in_order, tosafot_in_order)
            elif category == 'gemara':
                self.Gemara(daf, gemara_in_order)
            #elif category == "mishnah":
            #    self.Mishnah(daf, mishnah_in_order)
            elif category == 'paragraph' and self.maharam_line == 0:
                # A free-standing opening paragraph still consumes a line number.
                self.maharam_line+=1
    post_link(self.links_to_post)
def Gemara(self, daf, results):
    """Record a link (or a miss) for the next gemara match of this commentary."""
    self.maharam_line += 1
    self.which_line['gemara'] += 1
    comm_ref = (self.title + " on " + self.masechet + "."
                + AddressTalmud.toStr("en", daf) + "." + str(self.maharam_line))
    matched = results['gemara'][self.which_line['gemara']]
    # '0' is the matcher's sentinel for "no match found".
    if matched == '0':
        self.missing_ones.append(comm_ref)
    else:
        self.links_to_post.append({
            "refs": [matched, comm_ref],
            "type": "commentary",
            "auto": True,
            "generated_by": self.title + self.masechet + " linker",
        })
def Gemara(self, daf, gemara_in_order):
    """Link the next Maharam comment to its matched gemara line, or log a miss."""
    self.maharam_line += 1
    self.gemara_line += 1
    maharam_ref = ("Maharam on " + masechet + "." +
                   AddressTalmud.toStr("en", daf) + "." + str(self.maharam_line))
    matched = gemara_in_order[self.gemara_line]
    if matched == '0':  # sentinel: the matcher found nothing for this comment
        self.missing_ones.append(maharam_ref)
    else:
        self.links_to_post.append({
            "refs": [matched, maharam_ref],
            "type": "commentary",
            "auto": True,
            "generated_by": "Maharam on " + masechet + " linker",
        })
def getTC(self, category, daf, masechet):
    """Return the Hebrew TextChunk for *category* on this daf.

    For "rashi", falls back to the Rashbam text when the Rashi text is
    empty. Returns None for unrecognized categories.
    """
    daf_str = AddressTalmud.toStr("en", daf)
    if category == "tosafot":
        return Ref("Tosafot on " + masechet + "." + daf_str).text('he')
    if category == "gemara":
        return Ref(masechet + " " + daf_str).text('he')
    if category == "rashi":
        rashi = Ref("Rashi on " + masechet + "." + daf_str).text('he')
        if len(rashi.text) == 0:
            return Ref("Rashbam on " + masechet + "." + daf_str).text('he')
        return rashi
def getLog(siman, result, dh_dict, comm):
    """Build log strings for DHs that were unmatched or ambiguously matched.

    ``result`` maps comment number -> list of candidate line numbers, where
    a leading 0 means "not found". Reads module-level ``title_book`` and
    ``title_comm``.
    """
    log = []
    for key in result:
        line_n = result[key]
        if line_n[0] == 0:
            # No match at all: record the DH and a link to the daf.
            append_str = (
                "did not find dh:\n" + str(dh_dict[siman][key - 1]) + "\n in "
                + title_book + ", Daf " + AddressTalmud.toStr("en", siman) + ":"
            )
            append_str += "\nwww.sefaria.org/" + title_book.replace(" ", "_") + "." + AddressTalmud.toStr("en", siman)
            append_str += "\ntext:<b>" + str(dh_dict[siman][key - 1]) + ".</b> " + str(comm[siman][key - 1]) + "\n\n"
            log.append(append_str)
        elif len(line_n) > 1:
            # Multiple candidates: the first is the best guess; list the rest.
            bestGuess = line_n[0]
            guess_str = (
                "looked for dh:\n" + str(dh_dict[siman][key - 1]) + "\n in "
                + title_book + ", Daf " + AddressTalmud.toStr("en", siman)
            )
            guess_str += " and guessed the dh matches to line " + str(bestGuess) + ":"
            title_c = title_comm.replace(" ", "_")
            guess_str += "\nwww.sefaria.org/" + title_c + "." + AddressTalmud.toStr("en", siman) + "." + str(bestGuess)
            guess_str += "\nbut other options include:\n"
            for guess in line_n:
                if guess != line_n[0]:
                    title = title_book.replace(" ", "_")
                    guess_str += (
                        "line " + str(guess) + ": www.sefaria.org/" + title + "."
                        + AddressTalmud.toStr("en", siman) + "." + str(guess) + " ,\n"
                    )
            guess_str = guess_str[0:-1]  # drop the trailing "\n" before the blank lines
            log.append(guess_str + "\n\n")
    return log
def Commentary(self, daf, category, results):
    """Link the next comment of this title both to its base text and to the gemara."""
    self.maharam_line += 1
    self.which_line[category] += 1
    title = category.title() + " on " + self.masechet
    base_ref = results[category][self.which_line[category]]
    comm_ref = (self.title + " on " + self.masechet + "." +
                AddressTalmud.toStr("en", daf) + "." + str(self.maharam_line))
    if base_ref == '0':
        # '0' means the matcher could not place this comment.
        self.missing_ones.append(comm_ref)
        return
    generated_by = self.title + self.masechet + " linker"
    self.links_to_post.append({
        "refs": [base_ref, comm_ref],
        "type": "commentary",
        "auto": True,
        "generated_by": generated_by
    })
    gemara_ref = self.getGemaraRef(base_ref)
    self.links_to_post.append({
        "refs": [comm_ref, gemara_ref],
        "type": "commentary",
        "auto": True,
        "generated_by": generated_by
    })
def Commentary(self, daf, category, results):
    """Link the next comment of this title to its matched base ref and to the gemara.

    '0' in ``results`` marks a comment the matcher could not place; those
    are recorded in self.missing_ones instead of being linked.
    """
    self.maharam_line += 1
    self.which_line[category] += 1
    title = category.title() + " on " + self.masechet  # NOTE(review): assigned but unused
    base_ref = results[category][self.which_line[category]]
    comm_ref = self.title+" on "+self.masechet+"."+AddressTalmud.toStr("en", daf)+"."+str(self.maharam_line)
    if base_ref == '0':
        self.missing_ones.append(comm_ref)
    else:
        # Two links: comment <-> base commentary ref, and comment <-> gemara ref.
        self.links_to_post.append({
            "refs": [
                base_ref,
                comm_ref
            ],
            "type": "commentary",
            "auto": True,
            "generated_by": self.title+self.masechet+" linker"
        })
        gemara_ref = self.getGemaraRef(base_ref)
        self.links_to_post.append({
            "refs": [
                comm_ref,
                gemara_ref
            ],
            "type": "commentary",
            "auto": True,
            "generated_by": self.title+self.masechet+" linker"
        })
def find_matches(gemara, tosafot, rashi):
    # what needs to be done is to go through each dict and try to match everything, but check each segment that if it is בא"ד
    # ignore if it has a match and match it to previous segment's match
    # and if no match: link with previous segment (as a range) as if this comment really has no DH which is why it has no match
    """Replace each dict's per-daf comment lists with their matched Refs, in place.

    Mutates *gemara*, *tosafot* and *rashi* (and also returns them). A
    comment opening with בא"ד inherits the previous comment's match.
    Hard-coded to Ketubot.
    """
    nones = total = 0  # NOTE(review): counters are never updated below
    for pairs in [(tosafot, "Tosafot on Ketubot"), (gemara, "Ketubot"), (rashi, "Rashi on Ketubot")]:
        orig_dict = dict(pairs[0])  # shallow snapshot later handed to create_ranges
        which_dict = pairs[0]
        which_text = pairs[1]
        for daf in which_dict.keys():
            actual_daf = AddressTalmud.toStr("en", daf)
            base_text = TextChunk(Ref("{} {}".format(which_text, actual_daf)), lang='he')
            if not base_text.text:
                continue
            comments = which_dict[daf]
            results = match_ref(base_text, comments, lambda x: x.split(), dh_extract_method=dh_extract)
            for i, result_comment in enumerate(zip(results["matches"], comments)):
                result, comment = result_comment
                comment_wout_bold = comment.replace("<b>", "").replace("</b>", "")
                # A בא"ד continuation comment takes the previous comment's match.
                if u"""בא"ד""" in u" ".join(comment_wout_bold.split()[0:3]) \
                or u"""שם בא"ד""" in u" ".join(comment_wout_bold.split()[0:3]):
                    results["matches"][i] = results["matches"][i - 1]
            which_dict[daf] = results["matches"]
        for daf in which_dict.keys():
            if which_dict[daf] and orig_dict[daf]:
                which_dict[daf] = create_ranges(orig_dict, which_dict, which_text, daf)
    return gemara, tosafot, rashi
def getDaf(self, line, current_daf, len_masechet, prev_line): prev_num = self.current_daf orig_line = line line = line.replace("@11 ", "@11") if line.split(" ")[0].find('דף') >= 0: daf_value = getGematria( line.split(" ")[1].replace('"', '').replace("'", '')) if line.split(" ")[2].find(self.amud_bet) >= 0: self.current_daf = 2 * daf_value else: self.current_daf = 2 * daf_value - 1 actual_text = "" start_at = 3 if line.split(" ")[2] not in ['ע"ב', 'ע"א']: start_at = 2 for count, word in enumerate(line.split(" ")): if count >= start_at: actual_text += word + " " else: self.current_daf += 1 actual_text = line[3:] if self.current_daf <= prev_num: he_current = AddressTalmud.toStr("he", self.current_daf) he_prev = AddressTalmud.toStr("he", prev_num) #prev_line = " ".join(prev_line.split(" ")[0:5]) #orig_line = " ".join(orig_line.split(" ")[0:5]) print u"{} before {}\n".format(he_prev, he_current) self.dont_post = True #print u"The line starting: {} is {}\n".format(prev_line, he_prev) #print u"It came before the line starting {}, which is {}\n\n".format(orig_line, he_current) if not self.current_daf in self.dh1_dict: self.dh1_dict[self.current_daf] = [] for each_cat in self.categories: self.dh_by_cat[each_cat][self.current_daf] = [] self.actual_text = actual_text if self.current_daf > len_masechet: print "DAF EXTRA {} > {} in {} {}".format(self.current_daf, len_masechet, self.title, self.masechet) pass self.list_of_dafs.append(self.current_daf) return self.current_daf
def postLinks(self): def base_tokenizer(str): str = re.sub(ur"\([^\(\)]+\)", u"", str) word_list = re.split(ur"\s+", str) word_list = [w for w in word_list if w] # remove empty strings return word_list def dh_extract_method(str): str = str.replace(u'בד"ה', u'').replace(u'וכו', u'') return str ''' 1. strip out "" from dhs with list comprehension 2. make dictionary where each dh str is key and the value is its index in the array ''' links = [] for daf in self.text: dhs_arr = [dh for dh in self.dhs[daf] if len(dh) > 0] gemara_text = Ref("{} {}".format(self.tractate, AddressTalmud.toStr( "en", daf))).text('he') results = match_ref(gemara_text, dhs_arr, base_tokenizer, dh_extract_method=dh_extract_method, verbose=False)['matches'] self.makeDicts(daf) rashba_refs = [] for dh in dhs_arr: rashba_refs.append("Rashba on {} {}.{}".format( self.tractate, AddressTalmud.toStr("en", daf), self.dh_dict[daf][dh] + 1)) link_pairs = zip(rashba_refs, results) for link_pair in link_pairs: if link_pair[1]: links.append({ "refs": [link_pair[0], link_pair[1].normal()], "type": "commentary", "auto": True, "generated_by": "rashba{}".format(self.tractate) }) post_link(links, server=self.server)
def getDaf(self, line, current_daf, len_masechet, prev_line):
    """Parse an explicit or implicit daf header from *line*; update self.current_daf.

    Lines whose first word contains 'דף' carry an explicit daf (gematria)
    and amud; any other line advances one amud. Also primes the per-daf
    dicts and stores the remaining header text in self.actual_text.
    Returns the (possibly updated) current daf/amud index.
    """
    prev_num = self.current_daf
    orig_line = line
    line = line.replace("@11 ", "@11")
    if line.split(" ")[0].find('דף') >= 0:
        daf_value = getGematria(line.split(" ")[1].replace('"', '').replace("'", ''))
        # Amud bet maps to the even index (2n), amud aleph to the odd (2n-1).
        if line.split(" ")[2].find(self.amud_bet) >= 0:
            self.current_daf = 2*daf_value
        else:
            self.current_daf = 2*daf_value - 1
        actual_text = ""
        start_at = 3
        # If the third word is not an amud marker, the text starts one word earlier.
        if line.split(" ")[2] not in ['ע"ב', 'ע"א']:
            start_at = 2
        for count, word in enumerate(line.split(" ")):
            if count >= start_at:
                actual_text += word + " "
    else:
        self.current_daf += 1
        actual_text = line[3:]
    if self.current_daf <= prev_num:
        # Daf order went backwards -- report it and block posting.
        he_current = AddressTalmud.toStr("he", self.current_daf)
        he_prev = AddressTalmud.toStr("he", prev_num)
        #prev_line = " ".join(prev_line.split(" ")[0:5])
        #orig_line = " ".join(orig_line.split(" ")[0:5])
        print u"{} before {}\n".format(he_prev, he_current)
        self.dont_post = True
        #print u"The line starting: {} is {}\n".format(prev_line, he_prev)
        #print u"It came before the line starting {}, which is {}\n\n".format(orig_line, he_current)
    if not self.current_daf in self.dh1_dict:
        self.dh1_dict[self.current_daf] = []
        for each_cat in self.categories:
            self.dh_by_cat[each_cat][self.current_daf] = []
    self.actual_text = actual_text
    if self.current_daf > len_masechet:
        # Diagnostic only; the daf is still recorded below.
        print "DAF EXTRA {} > {} in {} {}".format(self.current_daf, len_masechet, self.title, self.masechet)
        pass
    self.list_of_dafs.append(self.current_daf)
    return self.current_daf
def RashiOrTosafot(self, daf, category, results):
    """Link the next Maharshal comment to its Rashi/Tosafot match and to the gemara."""
    self.maharam_line += 1
    if category == 'rashi':
        self.rashi_line += 1
        title = 'Rashi on ' + masechet
        ref = results[category][self.rashi_line]
    elif category == 'tosafot':
        self.tosafot_line += 1
        title = 'Tosafot on ' + masechet
        ref = results[category][self.tosafot_line]
    maharshal_ref = ("Maharshal on " + masechet + "." +
                     AddressTalmud.toStr("en", daf) + "." + str(self.maharam_line))
    if ref == '0':
        # '0' marks an unmatched comment.
        self.missing_ones.append(maharshal_ref)
    else:
        self.links_to_post.append({
            "refs": [ref, maharshal_ref],
            "type": "commentary",
            "auto": True,
            "generated_by": "Maharshal on " + masechet + " linker"
        })
        gemara_ref = self.getGemaraRef(ref)
        self.links_to_post.append({
            "refs": [maharshal_ref, gemara_ref],
            "type": "commentary",
            "auto": True,
            "generated_by": "Maharshal on " + masechet + " linker"
        })
def create_link_text(source_index, line_number, comment_number):
    """Build a commentary-link dict between a Sanhedrin line and a Yad Ramah comment."""
    amud = AddressTalmud.toStr('en', source_index)
    base_ref = "Sanhedrin {}.{}".format(amud, line_number)
    comm_ref = "Yad Ramah on Sanhedrin {}.{}".format(amud, comment_number)
    return {
        "refs": [base_ref, comm_ref],
        "type": "commentary",
    }
def post(text, dh_dict, tractate):
    """Post the Ramban text for *tractate* and link every matched comment."""
    version = {
        "text": convertDictToArray(text),
        "versionTitle": "Ramban on Talmud",
        "versionSource": "http://www.sefaria.org",
        "language": "he"
    }
    post_text("Chiddushei Ramban on " + tractate, version)
    links_to_post = []
    he_pages = get_text_plus(tractate)['he']
    match = Match(in_order=True, min_ratio=80, guess=False, range=True,
                  can_expand=False)
    for daf in sorted(dh_dict.keys()):
        results = match.match_list(dh_dict[daf], he_pages[daf - 1],
                                   tractate + " " + AddressTalmud.toStr("en", daf))
        for key, value in results.iteritems():
            value = value.replace("0:", "")  # strip the partial-match marker
            talmud_end = tractate + "." + AddressTalmud.toStr("en", daf) + "." + value
            ramban_end = ("Chiddushei_Ramban_on_" + tractate + "." +
                          AddressTalmud.toStr("en", daf) + "." + str(key))
            links_to_post.append({'refs': [talmud_end, ramban_end],
                                  'type': 'commentary',
                                  'auto': 'True',
                                  'generated_by': "ramban" + tractate})
    post_link(links_to_post)
def get_matches_for_dict_and_link(dh_dict, base_text_title, commentary_title, talmud=True, lang='he', word_threshold=0.27, server="", rashi_filter=None, dh_extract_method=lambda x: x): def base_tokenizer(str): str_list = str.split(" ") return [str for str in str_list if len(str) > 0] assert len(server) > 0, "Please specify a server" results = {} links = [] matched = 0 total = 0 for daf in dh_dict: print daf dhs = dh_dict[daf] if talmud: base_text_ref = "{} {}".format(base_text_title, AddressTalmud.toStr("en", daf)) comm_ref = "{} on {} {}".format(commentary_title, base_text_title, AddressTalmud.toStr("en", daf)) else: base_text_ref = "{} {}".format(base_text_title, daf) comm_ref = "{} on {} {}".format(commentary_title, base_text_title, daf) base_text = TextChunk(Ref(base_text_ref), lang=lang) comm_text = TextChunk(Ref(comm_ref), lang=lang) results[daf] = match_ref(base_text, comm_text, base_tokenizer=base_tokenizer, word_threshold=word_threshold, rashi_filter=rashi_filter, dh_extract_method=dh_extract_method)["matches"] for count, link in enumerate(results[daf]): if link: base_end = link.normal() comm_end = "{} on {} {}:{}".format(commentary_title, base_text_title, AddressTalmud.toStr("en", daf), count+1) links.append({ "refs": [base_end, comm_end], "auto": True, "type": "commentary", "generated_by": commentary_title+base_text_title }) matched += 1 total += 1 print "Matched: {}".format(matched) print "Total {}".format(total) post_link(links, server=server) return results
def Rosh(self, perek, daf, dh, results):
    """Link the next comment to its matched Rosh segment, if one was found."""
    self.maharam_line += 1
    self.rosh_line += 1
    matched = results[perek - 1][self.rosh_line]
    if matched:
        comm_ref = (self.title + " on " + self.masechet + "." +
                    AddressTalmud.toStr("en", daf) + "." + str(self.maharam_line))
        self.links_to_post.append({
            "refs": [matched.normal(), comm_ref],
            "type": "commentary",
            "auto": True,
            "generated_by": self.title + self.masechet + " linker",
        })
def RashiOrTosafot(self, daf, category, rashi_in_order, tosafot_in_order):
    """Link the next Maharam comment to its matched Rashi or Tosafot ref."""
    if category == 'rashi':
        self.maharam_line += 1
        self.rashi_line += 1
        title = 'Rashi on ' + masechet
        in_order = rashi_in_order[self.rashi_line]
    elif category == 'tosafot':
        self.maharam_line += 1
        self.tosafot_line += 1
        title = 'Tosafot on ' + masechet
        in_order = tosafot_in_order[self.tosafot_line]
    maharam_ref = ("Maharam on " + masechet + "." +
                   AddressTalmud.toStr("en", daf) + "." + str(self.maharam_line))
    if in_order == '0':
        # '0' marks an unmatched comment.
        self.missing_ones.append(maharam_ref)
    else:
        self.links_to_post.append({
            "refs": [in_order, maharam_ref],
            "type": "commentary",
            "auto": True,
            "generated_by": "Maharam on " + masechet + " linker"})
def post(text, dh_dict, tractate):
    """Post the Chiddushei Ramban text for *tractate* and link comments to the gemara.

    ``match`` is expected to be a module-level Match instance.
    """
    text_array = convertDictToArray(text)
    send_text = {
        "text": text_array,
        "versionTitle": "Chiddushei HaRamban, Jerusalem 1928-29",
        "versionSource": "http://primo.nli.org.il/primo_library/libweb/action/dlDisplay.do?vid=NLI&docId=NNL_ALEPH001294828",
        "language": "he"
    }
    post_text("Chiddushei Ramban on "+tractate, send_text)
    links_to_post = []
    for daf in sorted(dh_dict.keys()):
        dh_list = dh_dict[daf]
        daf_text = Ref(tractate+" "+AddressTalmud.toStr("en", daf)).text('he').text
        results = match.match_list(dh_list, daf_text, tractate+" "+AddressTalmud.toStr("en", daf))
        for key, value in results.iteritems():
            value = value.replace("0:", "")  # strip the partial-match marker
            # Duplicate back-to-back talmud_end assignment removed (dead code).
            talmud_end = tractate + "." + AddressTalmud.toStr("en", daf) + "." + value
            ramban_end = "Chiddushei_Ramban_on_" + tractate + "." + AddressTalmud.toStr("en", daf) + "." + str(key)
            links_to_post.append({'refs': [talmud_end, ramban_end], 'type': 'commentary', 'auto': 'True', 'generated_by': "ramban"+tractate})
    post_link(links_to_post)
def getLog(siman, result, dh_dict, comm):
    """Build human-readable log entries for unmatched and ambiguous DH matches.

    ``result`` maps comment number -> list of candidate line numbers, where
    a leading 0 means "not found". Reads module-level ``title_book`` and
    ``title_comm``.
    """
    log = []
    daf_str = AddressTalmud.toStr("en", siman)
    for key in result:
        line_n = result[key]
        dh = str(dh_dict[siman][key - 1])
        if line_n[0] == 0:
            # No match: record the DH text and a link to the daf.
            entry = "did not find dh:\n{}\n in {}, Daf {}:".format(dh, title_book, daf_str)
            entry += "\nwww.sefaria.org/{}.{}".format(title_book.replace(" ", "_"), daf_str)
            entry += "\ntext:<b>{}.</b> {}\n\n".format(dh, str(comm[siman][key - 1]))
            log.append(entry)
        elif len(line_n) > 1:
            # Ambiguous: first candidate is the best guess, list the others.
            bestGuess = line_n[0]
            entry = "looked for dh:\n{}\n in {}, Daf {}".format(dh, title_book, daf_str)
            entry += " and guessed the dh matches to line " + str(bestGuess) + ":"
            entry += "\nwww.sefaria.org/{}.{}.{}".format(title_comm.replace(" ", "_"), daf_str, bestGuess)
            entry += "\nbut other options include:\n"
            for guess in line_n:
                if guess != line_n[0]:
                    entry += "line {}: www.sefaria.org/{}.{}.{} ,\n".format(
                        guess, title_book.replace(" ", "_"), daf_str, guess)
            entry = entry[0:-1]  # drop the trailing newline before the blank lines
            log.append(entry + "\n\n")
    return log
def postLinks(self):
    """Match the Rashba DHs for every daf against the gemara and post the links."""
    def base_tokenizer(str):
        # NOTE(review): parameter shadows the builtin ``str``.
        str = re.sub(ur"\([^\(\)]+\)", u"", str)  # drop parenthetical glosses
        word_list = re.split(ur"\s+", str)
        word_list = [w for w in word_list if w]  # remove empty strings
        return word_list
    def dh_extract_method(str):
        # NOTE(review): parameter shadows the builtin ``str``.
        str = str.replace(u'בד"ה', u'').replace(u'וכו', u'')
        return str
    '''
    1. strip out "" from dhs with list comprehension
    2. make dictionary where each dh str is key and the value is its index in the array
    '''
    links = []
    for daf in self.text:
        dhs_arr = [dh for dh in self.dhs[daf] if len(dh) > 0]
        gemara_text = Ref("{} {}".format(self.tractate, AddressTalmud.toStr("en", daf))).text('he')
        results = match_ref(gemara_text, dhs_arr, base_tokenizer, dh_extract_method=dh_extract_method, verbose=False)['matches']
        # makeDicts populates self.dh_dict[daf] (DH -> index), used for numbering below.
        self.makeDicts(daf)
        rashba_refs = []
        for dh in dhs_arr:
            rashba_refs.append("Rashba on {} {}.{}".format(self.tractate, AddressTalmud.toStr("en", daf), self.dh_dict[daf][dh]+1))
        link_pairs = zip(rashba_refs, results)
        for link_pair in link_pairs:
            if link_pair[1]:  # skip comments the matcher could not place
                links.append(
                    {
                        "refs": [
                            link_pair[0],
                            link_pair[1].normal()
                        ],
                        "type": "commentary",
                        "auto": True,
                        "generated_by": "rashba{}".format(self.tractate)
                    }
                )
    post_link(links, server=self.server)
def match_and_link(dhs, masechet):
    """Post Ritva-to-gemara links for every daf in *dhs*."""
    matcher = Match(in_order=True, min_ratio=80, guess=False, range=True,
                    can_expand=False)
    all_links = []
    for daf in dhs:
        amud = AddressTalmud.toStr("en", daf)
        he_text = get_text_plus(masechet + "." + amud)['he']
        result = matcher.match_list(dhs[daf], he_text)
        for line in result:
            span = result[line].replace("0:", "")  # strip partial-match marker
            ritva_ref = "Ritva on " + masechet + "." + str(amud) + "." + str(line)
            gemara_ref = masechet + "." + amud + "." + span
            all_links.append({'refs': [ritva_ref, gemara_ref],
                              'type': 'commentary',
                              'auto': 'True',
                              'generated_by': masechet + "Ritva"})
    post_link(all_links)
def match_and_link(text, masechet):
    """Split each line of *text* into DH + comment and match the DHs to the gemara.

    Each element of *text* is one daf's worth of lines; dapim are assumed to
    start at amud index 3. The DH is taken up to "כו'" or the first period,
    falling back to a fixed-length split.

    NOTE(review): the match result is computed but never posted here --
    looks unfinished; confirm against the calling script.
    """
    match = Match(in_order=True, min_ratio=80, guess=False, range=True,
                  can_expand=False)
    for daf_count, daf in enumerate(text):
        dhs = []
        comments = []
        for each_line in daf:
            if each_line.find("כו'") >= 0:
                dh, comment = each_line.split("כו'", 1)
            elif each_line.find(".") >= 0:
                dh, comment = each_line.split(".", 1)
            else:
                # No delimiter at all: split after a fixed number of words.
                dh, comment = splitText(each_line, 10)
            dhs.append(dh)
            comments.append(comment)
        # Removed a leftover pdb.set_trace() debugging breakpoint that halted
        # every iteration.
        talmud_text = get_text_plus(
            masechet + "." + AddressTalmud.toStr("en", daf_count + 3))['he']
        result = match.match_list(dhs, talmud_text)
def getTC(self, category, daf, masechet): if category in ["tosafot", "ran", "rosh"]: title = "{} on {}".format(category.title(), masechet) return Ref(title+"."+AddressTalmud.toStr("en", daf)).text('he') elif category == "gemara": return Ref(masechet+" "+AddressTalmud.toStr("en", daf)).text('he') elif category == "rashi": rashi = Ref("Rashi on "+self.masechet+"."+AddressTalmud.toStr("en", daf)).text('he') if len(rashi.text) == 0: print "rashbam by default {} {}".format(masechet, AddressTalmud.toStr("en", daf)) return Ref("Rashbam on "+self.masechet+"."+AddressTalmud.toStr("en", daf)).text('he') else: return rashi elif category == "rashbam": print "rashbam {} {}".format(masechet, AddressTalmud.toStr("en", daf)) return Ref("Rashbam on "+self.masechet+"."+AddressTalmud.toStr("en", daf)).text('he')
def Mishnah(self, daf, mishnah_in_order):
    """Link the next Maharam comment to its matched Mishnah range.

    NOTE(review): ``mishnah_line += 1`` references a name never initialized
    in this scope (presumably should be ``self.mishnah_line``), and
    ``mishnah_out_order`` is not defined here either -- this method looks
    broken/abandoned (cf. the commented-out call site in postLinks);
    confirm before use.
    """
    self.maharam_line += 1
    mishnah_line += 1
    pos = 0
    # Walk all perakim to find the entry at position mishnah_line.
    for perek in self.mishnah1_dict:
        for key in mishnah_in_order[perek]:
            pos += 1
            if pos == mishnah_line:
                # Strip the "0:" partial-match marker and expand "a-b" ranges.
                mishnah_in_order[perek][key] = mishnah_in_order[perek][
                    key].replace('0:', '')
                if mishnah_in_order[perek][key].find('-') >= 0:
                    in_order, out_order = mishnah_in_order[perek][
                        key].split('-')
                else:
                    in_order = mishnah_in_order[perek][key]
                    out_order = in_order
                # NOTE(review): in_order/out_order are parsed but the refs
                # below index mishnah_in_order/mishnah_out_order directly.
                in_order = int(in_order)
                out_order = int(out_order)
                masechet_daf_line_start = "Mishnah " + masechet + "." + str(
                    perek) + "." + str(mishnah_in_order[perek][key][0])
                masechet_daf_line_end = "Mishnah " + masechet + "." + str(
                    perek) + "." + str(mishnah_out_order[perek][key][0])
                try:
                    masechet_daf_line = Ref(masechet_daf_line_start).to(
                        Ref(masechet_daf_line_end)).normal()
                except:
                    # Fall back to the start ref if the range is invalid.
                    masechet_daf_line = masechet_daf_line_start
                self.links_to_post.append({
                    "refs": [
                        masechet_daf_line,
                        "Maharam " + masechet + "." +
                        AddressTalmud.toStr("en", daf) + "." + str(self.maharam_line)
                    ],
                    "type": "commentary",
                    "auto": True,
                    "generated_by": "Maharam on " + masechet + " linker",
                })
def find_matches(gemara, tosafot, rashi):
    # what needs to be done is to go through each dict and try to match everything, but check each segment that if it is בא"ד
    # ignore if it has a match and match it to previous segment's match
    # and if no match: link with previous segment (as a range) as if this comment really has no DH which is why it has no match
    """Replace each dict's per-daf comment lists with matched Refs, in place.

    Mutates *gemara*, *tosafot* and *rashi* (and also returns them). A
    comment opening with בא"ד inherits the previous comment's match.
    Hard-coded to Ketubot.
    """
    nones = total = 0  # NOTE(review): counters are never updated below
    for pairs in [(tosafot, "Tosafot on Ketubot"), (gemara, "Ketubot"),
                  (rashi, "Rashi on Ketubot")]:
        orig_dict = dict(pairs[0])  # shallow snapshot later handed to create_ranges
        which_dict = pairs[0]
        which_text = pairs[1]
        for daf in which_dict.keys():
            actual_daf = AddressTalmud.toStr("en", daf)
            base_text = TextChunk(Ref("{} {}".format(which_text, actual_daf)),
                                  lang='he')
            if not base_text.text:
                continue
            comments = which_dict[daf]
            results = match_ref(base_text, comments, lambda x: x.split(),
                                dh_extract_method=dh_extract)
            for i, result_comment in enumerate(
                    zip(results["matches"], comments)):
                result, comment = result_comment
                comment_wout_bold = comment.replace("<b>", "").replace("</b>",
                                                                       "")
                # A בא"ד continuation comment takes the previous comment's match.
                if u"""בא"ד""" in u" ".join(comment_wout_bold.split()[0:3]) \
                        or u"""שם בא"ד""" in u" ".join(comment_wout_bold.split()[0:3]):
                    results["matches"][i] = results["matches"][i - 1]
            which_dict[daf] = results["matches"]
        for daf in which_dict.keys():
            if which_dict[daf] and orig_dict[daf]:
                which_dict[daf] = create_ranges(orig_dict, which_dict,
                                                which_text, daf)
    return gemara, tosafot, rashi
def create_links(sanhedrin_ja, yad_ramah_ja): list_of_links = [] amud_number = 1 match_object = Match(in_order=True, min_ratio=80, guess=False, range=False, can_expand=True) for amud_of_sanhedrin, amud_yad_ramah in zip(sanhedrin_ja, yad_ramah_ja): ref = 'Sanhedrin {}'.format(AddressTalmud.toStr('en', amud_number)) the_first_few_words = take_the_first_few_words_of_each_paragraph( amud_yad_ramah) matches_dict = match_object.match_list(the_first_few_words, amud_of_sanhedrin, ref) for key in matches_dict: for match in matches_dict[key]: if match != 0: # print'Amud: {} comment: {} corresponds to {}'.format(AddressTalmud.toStr('en', amud_number), key, match) print create_link_text(amud_number, match, key) list_of_links.append( create_link_text(amud_number, match, key)) amud_number += 1 return list_of_links
# Script fragment: post Rashba on Bava Batra text, then match dibur hamatchil
# headers back to the base text.  NOTE(review): this chunk is truncated at
# both ends; the leading set_trace() likely belongs to an except block above.
pdb.set_trace()
before_dh = ""
just_added_dh = False
prev_line = line
temp_text = ""
for daf in comm_dict.keys():
    if daf not in dh_dict.keys():
        # invariant violated: every daf with comments must have DH entries
        pdb.set_trace()
    send_text = {
        "versionTitle": "Rashba on Bava Batra",
        "versionSource": "http://www.sefaria.org",
        "language": "en",
        "text": comm_dict[daf],
    }
    post_text("Rashba on Bava Batra." + AddressTalmud.toStr("en", daf),
              send_text)
result = {}
guess = 0
no_guess = 0
for daf in dh_dict.keys():
    text = get_text("Bava Batra." + AddressTalmud.toStr("en", daf))
    try:
        match_obj = Match(in_order=True,
                          min_ratio=70,
                          guess=False,
                          range=True,
                          maxLine=len(text) - 1)
    except:
        # debug trap: Match construction failed (e.g. empty daf text)
        pdb.set_trace()
    dh_arr = []
    # NOTE(review): chunk truncated here — matching loop continues off-screen
# Script fragment: post "Ben Yehoyada on <title>" and link its comments to
# the base tractate via match_ref.  NOTE(review): chunk is truncated mid
# `links.append({` at the end; first line is the tail of a per-perek loop.
text_dict[perek] = convertDictToArray(text_dict[perek], empty="")
links = []
send_text = {
    "text": convertDictToArray(text_dict),
    "versionTitle": "Senlake edition 2019 based on Ben Yehoyada, Jerusalem, 1897",
    "versionSource": "http://beta.nli.org.il/he/books/NNL_ALEPH001933802/NLIl",
    "language": "he"
}
post_text("Ben Yehoyada on {}".format(title), send_text, index_count="on")
for daf, text in text_dict.items():
    # Eduyot is not daf-addressed, so keep the raw key there
    daf = AddressTalmud.toStr("en", daf) if title != "Eduyot" else daf
    try:
        base = TextChunk(Ref("{} {}".format(title, daf)), lang='he')
    except InputError as e:
        print(e)
        continue
    try:
        results = match_ref(base, text, lambda x: x.split(),
                            dh_extract_method=dher)
        for i, ref in enumerate(results["matches"]):
            if ref:
                berakhot = "Ben Yehoyada on {} {}:{}".format(title, daf, i + 1)
                links.append({
# Read Gittin Tosafot transcription files into per-daf comment/DH lists.
# Non-blank lines alternate; odd-count lines carry "dh. comment" text, from
# which the dibur hamatchil is split off before the first period.
book = {}
total = 0
non_match = 0
guess = 0
matched = 0
log = []
dh_dict = {}
tosafot_comments = {}
prev_line = 0
# idiomatic range replaces the old `for j in range(78): i = j + 100`
for i in range(100, 178):
    count = 0
    tosafot_comments[i + 3] = []
    dh_dict[i + 3] = []
    # build the Hebrew filename for daf i+3, stripping gershayim/geresh and
    # spaces so it matches the files on disk
    he_daf = u"גיטין_"
    he_daf += AddressTalmud.toStr("he", i + 3)
    he_daf = he_daf.replace(u"\u05f4", u"")
    he_daf = he_daf.replace(u"׳", u"")
    he_daf = he_daf.replace(" ", "_")
    he_daf = he_daf + ".txt"
    # `with` guarantees the handle is closed even if a line raises
    with open("../Noah-Santacruz-rashiPosting/Tosafot/" + he_daf, 'r') as f:
        for line in f:
            line = line.replace("\n", "")
            something = line.replace(" ", "")
            if len(something) > 0:
                if count % 2 == 1:
                    tosafot_comments[i + 3].append(line)
                    dh = line.split(".")[0]
                    dh_dict[i + 3].append(dh)
                count += 1
# Script fragment: read Avodah Zarah transcription files (from the Tosafot
# directory despite the `rashi_comments` name) into per-daf DH/comment
# lists.  NOTE(review): chunk truncated — the `count` increment and
# f.close() presumably follow off-screen.
comm = {}
book = {}
total = 0
non_match = 0
guess = 0
matched = 0
log = []
dh_dict = {}
rashi_comments = {}
prev_line = 0
for i in range(150): #152
    count = 0
    rashi_comments[i+3] = []
    dh_dict[i+3] = []
    # build the Hebrew filename for daf i+3, stripping gershayim/geresh
    he_daf = u"עבודה זרה_"
    he_daf += AddressTalmud.toStr("he", i+3)
    he_daf = he_daf.replace(u"\u05f4", u"")
    he_daf = he_daf.replace(u"׳", u"")
    he_daf = he_daf.replace(" ", "_")
    he_daf = he_daf + ".txt"
    f = open("../Noah-Santacruz-rashiPosting/Tosafot/"+he_daf, 'r')
    for line in f:
        line = line.replace("\n", "")
        something = line.replace(" ", "")
        if len(something) > 0:
            # non-blank lines alternate: DH (even count), comment (odd)
            if count % 2 == 0:
                dh_dict[i+3].append(line)
            else:
                # normalize "dh. comment" to "dh - comment"
                if line.find(" - ")==-1:
                    line = line.replace(".", " - ", 1)
                rashi_comments[i+3].append(line)
# Script fragment: finish accumulating Gittin comments, sanity-check the
# dicts, then match DHs per daf.  NOTE(review): truncated at both ends; the
# first lines are the tail of the file-parsing loop above.
comm_dict[daf].append(comm)
if hasTags(comm) or hasTags(dh) or hasTags(before_dh):
    # debug trap: unexpected HTML tags survived cleaning
    pdb.set_trace()
if just_added_dh == False:
    # keep dh_dict and comm_dict index-aligned with a placeholder DH
    dh_dict[daf].append("")
just_added_dh = False
before_dh = ""
temp_text = ""
result = {}
guess = 0
no_guess = 0
for daf in dh_dict.keys():
    if len(dh_dict[daf]) != len(comm_dict[daf]):
        # invariant violated: one DH per comment
        pdb.set_trace()
for daf in dh_dict.keys():
    text = get_text("Gittin." + AddressTalmud.toStr("en", daf))
    try:
        match_obj = Match(in_order=True,
                          min_ratio=70,
                          guess=False,
                          range=True,
                          maxLine=len(text) - 1)
    except:
        pdb.set_trace()
    dh_arr = []
    # only non-empty DHs participate in matching
    for i in range(len(dh_dict[daf])):
        if len(dh_dict[daf][i]) > 0:
            dh_arr.append(dh_dict[daf][i])
    result[daf] = match_obj.match_list(dh_arr, text)
    dh_count = 1
    '''
    if len(dh_dict[daf][i]) == 0, then comm_dict[daf][i] gets added to comm_dict[daf][i-1]+"<br>"
    '''
    for i in range(len(comm_dict[daf])):
        if (daf, i) in before_dh_dict:
# Script fragment: read Tosafot-on-Keritot transcription files into per-daf
# DH/comment lists.  NOTE(review): chunk truncated — the `count` increment
# and f.close() presumably follow off-screen.
guess = 0
matched = 0
log = []
dh_dict = {}
rashi_comments = {}
prev_line = 0
title_book = "Keritot"
title_comm = "Tosafot on Keritot"
for i in range(54):
    count = 0
    rashi_comments[i + 3] = []
    dh_dict[i + 3] = []
    # build the Hebrew filename for daf i+3, stripping gershayim/geresh
    he_daf = u"כריתות_"
    he_daf += AddressTalmud.toStr("he", i + 3)
    he_daf = he_daf.replace(u"\u05f4", u"")
    he_daf = he_daf.replace(u"׳", u"")
    he_daf = he_daf.replace(" ", "_")
    he_daf = he_daf + ".txt"
    f = open("../Noah-Santacruz-rashiPosting/Tosafot/" + he_daf, 'r')
    for line in f:
        line = line.replace("\n", "")
        something = line.replace(" ", "")
        if len(something) > 0:
            # non-blank lines alternate: DH (even count), comment (odd)
            if count % 2 == 0:
                dh_dict[i + 3].append(line)
            else:
                # normalize "dh. comment" to "dh - comment"
                if line.find(" - ") == -1:
                    line = line.replace(".", " - ", 1)
                rashi_comments[i + 3].append(line)
# Script fragment: post Yad Ramah on Bava Batra comments, then match DHs per
# perek/daf.  NOTE(review): the chunk opens in the middle of a dict literal
# whose head is off-screen, and the original indentation was lost; the
# nesting of the trailing comment_key loop is reconstructed best-effort.
        "versionSource": "http://www.sefaria.org/",
        "language": "he",
        "text": [comm],
    }
    post_text("Yad Ramah on Bava Batra, Perek " + str(current_perek) +
              ", Comment " + str(comment_key), text)
match_obj = Match(in_order=True, min_ratio=80, guess=False, range=True)
skipped_arr = []
result = {}
for current_perek in range(10):
    current_perek += 1  # perakim are 1-based
    print current_perek
    search_for = 0
    for daf in sorted(daf_dict[current_perek].keys()):
        print daf
        text = get_text("Bava Batra." + AddressTalmud.toStr("en", daf))
        dh_list = daf_dict[current_perek][daf]
        result[daf] = match_obj.match_list(
            dh_list, text, "Bava Batra " + AddressTalmud.toStr("en", daf))
        print result[daf]
        for key in result[daf]:
            # strip the "0:" prefix the matcher prepends to line numbers
            if result[daf][key].find("0:") >= 0:
                result[daf][key] = result[daf][key].replace("0:", "")
            search_for += 1
            line_n = result[daf][key]
    count = 0
    for comment_key in comments_order[current_perek]:
        count += 1
        if comment_key not in comm_dict[current_perek]:
            if comment_key not in skipped_arr:
                search_for += 1
                skipped_arr.append(comment_key)
# Read Bava Metzia Tosafot transcription files into per-daf comment/DH
# lists.  Non-blank lines alternate; odd-count lines carry "dh. comment"
# text, from which the dibur hamatchil is split off before the first period.
book = {}
total = 0
non_match = 0
guess = 0
matched = 0
log = []
dh_dict = {}
tosafot_comments = {}
prev_line = 0
# idiomatic range replaces the old `for j in range(24): i = j + 210`
for i in range(210, 234):  # 234
    count = 0
    tosafot_comments[i + 3] = []
    dh_dict[i + 3] = []
    # build the Hebrew filename for daf i+3, stripping gershayim/geresh and
    # spaces so it matches the files on disk
    he_daf = u"בבא מציעא_"
    he_daf += AddressTalmud.toStr("he", i + 3)
    he_daf = he_daf.replace(u"\u05f4", u"")
    he_daf = he_daf.replace(u"׳", u"")
    he_daf = he_daf.replace(" ", "_")
    he_daf = he_daf + ".txt"
    # `with` guarantees the handle is closed even if a line raises
    with open("../Noah-Santacruz-rashiPosting/Tosafot/" + he_daf, "r") as f:
        for line in f:
            line = line.replace("\n", "")
            something = line.replace(" ", "")
            if len(something) > 0:
                if count % 2 == 1:
                    tosafot_comments[i + 3].append(line)
                    dh = line.split(".")[0]
                    dh_dict[i + 3].append(dh)
                count += 1
# Read Rashi-on-Keritot transcription files into per-daf DH/comment lists.
# Non-blank lines alternate: DH line (even count), comment line (odd count).
guess = 0
matched = 0
log = []
dh_dict = {}
rashi_comments = {}
prev_line = 0
title_book = "Keritot"
title_comm = "Rashi on Keritot"
for i in range(54):
    count = 0
    rashi_comments[i + 3] = []
    dh_dict[i + 3] = []
    # build the Hebrew filename for daf i+3, stripping gershayim/geresh and
    # spaces so it matches the files on disk
    he_daf = u"כריתות_"
    he_daf += AddressTalmud.toStr("he", i + 3)
    he_daf = he_daf.replace(u"\u05f4", u"")
    he_daf = he_daf.replace(u"׳", u"")
    he_daf = he_daf.replace(" ", "_")
    he_daf = he_daf + ".txt"
    # `with` guarantees the handle is closed even if a line raises
    with open("../Noah-Santacruz-rashiPosting/Rashi/" + he_daf, "r") as f:
        for line in f:
            line = line.replace("\n", "")
            something = line.replace(" ", "")
            if len(something) > 0:
                if count % 2 == 0:
                    dh_dict[i + 3].append(line)
                else:
                    rashi_comments[i + 3].append(line)
                count += 1
# Script fragment: read Tosafot-on-Avodah-Zarah transcription files into
# per-daf DH/comment lists.
matched = 0
log = []
dh_dict = {}
rashi_comments = {}
prev_line = 0
title_book = "Avodah Zarah"
title_comm = "Tosafot on Avodah Zarah"
for j in range(2):
    # BUG: incomplete expression — the offset after "j +" is missing, which
    # is a SyntaxError as written.  Sibling scripts use e.g. `i = j + 210`;
    # TODO restore the intended daf offset from the original file.
    i = j+
    count = 0
    rashi_comments[i+3] = []
    dh_dict[i+3] = []
    # build the Hebrew filename for daf i+3, stripping gershayim/geresh
    he_daf = u"עבודה_זרה_"
    he_daf += AddressTalmud.toStr("he", i+3)
    he_daf = he_daf.replace(u"\u05f4", u"")
    he_daf = he_daf.replace(u"׳", u"")
    he_daf = he_daf.replace(" ", "_")
    he_daf = he_daf + ".txt"
    f = open("../Noah-Santacruz-rashiPosting/Tosafot/"+he_daf, 'r')
    for line in f:
        line = line.replace("\n", "")
        something = line.replace(" ", "")
        if len(something) > 0:
            # non-blank lines alternate: DH (even count), comment (odd)
            if count % 2 == 0:
                dh_dict[i+3].append(line)
            else:
                rashi_comments[i+3].append(line)
            count+=1
    f.close()
# Script fragment: post Maharam Shif per daf (flushing the index on the last
# daf via param="on"), then match DHs per category against the relevant base
# text.  NOTE(review): truncated at both ends; the first print is the tail
# of a validation branch above.
        print 'line did not start with 11'
match_obj = Match(in_order=False, min_ratio=80, guess=False, range=False)
last_daf = max(comm_dict.keys())
param = "off"
for daf in comm_dict:
    if daf == last_daf:
        # trigger index-count recalculation only on the final post
        param = "on"
    send_text = {
        "versionTitle": "Maharam Shif on " + masechet,
        "versionSource": "http://www.sefaria.org",
        "language": "he",
        "text": comm_dict[daf],
    }
    post_text("Maharam Shif on " + masechet + "." +
              AddressTalmud.toStr("en", daf), send_text, param)
for category in categories:
    # map the category label to the base text it should be matched against
    if category == 'paragraph':
        continue
    elif category == 'gemara':
        title = masechet
    elif category == 'rashi':
        title = "Rashi on " + masechet
    elif category == 'tosafot':
        title = "Tosafot on " + masechet
    for daf in dh_dict[category]:
        dh_arr = dh_dict[category][daf]
        text = compileCommentaryIntoPage(title, daf)
        # NOTE(review): chunk truncated here — matching continues off-screen
# Post Shita Mekubetzet text and link each comment to its base-text line.
# NOTE(review): the leading set_trace() likely belongs to an except block
# just above this chunk; kept as-is.
pdb.set_trace()
last_daf = max(comm_dict.keys())
param = "off"
text_to_post = convertDictToArray(comm_dict)
send_text = {
    "versionTitle": "Shita Mekubetzet on " + masechet,
    "versionSource": "http://www.sefaria.org",
    "language": "he",
    "text": text_to_post,
}
post_text("Shita Mekubetzet on " + masechet, send_text, "on")
links_to_post = []
# hoisted out of the loop: the matcher is configured identically for every
# daf (siblings in this codebase construct Match once and reuse it)
match_obj = Match(in_order=True, min_ratio=85, guess=False, range=True)
for daf in dh_dict:
    text = get_text(masechet + "." + AddressTalmud.toStr("en", daf))
    dh_arr = dh_dict[daf]
    result = match_obj.match_list(
        dh_arr, text, masechet + " " + AddressTalmud.toStr("en", daf))
    for key in result:
        line_n = result[key]
        # strip the "0:" prefix the matcher prepends to line numbers
        line_n = line_n.replace("0:", "")
        links_to_post.append({
            "refs": [
                masechet + "." + AddressTalmud.toStr("en", daf) + "." + line_n,
                "Shita Mekubetzet on " + masechet + "." +
                AddressTalmud.toStr("en", daf) + "." + str(key)
            ],
            "type": "commentary",
            "auto": True,
            "generated_by": "Shita on " + masechet + " linker",
        })
# Script fragment: finish accumulating Niddah comments, validate, then match
# DHs per daf.  NOTE(review): truncated at both ends — ends inside an
# opening ''' whose closing quotes are off-screen.
    comm_dict[daf] = []
comm_dict[daf].append(comm)
if just_added_dh == False:
    # keep dh_dict and comm_dict index-aligned with a placeholder DH
    dh_dict[daf].append("")
if hasTags(comm) or hasTags(dh) or hasTags(before_dh):
    # debug trap: unexpected HTML tags survived cleaning
    pdb.set_trace()
just_added_dh = False
prev_line = line
result = {}
guess = 0
no_guess = 0
for daf in dh_dict.keys():
    if len(dh_dict[daf]) != len(comm_dict[daf]):
        # invariant violated: one DH per comment
        pdb.set_trace()
for daf in dh_dict.keys():
    text = get_text("Niddah." + AddressTalmud.toStr("en", daf))
    try:
        match_obj = Match(in_order=True,
                          min_ratio=70,
                          guess=False,
                          range=True,
                          maxLine=len(text) - 1)
    except:
        pdb.set_trace()
    dh_arr = []
    # only non-empty DHs participate in matching
    for i in range(len(dh_dict[daf])):
        if len(dh_dict[daf][i]) > 0:
            dh_arr.append(dh_dict[daf][i])
    result[daf] = match_obj.match_list(dh_arr, text)
    dh_count = 1
    '''
def postLinks(self, masechet): def base_tokenizer(str): str = re.sub(ur"\([^\(\)]+\)", u"", str) word_list = re.split(ur"\s+", str) word_list = [w for w in word_list if w] # remove empty strings return word_list rosh_results = [] perek_key = {} for perek in sorted(self.dh_by_perek.keys()): tuples = filter(lambda x: x[0] is 'rosh', self.dh_by_perek[perek]) if len(tuples) > 0: cats, dhs, dappim = zip(*tuples) #for each daf and dh pair, that's the key to get the perek for daf, dh in zip(list(dappim), list(dhs)): perek_key[(daf, dh)] = perek base = Ref("Rosh on {} {}".format(masechet, perek)).text('he') assert len(base.text) > 0 these_results = match_ref( base, list(dhs), base_tokenizer, dh_extract_method=self.dh_extract_method, verbose=False, with_num_abbrevs=False)['matches'] assert len(tuples) is len(these_results) rosh_results.append(these_results) results = {} comments = {} for daf in sorted(self.dh1_dict.keys()): comments[daf] = {} results[daf] = {} for each_cat in self.categories: if each_cat == 'rosh': continue comments[daf][each_cat] = self.dh_by_cat[each_cat][daf] for each_type in comments[daf]: if each_type == 'rosh': continue results[daf][each_type] = [] if len(comments[daf][each_type]) > 0: base = self.getTC(each_type, daf, masechet) if len(base.text) == 0: self.comm_wout_base.write("{} {}: {}\n".format( masechet, daf, each_type)) base = self.getTC(each_type, daf - 1, masechet) combined_comments = comments[ daf - 1][each_type] + comments[daf][each_type] if len(base.text) == 0: print "Problem in {}".format( AddressTalmud.toStr("en", daf)) else: results[daf - 1][each_type] = match_ref( base, combined_comments, base_tokenizer, dh_extract_method=self.dh_extract_method, verbose=False, with_num_abbrevs=False) results[daf - 1][each_type] = self.convertToOldFormat( results[daf - 1][each_type]) self.dh1_dict[daf] = [ x for x in self.dh1_dict[daf] if x[0] != each_type ] else: results[daf][each_type] = match_ref( base, comments[daf][each_type], base_tokenizer, 
dh_extract_method=self.dh_extract_method, verbose=False, with_num_abbrevs=False) results[daf][each_type] = self.convertToOldFormat( results[daf][each_type]) prev_perek = 0 for daf in sorted(self.dh1_dict.keys()): self.maharam_line = 0 self.which_line = { "rashi": -1, "tosafot": -1, "rosh": -1, "ran": -1, "gemara": -1, "rashbam": -1 } for category, dh in self.dh1_dict[daf]: if category == 'gemara': self.Gemara(daf, results[daf]) elif category == 'rosh': perek = perek_key[(daf, dh)] if perek > prev_perek: self.rosh_line = -1 prev_perek = perek self.Rosh(perek, daf, dh, rosh_results) else: self.Commentary(daf, category, results[daf]) post_link(self.links_to_post, server=self.server) self.comm_wout_base.close()
# Script fragment: variant of the Niddah accumulation/matching script.
# NOTE(review): truncated at both ends — ends mid-`if`.
    comm_dict[daf] = []
comm_dict[daf].append(comm)
if just_added_dh == False:
    # keep dh_dict and comm_dict index-aligned with a placeholder DH
    dh_dict[daf].append("")
if hasTags(comm) or hasTags(dh) or hasTags(before_dh):
    # debug trap: unexpected HTML tags survived cleaning
    pdb.set_trace()
just_added_dh = False
prev_line = line
result = {}
guess = 0
no_guess = 0
for daf in dh_dict.keys():
    if len(dh_dict[daf]) != len(comm_dict[daf]):
        # invariant violated: one DH per comment
        pdb.set_trace()
for daf in dh_dict.keys():
    text = get_text("Niddah." + AddressTalmud.toStr("en", daf))
    try:
        match_obj = Match(in_order=True,
                          min_ratio=70,
                          guess=False,
                          range=True,
                          maxLine=len(text) - 1)
    except:
        pdb.set_trace()
    dh_arr = []
    # only non-empty DHs participate in matching
    for i in range(len(dh_dict[daf])):
        if len(dh_dict[daf][i]) > 0:
            dh_arr.append(dh_dict[daf][i])
    result[daf] = match_obj.match_list(dh_arr, text)
    dh_count = 1
    '''
    if len(dh_dict[daf][i]) == 0, then comm_dict[daf][i] gets added to comm_dict[daf][i-1]+"<br>"
    '''
    for i in range(len(comm_dict[daf])):
        if (daf, i) in before_dh_dict:
# Script fragment: line-classifying state machine for parsing Zohar volume
# files into a per-volume/per-daf structure, writing parsha boundary refs to
# per-parsha files.  NOTE(review): truncated at both ends; indentation of
# the opening lines reconstructed best-effort.
            zohar_struct[vol_num].append([])
    first_line = True
    vol = open(vol_file, 'r')
    for line in vol:
        stray_tag = False
        blank_line = False
        no_spaces = line.replace(" ", "")
        no_return = no_spaces.replace("\n", "")
        if len(no_return) == 0:
            blank_line = True
        # a one-word line that is only a bold tag is formatting noise
        if len(line.split(' ')) == 1 and (line.find('<b>') >= 0
                                          or line.find('</b>') >= 0):
            stray_tag = True
        if first_line == True:
            first_line = False
            if curr_parsha_file != "":
                # close out the previous parsha: write its end ref
                curr_parsha_file.write('\n' + str(prev_vol + 1) + ":" +
                                       AddressTalmud.toStr("en", prev_daf + 1) +
                                       ":" + str(prev_para))
                curr_parsha_file.close()
            if os.path.exists(english_parshiot[curr_parsha]) == True:
                os.remove(english_parshiot[curr_parsha])
            curr_parsha_file = open(english_parshiot[curr_parsha], 'a')
            # write the new parsha's start ref (vol:daf:para)
            curr_parsha_file.write(str(vol_num + 1) + ":" +
                                   AddressTalmud.toStr("en", daf_count + 2) +
                                   ":1")
            curr_parsha += 1
        elif blank_line == False and stray_tag == False:
            prev_prev_line = prev_line
            prev_line = current_line
            new_daf = line.find('דף')
            new_parsha = line.find('h1')  # all parsha titles are surrounded by <h1> tags
            if new_daf >= 0 and len(line.split(' ')) < 6:
                current_line = "daf"
                daf_count += 1
                zohar_struct[vol_num].append([])