def process_book(self, mishnah_title):
    """Align the Mishnah of one tractate with the matching Talmud volume.

    The Talmud volume is read line by line.  A line that matches
    ``self.matni_re`` -- or the first line read after a chapter start -- is
    treated as the possible beginning of a Mishnah passage.  When that line
    fuzzy-matches the current Mishnah, the passage is followed until
    ``self.gemarah_re`` matches or a Hadran line appears, and the tail of the
    passage is compared against the mishnayot left in the chapter to decide
    where the passage ends.

    Side effects:
      * progress and diagnostics are written to ``self.log``; errors are
        also written to ``self.error_log`` and printed;
      * every successful alignment is written to ``self.csv_writer`` and
        counted in ``self.matched_count``;
      * when a Hadran line is reached before the chapter's mishnayot are
        used up, each remaining mishnah is written as a short row and
        counted in ``self.unmatched_count``.

    :param mishnah_title: Title of the Mishnah tractate.  Spaces are replaced
        by underscores to build the Mishnah volume; the Talmud volume is
        built from ``mishnah_title[8:]`` (presumably dropping a leading
        ``"Mishnah "`` prefix -- confirm against callers).
    :return: None
    """
    # Marker that closes a chapter in the Talmud text ("Hadran alakh").
    hadran = u'\u05d4\u05d3\u05e8\u05df \u05e2\u05dc\u05da'

    bavli = TalmudVolume(re.sub(" ", "_", mishnah_title[8:]))
    mishnah = MishnahVolume(re.sub(" ", "_", mishnah_title))
    perek_start = True
    mishnayot_end = False

    while bavli.has_more():
        # Once the chapter's mishnayot are exhausted there is nothing left to
        # compare against, so an empty string is used instead.
        current_mishnah = mishnah.get_current_mishnah() if not mishnayot_end else ""
        (starting_daf, starting_line, line) = bavli.get_next_line()
        keyword_match = self.matni_re.match(line)

        if keyword_match or perek_start:
            # Either the Mishnah keyword was found or a chapter just began.
            self.log.write(u"Found Mishnah start at {}:{}\n{}\n".format(
                starting_daf, starting_line, line))
            ending_daf = starting_daf
            ending_line = starting_line

            if keyword_match:
                trailing_text = keyword_match.group(2)
                if trailing_text:
                    # "Mishna" followed by content: trim off the keyword.
                    line = trailing_text
                else:
                    # Bareword "Mishna": the text starts on the next line.
                    (ending_daf, ending_line, line) = bavli.get_next_line()
            # Otherwise the chapter starts without the keyword and the
            # contents of `line` are used as they are.

            line = self.replace_roshei_tevot(line)

            if fuzz.partial_ratio(line, current_mishnah) > 60:
                self.log.write(u"Matched a starting line in the Mishnah: {}\n{}\n".format(
                    line, current_mishnah))
                (mishnah_line_match_length,
                 mishnah_line_match_threshold,
                 max_lines) = self.get_match_thresholds(bavli.title, mishnah.current_chapter)
                starting_mishnah = mishnah.current_mishnah

                if mishnayot_end:
                    msg = u"Error: Found too many mishnayot in {} {}!\n".format(
                        bavli.title, mishnah.current_chapter)
                    print(msg)
                    self.log.write(msg)
                    self.error_log.write(msg)
                    self.csv_writer.writerow([bavli.title, mishnah.current_chapter,
                                              u"?", u"?", starting_daf, starting_line])

                # Walk forward to the line carrying the 'Gemara' or 'Hadran'
                # marker.  The has_more() guard mirrors the outer loop so a
                # volume that ends without either marker does not read past
                # its last line.
                lines_in_match = 1
                while (bavli.has_more()
                       and not self.gemarah_re.search(line)
                       and hadran not in line):
                    (_daf, _line_number, line) = bavli.get_next_line()
                    lines_in_match += 1

                lines_to_get = min(max_lines, lines_in_match)
                (ending_daf, ending_line, last_bavli_segment) = \
                    bavli.get_previous_lines(lines_to_get)
                last_bavli_segment = last_bavli_segment.strip()[-mishnah_line_match_length:]
                # Open up Roshei Teivot
                last_bavli_segment = self.replace_roshei_tevot(last_bavli_segment)

                ending_mishnah = None
                for i in range(mishnah.number_left_in_chapter() + 1):
                    mishnah_text = mishnah.get_next_mishnah(i)
                    assert len(last_bavli_segment) < len(mishnah_text)
                    (ratio, offset_start, offset_ending) = fuzz.partial_with_place(
                        mishnah_text, last_bavli_segment)

                    if ratio < mishnah_line_match_threshold:
                        mesg = u"Failed to match last Talmud line to Mishnah: \n{}\n{}\n\n".format(
                            last_bavli_segment, mishnah_text)
                        self.log.write(mesg)
                        continue

                    self.log.write(u"Succeeded to match last Talmud line to Mishanh: \n{}\n{}\n\n".format(
                        last_bavli_segment, mishnah_text))
                    ending_mishnah = mishnah.current_mishnah + i

                    if offset_ending < len(mishnah_text) - self.end_of_mishnah_fudge_character_length:
                        # Match ended in the middle of a mishnah.  The fudge
                        # length is the close-enough-to-end tolerance.
                        mishnah.advance_pointer(mishnah.current_chapter, ending_mishnah,
                                                offset_ending + 1)
                        self.log.write(
                            u"Internal match {} in {}\n - advanced Mishnah offset: {}, {}, {}\n".format(
                                last_bavli_segment, mishnah_text, mishnah.current_chapter,
                                ending_mishnah, offset_ending + 1))
                    elif i == mishnah.number_left_in_chapter():
                        # Match ended at the end of the chapter's last mishnah.
                        self.log.write(u"Reached end of mishnayot in chapter {}.\n".format(
                            str(mishnah.current_chapter)))
                        mishnayot_end = True
                    else:
                        # Match ended at the end of a mishnah: move to the next.
                        mishnah.advance_pointer(mishnah.current_chapter, ending_mishnah + 1)
                        self.log.write(u"Advanced to next Mishnah: {}, {}\n".format(
                            mishnah.current_chapter, ending_mishnah + 1))
                    break

                match = [bavli.title, mishnah.current_chapter, starting_mishnah, ending_mishnah,
                         starting_daf, starting_line, ending_daf, ending_line]
                summary = ", ".join([str(field) for field in match])
                if ending_mishnah is None:
                    self.log.write(u"saw unmatched Mishna in Talmud: {}\n".format(summary))
                else:
                    self.csv_writer.writerow(match)
                    self.matched_count += 1
                    self.log.write(u"Match! {}\n".format(summary))
            else:
                # Trailing newline added so the next log entry starts on its
                # own line, like every other message written here.
                self.log.write(u"Talmud Mishna start: {}\n - did not match next Mishna: {}\n".format(
                    line, current_mishnah))

            # The chapter-start allowance applies to a single line only.
            perek_start = False

        # Check for Hadran
        if hadran in line:
            if not mishnayot_end:
                msg = u'Error: Mishna did not reach the end of chapter! ("{}", {}).\t{} remain.'.format(
                    bavli.title, mishnah.current_chapter, mishnah.number_left_in_chapter() + 1)
                print(msg)
                self.log.write(msg + u"\n")
                self.error_log.write(msg + u"\n")
                for n in mishnah.remaining_mishnah_numbers():
                    self.unmatched_count += 1
                    self.csv_writer.writerow([bavli.title, mishnah.current_chapter, n])

            position = bavli.get_current_line()
            self.log.write(u"End of perek: {} {} on {} {}\n".format(
                bavli.title, mishnah.current_chapter, position[0], position[1]))
            try:
                # Advance to next chapter, reset indicators
                next_chapter = self.get_next_bavli_chapter(bavli.title, mishnah.current_chapter)
                mishnah.advance_pointer(next_chapter)
                perek_start = True
                mishnayot_end = False
            except PointerException:
                position = bavli.get_current_line()
                self.log.write(u"End of book: {} {} on {} {}\n".format(
                    bavli.title, mishnah.current_chapter, position[0], position[1]))
                break
def process_book(self, mishnah_title):
    """Align the Mishnah of one tractate with the matching Talmud volume.

    The Talmud volume is read line by line.  A line that matches
    ``self.matni_re`` -- or the first line read after a chapter start -- is
    treated as the possible beginning of a Mishnah passage.  When that line
    fuzzy-matches the current Mishnah, the passage is followed until
    ``self.gemarah_re`` matches or a Hadran line appears, and the tail of the
    passage is compared against the mishnayot left in the chapter to decide
    where the passage ends.

    Side effects:
      * progress and diagnostics are written to ``self.log``; errors are
        also written to ``self.error_log`` and printed;
      * every successful alignment is written to ``self.csv_writer`` and
        counted in ``self.matched_count``;
      * when a Hadran line is reached before the chapter's mishnayot are
        used up, each remaining mishnah is written as a short row and
        counted in ``self.unmatched_count``.

    :param mishnah_title: Title of the Mishnah tractate.  Spaces are replaced
        by underscores to build the Mishnah volume; the Talmud volume is
        built from ``mishnah_title[8:]`` (presumably dropping a leading
        ``"Mishnah "`` prefix -- confirm against callers).
    :return: None
    """
    # Marker that closes a chapter in the Talmud text ("Hadran alakh").
    hadran = '\u05d4\u05d3\u05e8\u05df \u05e2\u05dc\u05da'

    bavli = TalmudVolume(re.sub(" ", "_", mishnah_title[8:]))
    mishnah = MishnahVolume(re.sub(" ", "_", mishnah_title))
    perek_start = True
    mishnayot_end = False

    while bavli.has_more():
        # Once the chapter's mishnayot are exhausted there is nothing left to
        # compare against, so an empty string is used instead.
        current_mishnah = "" if mishnayot_end else mishnah.get_current_mishnah()
        (starting_daf, starting_line, line) = bavli.get_next_line()
        keyword_match = self.matni_re.match(line)

        if keyword_match or perek_start:
            # Either the Mishnah keyword was found or a chapter just began.
            self.log.write("Found Mishnah start at {}:{}\n{}\n".format(
                starting_daf, starting_line, line))
            ending_daf = starting_daf
            ending_line = starting_line

            if keyword_match:
                trailing_text = keyword_match.group(2)
                if trailing_text:
                    # "Mishna" followed by content: trim off the keyword.
                    line = trailing_text
                else:
                    # Bareword "Mishna": the text starts on the next line.
                    (ending_daf, ending_line, line) = bavli.get_next_line()
            # Otherwise the chapter starts without the keyword and the
            # contents of `line` are used as they are.

            line = self.replace_roshei_tevot(line)

            if fuzz.partial_ratio(line, current_mishnah) > 60:
                self.log.write("Matched a starting line in the Mishnah: {}\n{}\n".format(
                    line, current_mishnah))
                (mishnah_line_match_length,
                 mishnah_line_match_threshold,
                 max_lines) = self.get_match_thresholds(bavli.title, mishnah.current_chapter)
                starting_mishnah = mishnah.current_mishnah

                if mishnayot_end:
                    msg = "Error: Found too many mishnayot in {} {}!\n".format(
                        bavli.title, mishnah.current_chapter)
                    print(msg)
                    self.log.write(msg)
                    self.error_log.write(msg)
                    self.csv_writer.writerow([
                        bavli.title, mishnah.current_chapter, "?", "?",
                        starting_daf, starting_line
                    ])

                # Walk forward to the line carrying the 'Gemara' or 'Hadran'
                # marker.  The has_more() guard mirrors the outer loop so a
                # volume that ends without either marker does not read past
                # its last line.
                lines_in_match = 1
                while (bavli.has_more()
                       and not self.gemarah_re.search(line)
                       and hadran not in line):
                    (_daf, _line_number, line) = bavli.get_next_line()
                    lines_in_match += 1

                lines_to_get = min(max_lines, lines_in_match)
                (ending_daf, ending_line, last_bavli_segment) = \
                    bavli.get_previous_lines(lines_to_get)
                last_bavli_segment = last_bavli_segment.strip()[-mishnah_line_match_length:]
                # Open up Roshei Teivot
                last_bavli_segment = self.replace_roshei_tevot(last_bavli_segment)

                ending_mishnah = None
                for i in range(mishnah.number_left_in_chapter() + 1):
                    mishnah_text = mishnah.get_next_mishnah(i)
                    assert len(last_bavli_segment) < len(mishnah_text)
                    (ratio, offset_start, offset_ending) = fuzz.partial_with_place(
                        mishnah_text, last_bavli_segment)

                    if ratio < mishnah_line_match_threshold:
                        mesg = "Failed to match last Talmud line to Mishnah: \n{}\n{}\n\n".format(
                            last_bavli_segment, mishnah_text)
                        self.log.write(mesg)
                        continue

                    self.log.write(
                        "Succeeded to match last Talmud line to Mishanh: \n{}\n{}\n\n".format(
                            last_bavli_segment, mishnah_text))
                    ending_mishnah = mishnah.current_mishnah + i

                    if offset_ending < len(mishnah_text) - self.end_of_mishnah_fudge_character_length:
                        # Match ended in the middle of a mishnah.  The fudge
                        # length is the close-enough-to-end tolerance.
                        mishnah.advance_pointer(mishnah.current_chapter, ending_mishnah,
                                                offset_ending + 1)
                        self.log.write(
                            "Internal match {} in {}\n - advanced Mishnah offset: {}, {}, {}\n".format(
                                last_bavli_segment, mishnah_text, mishnah.current_chapter,
                                ending_mishnah, offset_ending + 1))
                    elif i == mishnah.number_left_in_chapter():
                        # Match ended at the end of the chapter's last mishnah.
                        self.log.write("Reached end of mishnayot in chapter {}.\n".format(
                            str(mishnah.current_chapter)))
                        mishnayot_end = True
                    else:
                        # Match ended at the end of a mishnah: move to the next.
                        mishnah.advance_pointer(mishnah.current_chapter, ending_mishnah + 1)
                        self.log.write("Advanced to next Mishnah: {}, {}\n".format(
                            mishnah.current_chapter, ending_mishnah + 1))
                    break

                match = [
                    bavli.title, mishnah.current_chapter, starting_mishnah, ending_mishnah,
                    starting_daf, starting_line, ending_daf, ending_line
                ]
                summary = ", ".join([str(field) for field in match])
                if ending_mishnah is None:
                    self.log.write("saw unmatched Mishna in Talmud: {}\n".format(summary))
                else:
                    self.csv_writer.writerow(match)
                    self.matched_count += 1
                    self.log.write("Match! {}\n".format(summary))
            else:
                # Trailing newline added so the next log entry starts on its
                # own line, like every other message written here.
                self.log.write("Talmud Mishna start: {}\n - did not match next Mishna: {}\n".format(
                    line, current_mishnah))

            # The chapter-start allowance applies to a single line only.
            perek_start = False

        # Check for Hadran
        if hadran in line:
            if not mishnayot_end:
                msg = 'Error: Mishna did not reach the end of chapter! ("{}", {}).\t{} remain.'.format(
                    bavli.title, mishnah.current_chapter, mishnah.number_left_in_chapter() + 1)
                print(msg)
                self.log.write(msg + "\n")
                self.error_log.write(msg + "\n")
                for n in mishnah.remaining_mishnah_numbers():
                    self.unmatched_count += 1
                    self.csv_writer.writerow([bavli.title, mishnah.current_chapter, n])

            position = bavli.get_current_line()
            self.log.write("End of perek: {} {} on {} {}\n".format(
                bavli.title, mishnah.current_chapter, position[0], position[1]))
            try:
                # Advance to next chapter, reset indicators
                next_chapter = self.get_next_bavli_chapter(bavli.title, mishnah.current_chapter)
                mishnah.advance_pointer(next_chapter)
                perek_start = True
                mishnayot_end = False
            except PointerException:
                position = bavli.get_current_line()
                self.log.write("End of book: {} {} on {} {}\n".format(
                    bavli.title, mishnah.current_chapter, position[0], position[1]))
                break
def process_book(bavli, mishnah, csv_writer):
    """Align the mishnayot of one tractate with their place in the Talmud.

    The Talmud volume is read line by line.  A line that matches the
    module-level ``matni_re`` -- or the first line read after a chapter start
    -- is treated as the possible beginning of a Mishnah passage.  When it
    fuzzy-matches the current Mishnah, the passage is followed up to the
    ``gemarah_re`` marker (or a Hadran line), and the last two lines before
    that point are compared against the mishnayot left in the chapter to
    decide where the passage ends.

    Diagnostics go to the module-level ``log`` and ``error_log``; matches and
    errors are also printed.

    :param bavli: Talmud volume; must provide ``has_more``, ``get_next_line``,
        ``get_previous_line``, ``get_current_line`` and ``title``.
    :param mishnah: Mishnah volume; must provide ``get_current_mishnah``,
        ``get_next_mishnah``, ``number_left_in_chapter``, ``advance_pointer``,
        ``get_current_chapter_text``, ``current_mishnah``, ``current_chapter``
        and ``title``.
    :param csv_writer: Object whose ``writerow`` receives one row per match:
        title, chapter, starting mishnah, ending mishnah, starting daf,
        starting line, ending daf, ending line.
    :return: None
    """
    # Marker that closes a chapter in the Talmud text ("Hadran alakh").
    hadran = u'\u05d4\u05d3\u05e8\u05df \u05e2\u05dc\u05da'

    perek_start = True
    mishnayot_end = False

    while bavli.has_more():
        # Once the chapter's mishnayot are exhausted there is nothing left to
        # compare against, so an empty string is used instead.
        current_mishnah = mishnah.get_current_mishnah() if not mishnayot_end else ""
        (starting_daf, starting_line, line) = bavli.get_next_line()
        keyword_match = matni_re.match(line)

        if keyword_match or perek_start:
            # Either the Mishnah keyword was found or a chapter just began.
            log.write(u"Found Mishnah start at {}:{}\n{}\n".format(
                starting_daf, starting_line, line))

            if perek_start or len(keyword_match.group(2)) > 6:
                # Enough text follows the keyword (or a chapter just began):
                # the passage starts on this very line.
                # NOTE(review): at a chapter start a bare keyword leaves
                # `line` as the short remainder -- confirm this is intended.
                ending_daf = starting_daf
                ending_line = starting_line
                if keyword_match:
                    line = keyword_match.group(2)
            else:
                # Keyword with (almost) nothing after it: text is on the next line.
                (ending_daf, ending_line, line) = bavli.get_next_line()

            if fuzz.partial_ratio(line, current_mishnah) > 60:
                log.write(u"Matched a starting line in the Mishnah: {}\n{}\n".format(
                    line, current_mishnah))
                starting_mishnah = mishnah.current_mishnah

                if mishnayot_end:
                    msg = u"Error: Found too many mishnayot in {} {}!\n".format(
                        bavli.title, mishnah.current_chapter)
                    print(msg)
                    log.write(msg)
                    error_log.write(msg)

                # Walk forward to the Gemara marker.  Also stop on a Hadran
                # line or at the end of the volume: a Mishnah followed
                # directly by the end of its chapter would otherwise be
                # scanned across the chapter boundary and the Hadran check
                # below would never see that line.
                while (bavli.has_more()
                       and not gemarah_re.search(line)
                       and hadran not in line):
                    (_daf, _line_number, line) = bavli.get_next_line()

                (ending_daf, ending_line, previous_line) = bavli.get_previous_line()
                (_daf, _line_number, previous_previous_line) = bavli.get_previous_line(2)
                last_bavli_segment = previous_previous_line.strip() + u" " + previous_line.strip()
                # Compare on the tail only; the slice also drops the final character.
                last_bavli_segment = last_bavli_segment[-30:-1]

                ending_mishnah = None
                for i in range(mishnah.number_left_in_chapter() + 1):
                    mishnah_text = mishnah.get_next_mishnah(i)
                    assert len(last_bavli_segment) < len(mishnah_text)
                    (ratio, offset_start, offset_ending) = fuzz.partial_with_place(
                        mishnah_text, last_bavli_segment)

                    if ratio < 60:
                        failure = u"Failed to match last Talmud line to Mishnah: {}\n{}\n".format(
                            last_bavli_segment, mishnah_text)
                        log.write(failure)
                        error_log.write(failure)
                        continue

                    log.write(u"Succeeded to match last Talmud line to Mishanh: {}\n{}\n".format(
                        last_bavli_segment, mishnah_text))
                    ending_mishnah = mishnah.current_mishnah + i

                    if offset_ending < len(mishnah_text) - 10:
                        # Match ended in the middle of a mishnah.  The number
                        # is the close-enough-to-end fudge factor.
                        mishnah.advance_pointer(mishnah.current_chapter, ending_mishnah,
                                                offset_ending + 1)
                        log.write(u"Advanced Mishnah offset: {}, {}, {}\n".format(
                            mishnah.current_chapter, ending_mishnah, offset_ending + 1))
                    elif i == mishnah.number_left_in_chapter():
                        # Match ended at the end of the chapter's last mishnah.
                        log.write(u"Reached end of mishnayot in chapter {} is {}\n".format(
                            str(mishnah.current_chapter),
                            str(len(mishnah.get_current_chapter_text()))))
                        mishnayot_end = True
                    else:
                        # Match ended at the end of a mishnah: move to the next.
                        mishnah.advance_pointer(mishnah.current_chapter, ending_mishnah + 1)
                        log.write(u"Advanced to next Mishnah: {}, {}\n".format(
                            mishnah.current_chapter, ending_mishnah + 1))
                    break

                match = [bavli.title, mishnah.current_chapter, starting_mishnah, ending_mishnah,
                         starting_daf, starting_line, ending_daf, ending_line]
                summary = ", ".join([str(field) for field in match])
                if ending_mishnah is None:
                    msg = u"saw unmatched Mishna in Talmud: {}\n".format(summary)
                    print(msg)
                    log.write(msg)
                    error_log.write(msg)
                else:
                    # The row is written without printing writerow()'s return
                    # value, which carried no information.
                    csv_writer.writerow(match)
                    msg = u"Match! {}\n".format(summary)
                    print(msg)
                    log.write(msg)

            # The chapter-start allowance applies to a single line only.
            perek_start = False

        # A Hadran line closes the chapter.
        if hadran in line:
            if not mishnayot_end:
                msg = u"Error: Mishna did not reach the end of chapter! {} {}\n".format(
                    mishnah.title, mishnah.current_chapter)
                print(msg)
                log.write(msg)

            position = bavli.get_current_line()
            log.write(u"End of perek: {} {} on {} {}\n".format(
                bavli.title, mishnah.current_chapter, position[0], position[1]))
            try:
                # Advance to the next chapter and reset the indicators.
                mishnah.advance_pointer(mishnah.current_chapter + 1)
                perek_start = True
                mishnayot_end = False
            except PointerException:
                position = bavli.get_current_line()
                log.write(u"End of book: {} {} on {} {}\n".format(
                    bavli.title, mishnah.current_chapter, position[0], position[1]))
                break