def post_rashi(en_text, he_text, title, server):
    """Post the English and Hebrew Metsudah Rashi texts for Kings.

    The incoming ``title`` is only inspected for "II"; the post target is
    always "Rashi on I Kings" or "Rashi on II Kings".
    """
    title = "Rashi on II Kings" if "II" in title else "Rashi on I Kings"
    for lang, chapters in (("en", en_text), ("he", he_text)):
        # Flatten each chapter dict, then the book-level dict, into arrays.
        for ch_num in chapters.keys():
            chapters[ch_num] = convertDictToArray(chapters[ch_num])
        payload = {
            "text": convertDictToArray(chapters),
            "language": lang,
            "versionTitle": "Metsudah {} -- {}".format(title, lang),
            "versionSource": "http://www.sefaria.org"
        }
        post_text(title, payload, server=server)
def check_all_mishnayot_present_and_post(text, sefer, file_path):
    """Check that every mishnah of ``sefer`` appears in ``text``, then
    restructure commentary/translation into arrays and build post payloads.

    :param text: dict keyed by chapter number (or "Introduction"); each
        chapter maps mishnah number -> (commentary, translation) pair.
        Mutated in place.
    :param sefer: tractate name; looked up as "Mishnah <sefer>".
    :param file_path: source file path, printed when mishnayot are missing.
    """
    def post_(text, path):
        # Build the standard Mishnah Yomit payload; the actual POST is
        # currently disabled (commented out below).
        send_text = {
            "language": "en",
            "text": text,
            "versionTitle": "Mishnah Yomit",
            "versionSource": "http://learn.conservativeyeshiva.org/mishnah/"
        }
        #post_text(path, send_text, server=SERVER)

    #first check that all chapters present
    index = library.get_index("Mishnah " + sefer)
    en_title = "Mishnah Yomit on {}".format(index.title)
    # Shallow copy: `translation` shares the per-chapter dicts with `text`.
    translation = dict(text)
    for ch in text.keys():
        if ch == "Introduction":
            # Handle the introduction separately, then drop it from both dicts.
            post_(text[ch], "{}, Introduction".format(en_title))
            text.pop(ch)
            translation.pop(ch)
            continue
        # Mishnah numbers that actually exist for this chapter on the server.
        actual_mishnayot = [
            el.sections[1] for el in Ref("Mishnah {} {}".format(
                sefer, ch)).all_segment_refs()
        ]
        our_mishnayot = text[ch].keys()
        # NOTE(review): list-vs-list comparison — relies on dict key order
        # matching segment order; a mismatch only triggers the diagnostics.
        if our_mishnayot != actual_mishnayot:
            actual_mishnayot = set(actual_mishnayot)
            our_mishnayot = set(our_mishnayot)
            missing = actual_mishnayot - our_mishnayot
            wrong = our_mishnayot - actual_mishnayot
            print file_path
            print "Sefer: {}, Chapter: {}".format(sefer, ch)
            print "Mishnayot to check: {}".format(list(missing.union(wrong)))
            print
        # Split (commentary, translation) pairs into two parallel lists.
        text[ch] = zip(*convertDictToArray(text[ch], empty=("", "")))
        translation[ch] = list(text[ch][1])
        text[ch] = list(text[ch][0])
        # Placeholder gap entries become empty segments.
        while "" in text[ch]:
            i = text[ch].index("")
            text[ch][i] = []
    text = convertDictToArray(text)
    translation = convertDictToArray(translation)
    post_(text, en_title)
    # Each translation segment is a list of strings; join into one string.
    for ch, chapter in enumerate(translation):
        for m, mishnah in enumerate(chapter):
            translation[ch][m] = " ".join(mishnah)
    post_(translation, index.title)
def parse_boaz(input_file):
    """Parse a Boaz commentary file into a nested array.

    A first pass builds a jagged array; after rewinding, a second pass
    collects the chapter headers and converts them to gematria values that
    key the final structure.
    """
    chapter_pattern = u'@00(?:\u05e4\u05e8\u05e7 |\u05e4")([\u05d0-\u05ea"]{1,3})'
    parsed_ja = file_to_ja([[]], input_file, [chapter_pattern], boaz_align)
    input_file.seek(0)  # rewind for the header-grabbing second pass
    section_names = grab_section_names(chapter_pattern, input_file, 1)
    chapter_numbers = [functions.getGematria(name) for name in section_names]
    structured = simple_to_complex(chapter_numbers, parsed_ja.array())
    return functions.convertDictToArray(structured)
def create_index_and_post(full_intro, full_text):
    """Build the "Haflaah on Ketubot" schema, post its index, then post the
    Hebrew introduction and the main text under the same version metadata."""
    # Schema: an Introduction node plus a default Daf/Paragraph node.
    root = SchemaNode()
    root.add_primary_titles("Haflaah on Ketubot", u"הפלאה על כתובות")
    root.key = "haflaah"

    intro = JaggedArrayNode()
    intro.add_shared_term("Introduction")
    intro.add_structure(["Comment"])
    intro.key = "intro"
    root.append(intro)

    default = JaggedArrayNode()
    default.add_structure(["Daf", "Paragraph"],
                          address_types=["Talmud", "Integer"])
    default.default = True
    default.key = "default"
    root.append(default)
    root.validate()

    post_index({
        "schema": root.serialize(),
        "title": "Haflaah on Ketubot",
        "categories": ["Talmud", "Bavli", "Commentary"],
        "dependence": "Commentary",
        "base_text_titles": ["Ketubot"]
    }, server=SEFARIA_SERVER)

    full_text = convertDictToArray(full_text)
    version_source = "http://aleph.nli.org.il:80/F/?func=direct&doc_number=001880789&local_base=NNL01"
    version_title = "Sefer Hafla'ah, Lemberg, 1860"
    # Introduction first, then the body, both with identical version info.
    for target_ref, body in [("Haflaah on Ketubot, Introduction", full_intro),
                             ("Haflaah on Ketubot", full_text)]:
        post_text(target_ref, {
            "text": body,
            "language": "he",
            "versionSource": version_source,
            "versionTitle": version_title
        }, server=SEFARIA_SERVER)
def parse_boaz(input_file):
    """Parse a Boaz file: the first pass builds the jagged array, the second
    pass (after rewinding the stream) collects section headers whose gematria
    values key the chapters of the final structure."""
    header_re = u'@00(?:\u05e4\u05e8\u05e7 |\u05e4")([\u05d0-\u05ea"]{1,3})'
    ja = file_to_ja([[]], input_file, [header_re], boaz_align)
    # reset file
    input_file.seek(0)
    headers = []
    for raw_name in grab_section_names(header_re, input_file, 1):
        headers.append(functions.getGematria(raw_name))
    return functions.convertDictToArray(
        simple_to_complex(headers, ja.array()))
def parse_text():
    """
    Takes the result of strip_tags() and parses into a level four data structure for easy upload
    :return: Dictionary of books, depth 4.
    """
    # initiate data structure and variables
    full_text, chapters, verses, raw_text = {}, {}, {}, u''
    current_book, current_chapter, current_verse = u'', u'', u''
    to_parse = codecs.open('chizkuni_no-tags.txt', 'r', 'utf-8')
    # NOTE(review): this loop mixes `for line in to_parse` with
    # to_parse.readline() (the line after each tag is consumed via readline).
    # codecs streams tolerate this, but a plain Python 2 file object would
    # raise — confirm the input is always opened through codecs.
    for line in to_parse:
        # if new book add book to full_text.
        if line.find(u'<book>') != -1:
            # if this is the first book, do nothing
            if current_book != u'':
                # set up book and add it to full_text
                verses[current_verse] = process_verse(raw_text)
                chapters[current_chapter] = convertDictToArray(verses)
                full_text[current_book] = convertDictToArray(chapters)
                # reset verses and chapters
                chapters, verses, raw_text = {}, {}, u''
                current_chapter, current_verse = u'', u''
            # save the next book as current_book
            current_book = removeAllStrings([u'\n', u'\r', u' '], to_parse.readline())
        # if new chapter, add verses to previous chapter
        elif line.find(u'<perek>') != -1:
            # if first chapter, set current chapter but do nothing else
            if current_chapter != u'':
                verses[current_verse] = process_verse(raw_text)
                chapters[current_chapter] = convertDictToArray(verses)
                verses, raw_text = {}, u''
            # get next chapter number (Hebrew numeral on the following line)
            current_chapter = removeAllStrings([u'.', u'\n'], to_parse.readline())
            current_chapter = decode_hebrew_numeral(current_chapter)
            current_verse = u''
        # if new verse, process raw text and add to verses
        elif line.find(u'<pasuk>') != -1:
            # add previous verse if not first verse
            if current_verse != u'':
                verses[current_verse] = process_verse(raw_text)
                raw_text = u''
            # get next verse number (Hebrew numeral on the following line)
            current_verse = removeAllStrings([u'.', u'\n'], to_parse.readline())
            current_verse = decode_hebrew_numeral(current_verse)
        # don't include parsha tags
        elif line.find(u'<parsha>') != -1:
            continue
        else:
            # add to raw text
            raw_text += line
    # add final book (the loop above only flushes on seeing the NEXT tag)
    verses[current_verse] = process_verse(raw_text)
    chapters[current_chapter] = convertDictToArray(verses)
    full_text[current_book] = convertDictToArray(chapters)
    to_parse.close()
    return full_text
other_files = [ file for file in files if file not in bad_linking_files and file not in bad_section_files ] title = "Maharam" heTitle = u"""מהר"ם""" start = False #files = bad_section_files + bad_linking_files #files = ["makkot2.txt"] for file in ["avodah zarah2.txt"]: print file masechet = file.replace("2.txt", "").title() obj = Maharsha(masechet, title, heTitle, "http://proto.sefaria.org") len_masechet = len(Ref(masechet).text('he').text) obj.parseText(open(file), len_masechet) if len(obj.comm_dict) > 0: obj.create_index(masechet) text_to_post = convertDictToArray(obj.comm_dict) send_text = { "versionTitle": "Vilna Edition", "versionSource": "http://primo.nli.org.il/primo_library/libweb/action/dlDisplay.do?vid=NLI&docId=NNL_ALEPH001300957", "language": "he", "text": text_to_post, } post_text("{} on {}".format(title, masechet), send_text, "on", server=obj.server) obj.postLinks(masechet)
# Fragment: continuation of the War of the Jews chapter loop — `roman_num`,
# `ch_num`, `text`, `book_n`, `chapter`, `chapter_header`, `footnotes` and
# the enclosing loop (for the `continue`) are bound by code not shown here.
roman_num = int(roman_num)
if roman_num != ch_num + 1:
    # chapter numbering gap: flag the whole book and skip it
    text[book_n] = "Error at Chapter {}".format(roman_num)
    continue
lines = [
    check_for_footnote(line, footnotes[book_n])
    for line_n, line in enumerate(chapter.contents) if line != "\n"
]
# prefix each line with its 1-based verse number
lines = [(str(i + 1) + ". " + line) for i, line in enumerate(lines)]
text[book_n][roman_num] = lines
first_line = text[book_n][roman_num][0]
# bold chapter header is prepended to the first verse of the chapter
text[book_n][roman_num][
    0] = u"<b>" + chapter_header + u"</b><br/><br/>" + first_line
text[book_n] = convertDictToArray(text[book_n])
create_index()
text = convertDictToArray(text)
body_text = {
    "text": text,
    "language": "en",
    "versionTitle": "The War of the Jews, translated by William Whiston",
    "versionSource": "https://en.wikisource.org/wiki/The_War_of_the_Jews"
}
post_text("The War of the Jews", body_text, server="http://proto.sefaria.org")
def align_footnotes(books):
    """
    The footnotes need to be structured by book and chapter. Each footnote may refer to multiple verses.
    :param books: Dictionary, containing the entire JPS 1985 translation
    return: Dictionary, with books as keys and chapters as values. Each chapter is a list of
    dictionaries, with the key "footnote" set to the footnote and the key "links" being a list of
    verses where the footnote appears.
    """
    jps_footnotes = {}
    # define replacement dictionary (transliteration markers -> Unicode)
    replacements = {
        u'@': u'\u1e63',
        u'h%': u'\u1e25',
        u'H%': u'\u1e24',
        u'\n': u'',
        u'\r': u'',
        u'\n\r': u'',
    }
    # get list of book in tanach
    all_books = library.get_indexes_in_category('Tanach')
    # open footnote document and retrieve first footnote
    input_file = codecs.open('JPS1985_footnotes.txt', 'r', 'utf-8')
    note = {'footnote': functions.multiple_replace(input_file.readline(), replacements)}
    # iterate through Tanach
    for book in all_books:
        # set dictionary to with keys set to chapter numbers
        footnote_chaps = {}
        for chap_num, chapter in enumerate(books[book]):
            # set flag to indicate if any footnote markers have been found
            found_note = False
            # declare array to hold all note in chapter
            chap_notes = []
            # repeatedly loop through chapter, searching for cases where footnote appears
            # NOTE(review): if the current footnote's tag never matches any
            # verse in this chapter and the footnote is not a chapter opener,
            # this while-loop does not terminate — verify inputs are aligned.
            while True:
                # account for the case where a footnote is tagged "aa"
                if note['footnote'].find(u'aa') == 0:
                    tag = u'aa'
                else:
                    # footnote tags are single letters at the start of the line
                    tag = note['footnote'][0]
                found = []
                for verse_num, verse in enumerate(chapter):
                    if verse.find(u'[{}]'.format(tag)) != -1:
                        found.append(verse_num+1)
                        found_note = True
                # `found` is shared by reference, so late appends still land
                note['links'] = found
                # if footnote markers were found, get the next footnote
                if found_note:
                    chap_notes.append(note)
                    note = {'footnote': functions.multiple_replace(input_file.readline(), replacements)}
                    if note['footnote'] == u'':
                        # ran out of footnotes — close out this chapter
                        footnote_chaps[chap_num] = chap_notes
                        break
                    # if footnote begins with "a", this is a new chapter
                    try:
                        if note['footnote'][0] == u'a' and note['footnote'][1] != u'a':
                            footnote_chaps[chap_num] = chap_notes
                            break
                    except IndexError:
                        # single-character footnote line: malformed input
                        print 'error'
                        print note['footnote']
                        print u'{}, chapter {}'.format(book, chap_num+1)
                        input_file.close()
                        sys.exit(1)
        jps_footnotes[book] = functions.convertDictToArray(footnote_chaps)
    input_file.close()
    return jps_footnotes
# Fragment: body of the Antiquities scraping loop — `soup`, `count`, `books`,
# `ch_num`, `text` and `index` are bound by enclosing code not shown here.
ftnotes = get_ftnotes(soup)
lines = [render(p, ftnotes) for p in soup.find_all("p")]
for line_n, line in enumerate(lines):
    # a "Chapter N" heading line starts a new chapter bucket
    if line.lower().startswith("chapter") and len(
            line.split()) == 2:
        ch_num_should_be = get_ch_num(line)
        ch_num += 1
        if ch_num_should_be != ch_num:
            # heading out of sequence: report, but trust the heading's number
            print "{} vs {}".format(ch_num_should_be, ch_num)
        text[ch_num_should_be] = []
        continue
    elif ch_num == 0:  #skip lines before Chapters start
        continue
    # keep only non-blank lines
    if line.replace(" ", "") != "":
        text[ch_num_should_be].append(line)
text = convertDictToArray(text)
books[count] = text
books = convertDictToArray(books)
send_text = {
    "text": books,
    "language": "en",
    "versionTitle": "Wikisource",
    "versionSource": "https://en.wikisource.org/wiki/The_Antiquities_of_the_Jews"
}
post_index(index, server="http://proto.sefaria.org")
# NOTE(review): this chunk is truncated mid-call in the source.
post_text("The Antiquities of the Jews",
def align_footnotes(books):
    """
    The footnotes need to be structured by book and chapter. Each footnote may refer to multiple verses.
    :param books: Dictionary, containing the entire JPS 1985 translation
    return: Dictionary, with books as keys and chapters as values. Each chapter is a list of
    dictionaries, with the key "footnote" set to the footnote and the key "links" being a list of
    verses where the footnote appears.
    """
    jps_footnotes = {}
    # define replacement dictionary (transliteration markers -> Unicode)
    replacements = {
        u'@': u'\u1e63',
        u'h%': u'\u1e25',
        u'H%': u'\u1e24',
        u'\n': u'',
        u'\r': u'',
        u'\n\r': u'',
    }
    # get list of book in tanach
    all_books = library.get_indexes_in_category('Tanach')
    # open footnote document and retrieve first footnote
    input_file = codecs.open('JPS1985_footnotes.txt', 'r', 'utf-8')
    note = {
        'footnote':
        functions.multiple_replace(input_file.readline(), replacements)
    }
    # iterate through Tanach
    for book in all_books:
        # set dictionary to with keys set to chapter numbers
        footnote_chaps = {}
        for chap_num, chapter in enumerate(books[book]):
            # set flag to indicate if any footnote markers have been found
            found_note = False
            # declare array to hold all note in chapter
            chap_notes = []
            # repeatedly loop through chapter, searching for cases where footnote appears
            # NOTE(review): loop only exits via the `if found_note:` branch; a
            # footnote whose tag never matches this chapter would spin forever.
            while True:
                # account for the case where a footnote is tagged "aa"
                if note['footnote'].find(u'aa') == 0:
                    tag = u'aa'
                else:
                    tag = note['footnote'][0]
                found = []
                for verse_num, verse in enumerate(chapter):
                    if verse.find(u'[{}]'.format(tag)) != -1:
                        found.append(verse_num + 1)
                        found_note = True
                # shared by reference: later appends are still visible
                note['links'] = found
                # if footnote markers were found, get the next footnote
                if found_note:
                    chap_notes.append(note)
                    note = {
                        'footnote':
                        functions.multiple_replace(input_file.readline(),
                                                   replacements)
                    }
                    if note['footnote'] == u'':
                        # footnote file exhausted — close out this chapter
                        footnote_chaps[chap_num] = chap_notes
                        break
                    # if footnote begins with "a", this is a new chapter
                    try:
                        if note['footnote'][
                                0] == u'a' and note['footnote'][1] != u'a':
                            footnote_chaps[chap_num] = chap_notes
                            break
                    except IndexError:
                        # one-character footnote line: malformed input, abort
                        print 'error'
                        print note['footnote']
                        print u'{}, chapter {}'.format(book, chap_num + 1)
                        input_file.close()
                        sys.exit(1)
        jps_footnotes[book] = functions.convertDictToArray(footnote_chaps)
    input_file.close()
    return jps_footnotes
# Fragment: per-file body of the Antiquities scraping loop — `f`, `file`,
# `count`, `books`, `ch_num`, `text` and `index` come from code not shown.
print f
soup = BeautifulSoup(file, 'lxml')
ftnotes = get_ftnotes(soup)
lines = [render(p, ftnotes) for p in soup.find_all("p")]
for line_n, line in enumerate(lines):
    # a "Chapter N" heading line starts a new chapter bucket
    if line.lower().startswith("chapter") and len(line.split()) == 2:
        ch_num_should_be = get_ch_num(line)
        ch_num += 1
        if ch_num_should_be != ch_num:
            # heading out of sequence: report, but trust the heading's number
            print "{} vs {}".format(ch_num_should_be, ch_num)
        text[ch_num_should_be] = []
        continue
    elif ch_num == 0:  #skip lines before Chapters start
        continue
    # keep only non-blank lines
    if line.replace(" ", "") != "":
        text[ch_num_should_be].append(line)
text = convertDictToArray(text)
books[count] = text
books = convertDictToArray(books)
send_text = {
    "text": books,
    "language": "en",
    "versionTitle": "Wikisource",
    "versionSource": "https://en.wikisource.org/wiki/The_Antiquities_of_the_Jews"
}
post_index(index, server="http://proto.sefaria.org")
post_text("The Antiquities of the Jews",
          send_text,
          server="http://proto.sefaria.org")
def check_all_mishnayot_present_and_post(text, sefer, file_path, post=False):
    """Check that every mishnah of ``sefer`` appears in ``text``, restructure
    commentary/translation into arrays, and optionally post the translation.

    :param text: dict keyed by chapter (or "Introduction"); each chapter maps
        mishnah number -> (commentary, translation) pair.  Mutated in place.
    :param sefer: tractate name; normalized via convert_spellings.
    :param file_path: source file path (currently unused in this version).
    :param post: when True, actually POST to SERVER; otherwise dry-run.
    """
    versionTitle = "Mishnah Yomit"
    sefer = convert_spellings(sefer)

    def post_(text, path):
        send_text = {
            "language": "en",
            "text": text,
            "versionTitle": versionTitle,
            "versionSource": "http://learn.conservativeyeshiva.org/mishnah/"
        }
        try:
            if post:
                print SERVER
                post_text(path, send_text, server=SERVER)
        except UnicodeDecodeError:
            # Fall back to posting segment-by-segment so one undecodable
            # comment does not block the whole book.
            for ch_num, chapter in enumerate(text):
                for mishnah_num, mishnah in enumerate(chapter):
                    for comm_num, comment in enumerate(mishnah):
                        send_text = {
                            "language": "en",
                            "text": comment,
                            "versionTitle": versionTitle,
                            "versionSource": "http://learn.conservativeyeshiva.org/mishnah/"
                        }
                        try:
                            if post:
                                post_text("{} {}:{}:{}".format(
                                    path, ch_num + 1, mishnah_num + 1,
                                    comm_num + 1),
                                          send_text,
                                          server=SERVER)
                        except UnicodeDecodeError:
                            print "Error posting {}".format(
                                "{} {}:{}:{}".format(path, ch_num + 1,
                                                     mishnah_num + 1,
                                                     comm_num + 1))

    #first check that all chapters present
    # NOTE(review): convert_spellings was already applied above — redundant.
    sefer = convert_spellings(sefer)
    index = library.get_index("Mishnah " + sefer)
    en_title = "Mishnah Yomit on {}".format(index.title)
    # Shallow copy: `translation` shares the per-chapter dicts with `text`.
    translation = dict(text)
    for ch in text.keys():
        if ch == "Introduction":
            #if post:
            #    post_(text[ch], "{}, Introduction".format(en_title))
            text.pop(ch)
            translation.pop(ch)
            continue
        # Mishnah numbers that actually exist for this chapter on the server.
        actual_mishnayot = [
            el.sections[1] for el in Ref("Mishnah {} {}".format(
                sefer, ch)).all_segment_refs()
        ]
        our_mishnayot = text[ch].keys()
        if len(actual_mishnayot) > len(our_mishnayot):
            actual_mishnayot = set(actual_mishnayot)
            our_mishnayot = set(our_mishnayot)
            # NOTE(review): `missing` is computed but never reported or used.
            missing = actual_mishnayot - our_mishnayot
        # Split (commentary, translation) pairs into two parallel lists.
        text[ch] = zip(*convertDictToArray(text[ch], empty=("", "")))
        translation[ch] = list(text[ch][1])
        text[ch] = list(text[ch][0])
        # Placeholder gap entries become empty segments.
        while "" in text[ch]:
            i = text[ch].index("")
            text[ch][i] = []
    text = convertDictToArray(text)
    translation = convertDictToArray(translation)
    #if post:
    #    post_(text, en_title)
    # Each translation segment is a list of strings; join into one string.
    for ch, chapter in enumerate(translation):
        for m, mishnah in enumerate(chapter):
            translation[ch][m] = " ".join(mishnah)
    if post:
        post_(translation, index.title)
# Fragment: chapter-matching body from the War of the Jews parser —
# `title`, `relevant_text`, `chapter`, `chapter_header`, `footnotes`,
# `text`, `book_n`, `ch_num` and the loop the `continue` targets are bound
# by enclosing code not shown in this chunk.
# Walk backwards through siblings until a "Chapter ..." or "Footnotes" node.
while not relevant_text(title).startswith("Chapter") and not relevant_text(title) == "Footnotes":
    title = title.previous
if title.startswith("Chapter"):
    # chapter numbers may be roman numerals or plain digits
    roman_num = title.split()[-1]
    if not roman_num.isdigit():
        roman_num = roman_to_int(roman_num.encode('utf-8'))
    else:
        roman_num = int(roman_num)
    if roman_num != ch_num+1:
        # numbering gap: flag the whole book and skip it
        text[book_n] = "Error at Chapter {}".format(roman_num)
        continue
    lines = [check_for_footnote(line, footnotes[book_n])
             for line_n, line in enumerate(chapter.contents) if line != "\n"]
    # prefix each line with its 1-based verse number
    lines = [(str(i+1) + ". " + line) for i, line in enumerate(lines)]
    text[book_n][roman_num] = lines
    first_line = text[book_n][roman_num][0]
    # bold chapter header is prepended to the first verse of the chapter
    text[book_n][roman_num][0] = u"<b>"+chapter_header+u"</b><br/><br/>"+first_line
text[book_n] = convertDictToArray(text[book_n])
create_index()
text = convertDictToArray(text)
body_text = {
    "text": text,
    "language": "en",
    "versionTitle": "The War of the Jews, translated by William Whiston",
    "versionSource": "https://en.wikisource.org/wiki/The_War_of_the_Jews"
}
post_text("The War of the Jews", body_text, server="http://proto.sefaria.org")