Пример #1
0
def post_rashi(en_text, he_text, title, server):
    """Post the English and Hebrew Rashi versions for I Kings or II Kings.

    A ``title`` containing "II" is posted as "Rashi on II Kings"; any other
    title is posted as "Rashi on I Kings".

    :param en_text: mapping of chapter number to chapter content for the
        English version; every value is passed through convertDictToArray.
    :param he_text: same layout as ``en_text``, for the Hebrew version.
    :param title: source title, used only to choose between I and II Kings.
    :param server: forwarded to post_text as its ``server`` keyword.

    Note: the chapter values stored in ``en_text`` and ``he_text`` are
    replaced in place by their converted form.
    """
    title = "Rashi on II Kings" if "II" in title else "Rashi on I Kings"
    versions = (("en", en_text), ("he", he_text))
    for lang, chapters in versions:
        # Convert every chapter first, then the chapter mapping itself.
        for ch_num in chapters.keys():
            chapters[ch_num] = convertDictToArray(chapters[ch_num])
        payload = {
            "text": convertDictToArray(chapters),
            "language": lang,
            "versionTitle": "Metsudah {} -- {}".format(title, lang),
            "versionSource": "http://www.sefaria.org"
        }
        post_text(title, payload, server=server)
Пример #2
0
def check_all_mishnayot_present_and_post(text, sefer, file_path):
    """Report chapters whose mishnayot differ from the library, then post.

    For every chapter in ``text`` the set of mishnah numbers we hold is
    compared with the segment numbers the library reports for
    "Mishnah <sefer> <chapter>". Any difference is printed together with
    ``file_path`` so the source file can be checked by hand.

    Each chapter's entries are then split into their two components: the
    first component is handed to ``post_`` under "Mishnah Yomit on <title>",
    and the second, joined with single spaces, under the index title itself.

    :param text: dict keyed by chapter (plus an optional "Introduction" key).
        Each chapter value is a dict keyed by mishnah number whose values are
        2-item sequences. This dict is modified in place: "Introduction" is
        removed and every chapter is replaced by a list.
    :param sefer: tractate name without the "Mishnah " prefix.
    :param file_path: printed alongside any discrepancy report.
    """
    def post_(text, path):
        # Builds the version payload; the upload itself is currently disabled.
        send_text = {
            "language": "en",
            "text": text,
            "versionTitle": "Mishnah Yomit",
            "versionSource": "http://learn.conservativeyeshiva.org/mishnah/"
        }
        #post_text(path, send_text, server=SERVER)

    #first check that all chapters present
    index = library.get_index("Mishnah " + sefer)
    en_title = "Mishnah Yomit on {}".format(index.title)
    translation = dict(text)
    # Iterate over a snapshot of the keys: "Introduction" is popped from
    # ``text`` inside the loop.
    for ch in list(text.keys()):
        if ch == "Introduction":
            post_(text[ch], "{}, Introduction".format(en_title))
            text.pop(ch)
            translation.pop(ch)
            continue
        # Compare as sets: the order in which a dict yields its keys says
        # nothing about whether a mishnah is missing.
        actual_mishnayot = set(
            el.sections[1] for el in Ref("Mishnah {} {}".format(
                sefer, ch)).all_segment_refs())
        our_mishnayot = set(text[ch].keys())
        if our_mishnayot != actual_mishnayot:
            missing = actual_mishnayot - our_mishnayot
            wrong = our_mishnayot - actual_mishnayot
            print(file_path)
            print("Sefer: {}, Chapter: {}".format(sefer, ch))
            print("Mishnayot to check: {}".format(list(missing.union(wrong))))
            print("")
        # Transpose [(first, second), ...] into (firsts, seconds); gaps in
        # the numbering are filled with ("", "") by convertDictToArray.
        components = list(zip(*convertDictToArray(text[ch], empty=("", ""))))
        translation[ch] = list(components[1])
        # Empty-string placeholders in the first component become empty lists.
        text[ch] = [[] if entry == "" else entry for entry in components[0]]
    text = convertDictToArray(text)
    translation = convertDictToArray(translation)
    post_(text, en_title)
    for ch, chapter in enumerate(translation):
        for m, mishnah in enumerate(chapter):
            translation[ch][m] = " ".join(mishnah)
    post_(translation, index.title)
Пример #3
0
def parse_boaz(input_file):
    """
    Parse a Boaz source file into a nested array.

    The file is scanned twice with the same chapter-header pattern ("@00",
    then the word perek or its abbreviation, then one to three Hebrew letters
    or quote marks): once by file_to_ja to collect the text and once by
    grab_section_names to collect the header numerals.

    :param input_file: seekable file object holding the source text.
    :return: functions.convertDictToArray applied to the result of
        simple_to_complex for the collected headers and parsed text.
    """
    chapter_pattern = u'@00(?:\u05e4\u05e8\u05e7 |\u05e4")([\u05d0-\u05ea"]{1,3})'

    flat_ja = file_to_ja([[]], input_file, [chapter_pattern], boaz_align)

    # rewind so the second pass starts from the top of the file
    input_file.seek(0)

    chapter_numbers = []
    for numeral in grab_section_names(chapter_pattern, input_file, 1):
        chapter_numbers.append(functions.getGematria(numeral))

    by_chapter = simple_to_complex(chapter_numbers, flat_ja.array())
    return functions.convertDictToArray(by_chapter)
Пример #4
0
def create_index_and_post(full_intro, full_text):
    """Create the "Haflaah on Ketubot" index and post its Hebrew text.

    The schema has two children: an "Introduction" node with a single
    "Comment" level, and a default node addressed by Daf (Talmud) and
    Paragraph (Integer). After the index is posted, the introduction and
    the main text are posted as the same Hebrew version.

    :param full_intro: content posted to "Haflaah on Ketubot, Introduction".
    :param full_text: content for the default node; it is passed through
        convertDictToArray before being posted.
    """
    en_title = "Haflaah on Ketubot"

    def hebrew_version(content):
        # Both posts share the same version metadata; only the text differs.
        return {
            "text": content,
            "language": "he",
            "versionSource":
            "http://aleph.nli.org.il:80/F/?func=direct&doc_number=001880789&local_base=NNL01",
            "versionTitle": "Sefer Hafla'ah, Lemberg, 1860"
        }

    schema = SchemaNode()
    schema.add_primary_titles(en_title, u"הפלאה על כתובות")
    schema.key = "haflaah"

    intro_node = JaggedArrayNode()
    intro_node.add_shared_term("Introduction")
    intro_node.add_structure(["Comment"])
    intro_node.key = "intro"
    schema.append(intro_node)

    body_node = JaggedArrayNode()
    body_node.add_structure(["Daf", "Paragraph"],
                            address_types=["Talmud", "Integer"])
    body_node.default = True
    body_node.key = "default"
    schema.append(body_node)

    schema.validate()
    post_index(
        {
            "schema": schema.serialize(),
            "title": en_title,
            "categories": ["Talmud", "Bavli", "Commentary"],
            "dependence": "Commentary",
            "base_text_titles": ["Ketubot"]
        },
        server=SEFARIA_SERVER)

    # Convert the main text only after the index has been posted.
    full_text = convertDictToArray(full_text)

    post_text(en_title + ", Introduction",
              hebrew_version(full_intro),
              server=SEFARIA_SERVER)
    post_text(en_title, hebrew_version(full_text), server=SEFARIA_SERVER)
Пример #5
0
def parse_boaz(input_file):
    """
    Parse a Boaz source file into a nested array.

    The file is scanned twice with the same chapter-header pattern ("@00",
    then the word perek or its abbreviation, then one to three Hebrew letters
    or quote marks): once by file_to_ja to collect the text and once by
    grab_section_names to collect the header numerals.

    :param input_file: seekable file object holding the source text.
    :return: functions.convertDictToArray applied to the result of
        simple_to_complex for the collected headers and parsed text.
    """
    chapter_pattern = u'@00(?:\u05e4\u05e8\u05e7 |\u05e4")([\u05d0-\u05ea"]{1,3})'

    flat_ja = file_to_ja([[]], input_file, [chapter_pattern], boaz_align)

    # rewind so the second pass starts from the top of the file
    input_file.seek(0)

    chapter_numbers = []
    for numeral in grab_section_names(chapter_pattern, input_file, 1):
        chapter_numbers.append(functions.getGematria(numeral))

    by_chapter = simple_to_complex(chapter_numbers, flat_ja.array())
    return functions.convertDictToArray(by_chapter)
Пример #6
0
def create_index_and_post(full_intro, full_text):
    """Create the "Haflaah on Ketubot" index and post its Hebrew text.

    The schema has two children: an "Introduction" node with a single
    "Comment" level, and a default node addressed by Daf (Talmud) and
    Paragraph (Integer). After the index is posted, the introduction and
    the main text are posted as the same Hebrew version.

    :param full_intro: content posted to "Haflaah on Ketubot, Introduction".
    :param full_text: content for the default node; it is passed through
        convertDictToArray before being posted.
    """
    en_title = "Haflaah on Ketubot"

    def hebrew_version(content):
        # Both posts share the same version metadata; only the text differs.
        return {
            "text": content,
            "language": "he",
            "versionSource": "http://aleph.nli.org.il:80/F/?func=direct&doc_number=001880789&local_base=NNL01",
            "versionTitle": "Sefer Hafla'ah, Lemberg, 1860"
        }

    schema = SchemaNode()
    schema.add_primary_titles(en_title, u"הפלאה על כתובות")
    schema.key = "haflaah"

    intro_node = JaggedArrayNode()
    intro_node.add_shared_term("Introduction")
    intro_node.add_structure(["Comment"])
    intro_node.key = "intro"
    schema.append(intro_node)

    body_node = JaggedArrayNode()
    body_node.add_structure(["Daf", "Paragraph"], address_types=["Talmud", "Integer"])
    body_node.default = True
    body_node.key = "default"
    schema.append(body_node)

    schema.validate()
    post_index(
        {
            "schema": schema.serialize(),
            "title": en_title,
            "categories": ["Talmud", "Bavli", "Commentary"],
            "dependence": "Commentary",
            "base_text_titles": ["Ketubot"]
        },
        server=SEFARIA_SERVER)

    # Convert the main text only after the index has been posted.
    full_text = convertDictToArray(full_text)

    post_text(en_title + ", Introduction", hebrew_version(full_intro), server=SEFARIA_SERVER)
    post_text(en_title, hebrew_version(full_text), server=SEFARIA_SERVER)
Пример #7
0
def parse_text():
    """
    Parse 'chizkuni_no-tags.txt' (the result of strip_tags()) into a nested
    structure for easy upload.

    The file is read as a stream of marker lines. A line containing <book>,
    <perek> or <pasuk> announces that the *next* line holds the book name,
    chapter numeral or verse numeral respectively. Lines containing <parsha>
    are dropped, and every other line is accumulated as raw text of the
    current verse. Whenever a new verse, chapter or book starts, the pending
    verse is run through process_verse() and stored.

    The input file is closed even if one of the helpers raises.

    :return: Dictionary keyed by book name. Each value is the array form
        (via convertDictToArray) of that book's chapters, each chapter being
        the array form of its processed verses. The original author
        describes the result as depth 4.
    """

    # initiate data structure and variables
    full_text, chapters, verses, raw_text = {}, {}, {}, u''
    current_book, current_chapter, current_verse = u'', u'', u''

    with codecs.open('chizkuni_no-tags.txt', 'r', 'utf-8') as to_parse:

        for line in to_parse:

            # new book: flush the previous book (if any) into full_text
            if u'<book>' in line:

                # nothing to flush before the first book
                if current_book != u'':

                    # close the pending verse and chapter, then store the book
                    verses[current_verse] = process_verse(raw_text)
                    chapters[current_chapter] = convertDictToArray(verses)
                    full_text[current_book] = convertDictToArray(chapters)

                    # reset verses and chapters
                    chapters, verses, raw_text = {}, {}, u''
                    current_chapter, current_verse = u'', u''

                # the book name is on the line after the marker
                current_book = removeAllStrings([u'\n', u'\r', u' '], to_parse.readline())

            # new chapter: flush the verses of the previous chapter
            elif u'<perek>' in line:

                # nothing to flush before the first chapter of a book
                if current_chapter != u'':

                    verses[current_verse] = process_verse(raw_text)
                    chapters[current_chapter] = convertDictToArray(verses)
                    verses, raw_text = {}, u''

                # the chapter numeral is on the line after the marker
                current_chapter = removeAllStrings([u'.', u'\n'], to_parse.readline())
                current_chapter = decode_hebrew_numeral(current_chapter)
                current_verse = u''

            # new verse: process the raw text collected for the previous one
            elif u'<pasuk>' in line:

                # nothing to flush before the first verse of a chapter
                if current_verse != u'':
                    verses[current_verse] = process_verse(raw_text)
                    raw_text = u''

                # the verse numeral is on the line after the marker
                current_verse = removeAllStrings([u'.', u'\n'], to_parse.readline())
                current_verse = decode_hebrew_numeral(current_verse)

            # don't include parsha tags
            elif u'<parsha>' in line:
                continue

            else:

                # add to raw text
                raw_text += line

        # add final book
        verses[current_verse] = process_verse(raw_text)
        chapters[current_chapter] = convertDictToArray(verses)
        full_text[current_book] = convertDictToArray(chapters)

    return full_text
Пример #8
0
    other_files = [
        file for file in files
        if file not in bad_linking_files and file not in bad_section_files
    ]
    title = "Maharam"
    heTitle = u"""מהר"ם"""
    start = False

    #files = bad_section_files + bad_linking_files
    #files = ["makkot2.txt"]
    for file in ["avodah zarah2.txt"]:
        print file
        masechet = file.replace("2.txt", "").title()
        obj = Maharsha(masechet, title, heTitle, "http://proto.sefaria.org")
        len_masechet = len(Ref(masechet).text('he').text)
        obj.parseText(open(file), len_masechet)
        if len(obj.comm_dict) > 0:
            obj.create_index(masechet)
            text_to_post = convertDictToArray(obj.comm_dict)
            send_text = {
                "versionTitle": "Vilna Edition",
                "versionSource":
                "http://primo.nli.org.il/primo_library/libweb/action/dlDisplay.do?vid=NLI&docId=NNL_ALEPH001300957",
                "language": "he",
                "text": text_to_post,
            }
            post_text("{} on {}".format(title, masechet),
                      send_text,
                      "on",
                      server=obj.server)
            obj.postLinks(masechet)
Пример #9
0
                        roman_num = int(roman_num)
                    if roman_num != ch_num + 1:
                        text[book_n] = "Error at Chapter {}".format(roman_num)
                        continue

                    lines = [
                        check_for_footnote(line, footnotes[book_n])
                        for line_n, line in enumerate(chapter.contents)
                        if line != "\n"
                    ]
                    lines = [(str(i + 1) + ". " + line)
                             for i, line in enumerate(lines)]
                    text[book_n][roman_num] = lines
                    first_line = text[book_n][roman_num][0]
                    text[book_n][roman_num][
                        0] = u"<b>" + chapter_header + u"</b><br/><br/>" + first_line
            text[book_n] = convertDictToArray(text[book_n])

    create_index()
    text = convertDictToArray(text)

    body_text = {
        "text": text,
        "language": "en",
        "versionTitle": "The War of the Jews, translated by William Whiston",
        "versionSource": "https://en.wikisource.org/wiki/The_War_of_the_Jews"
    }
    post_text("The War of the Jews",
              body_text,
              server="http://proto.sefaria.org")
Пример #10
0
def align_footnotes(books):
    """
    Pair each JPS 1985 footnote with the verses in which its marker appears.

    Footnotes are read one line at a time from 'JPS1985_footnotes.txt'. Each
    footnote starts with its tag (a single letter, or "aa"), and verses refer
    to it with the marker "[<tag>]". A footnote whose text starts with a
    single "a" is taken as the first footnote of a new chapter.

    :param books: Dictionary, containing the entire JPS 1985 translation.
        books[book] is iterated as a sequence of chapters, each chapter as a
        sequence of verse strings.
    :return: Dictionary keyed by book. Each value is
        functions.convertDictToArray() of a dict keyed by zero-based chapter
        index, whose values are lists of note dictionaries with the key
        "footnote" set to the footnote text and the key "links" set to a list
        of 1-based verse numbers where the footnote marker appears.

    Prints diagnostics and exits the process with status 1 if a footnote is
    too short for its second character to be inspected.
    """

    jps_footnotes = {}

    # replacement table handed to functions.multiple_replace: ASCII stand-ins
    # map to dotted letters, and line-break characters map to the empty string
    replacements = {
        u'@': u'\u1e63',
        u'h%': u'\u1e25',
        u'H%': u'\u1e24',
        u'\n': u'',
        u'\r': u'',
        u'\n\r': u'',
    }

    # get the list of books in Tanach; each entry is used as a key into `books`
    all_books = library.get_indexes_in_category('Tanach')

    # open footnote document and retrieve first footnote
    input_file = codecs.open('JPS1985_footnotes.txt', 'r', 'utf-8')
    note = {'footnote': functions.multiple_replace(input_file.readline(), replacements)}

    # iterate through Tanach
    for book in all_books:

        # maps zero-based chapter index -> list of notes for that chapter
        footnote_chaps = {}

        for chap_num, chapter in enumerate(books[book]):

            # set once any marker has been found in this chapter; it is only
            # reset here, so after the first hit every following footnote is
            # attached to this chapter until the next single-"a" footnote
            found_note = False

            # declare array to hold all notes in chapter
            chap_notes = []

            # one pass per pending footnote: scan the chapter for its marker
            # NOTE(review): if the pending footnote is never found and does
            # not start with a single "a", nothing below changes state and
            # this loop does not terminate.
            while True:

                # account for the case where a footnote is tagged "aa";
                # any other footnote is identified by its first character only
                # NOTE(review): this index is outside the try block below, so
                # an empty footnote (end of file reached in an earlier
                # chapter) raises an uncaught IndexError here.
                if note['footnote'].find(u'aa') == 0:
                    tag = u'aa'
                else:
                    tag = note['footnote'][0]

                found = []
                for verse_num, verse in enumerate(chapter):
                    if verse.find(u'[{}]'.format(tag)) != -1:
                        found.append(verse_num+1)
                        found_note = True
                    # re-assigned on every verse; `found` is the same list, so
                    # the final value holds every match. Not set at all when
                    # the chapter has no verses.
                    note['links'] = found

                # if footnote markers were found, store the note and read the next one
                if found_note:
                    chap_notes.append(note)
                    note = {'footnote': functions.multiple_replace(input_file.readline(), replacements)}

                    # an empty footnote means the file is exhausted (or held a blank line)
                    if note['footnote'] == u'':
                        footnote_chaps[chap_num] = chap_notes
                        break

                # if footnote begins with a single "a", this is a new chapter
                try:
                    if note['footnote'][0] == u'a' and note['footnote'][1] != u'a':
                        footnote_chaps[chap_num] = chap_notes
                        break

                # the footnote is too short to look at its second character
                except IndexError:
                    print 'error'
                    print note['footnote']
                    print u'{}, chapter {}'.format(book, chap_num+1)
                    input_file.close()
                    sys.exit(1)

        jps_footnotes[book] = functions.convertDictToArray(footnote_chaps)

    input_file.close()
    return jps_footnotes
Пример #11
0
            ftnotes = get_ftnotes(soup)
            lines = [render(p, ftnotes) for p in soup.find_all("p")]
            for line_n, line in enumerate(lines):
                if line.lower().startswith("chapter") and len(
                        line.split()) == 2:
                    ch_num_should_be = get_ch_num(line)
                    ch_num += 1
                    if ch_num_should_be != ch_num:
                        print "{} vs {}".format(ch_num_should_be, ch_num)
                    text[ch_num_should_be] = []
                    continue
                elif ch_num == 0:  #skip lines before Chapters start
                    continue
                if line.replace(" ", "") != "":
                    text[ch_num_should_be].append(line)
        text = convertDictToArray(text)
        books[count] = text
    books = convertDictToArray(books)
    send_text = {
        "text":
        books,
        "language":
        "en",
        "versionTitle":
        "Wikisource",
        "versionSource":
        "https://en.wikisource.org/wiki/The_Antiquities_of_the_Jews"
    }
    post_index(index, server="http://proto.sefaria.org")

    post_text("The Antiquities of the Jews",
Пример #12
0
def align_footnotes(books):
    """
    Pair each JPS 1985 footnote with the verses in which its marker appears.

    Footnotes are read one line at a time from 'JPS1985_footnotes.txt'. Each
    footnote starts with its tag (a single letter, or "aa"), and verses refer
    to it with the marker "[<tag>]". A footnote whose text starts with a
    single "a" is taken as the first footnote of a new chapter.

    :param books: Dictionary, containing the entire JPS 1985 translation.
        books[book] is iterated as a sequence of chapters, each chapter as a
        sequence of verse strings.
    :return: Dictionary keyed by book. Each value is
        functions.convertDictToArray() of a dict keyed by zero-based chapter
        index, whose values are lists of note dictionaries with the key
        "footnote" set to the footnote text and the key "links" set to a list
        of 1-based verse numbers where the footnote marker appears.

    Prints diagnostics and exits the process with status 1 if a footnote is
    too short for its second character to be inspected.
    """

    jps_footnotes = {}

    # replacement table handed to functions.multiple_replace: ASCII stand-ins
    # map to dotted letters, and line-break characters map to the empty string
    replacements = {
        u'@': u'\u1e63',
        u'h%': u'\u1e25',
        u'H%': u'\u1e24',
        u'\n': u'',
        u'\r': u'',
        u'\n\r': u'',
    }

    # get the list of books in Tanach; each entry is used as a key into `books`
    all_books = library.get_indexes_in_category('Tanach')

    # open footnote document and retrieve first footnote
    input_file = codecs.open('JPS1985_footnotes.txt', 'r', 'utf-8')
    note = {
        'footnote': functions.multiple_replace(input_file.readline(),
                                               replacements)
    }

    # iterate through Tanach
    for book in all_books:

        # maps zero-based chapter index -> list of notes for that chapter
        footnote_chaps = {}

        for chap_num, chapter in enumerate(books[book]):

            # set once any marker has been found in this chapter; it is only
            # reset here, so after the first hit every following footnote is
            # attached to this chapter until the next single-"a" footnote
            found_note = False

            # declare array to hold all notes in chapter
            chap_notes = []

            # one pass per pending footnote: scan the chapter for its marker
            # NOTE(review): if the pending footnote is never found and does
            # not start with a single "a", nothing below changes state and
            # this loop does not terminate.
            while True:

                # account for the case where a footnote is tagged "aa";
                # any other footnote is identified by its first character only
                # NOTE(review): this index is outside the try block below, so
                # an empty footnote (end of file reached in an earlier
                # chapter) raises an uncaught IndexError here.
                if note['footnote'].find(u'aa') == 0:
                    tag = u'aa'
                else:
                    tag = note['footnote'][0]

                found = []
                for verse_num, verse in enumerate(chapter):
                    if verse.find(u'[{}]'.format(tag)) != -1:
                        found.append(verse_num + 1)
                        found_note = True
                    # re-assigned on every verse; `found` is the same list, so
                    # the final value holds every match. Not set at all when
                    # the chapter has no verses.
                    note['links'] = found

                # if footnote markers were found, store the note and read the next one
                if found_note:
                    chap_notes.append(note)
                    note = {
                        'footnote':
                        functions.multiple_replace(input_file.readline(),
                                                   replacements)
                    }

                    # an empty footnote means the file is exhausted (or held a blank line)
                    if note['footnote'] == u'':
                        footnote_chaps[chap_num] = chap_notes
                        break

                # if footnote begins with a single "a", this is a new chapter
                try:
                    if note['footnote'][
                            0] == u'a' and note['footnote'][1] != u'a':
                        footnote_chaps[chap_num] = chap_notes
                        break

                # the footnote is too short to look at its second character
                except IndexError:
                    print 'error'
                    print note['footnote']
                    print u'{}, chapter {}'.format(book, chap_num + 1)
                    input_file.close()
                    sys.exit(1)

        jps_footnotes[book] = functions.convertDictToArray(footnote_chaps)

    input_file.close()
    return jps_footnotes
Пример #13
0
            print f
            soup = BeautifulSoup(file, 'lxml')
            ftnotes = get_ftnotes(soup)
            lines = [render(p, ftnotes) for p in soup.find_all("p")]
            for line_n, line in enumerate(lines):
                if line.lower().startswith("chapter") and len(line.split()) == 2:
                    ch_num_should_be = get_ch_num(line)
                    ch_num += 1
                    if ch_num_should_be != ch_num:
                        print "{} vs {}".format(ch_num_should_be, ch_num)
                    text[ch_num_should_be] = []
                    continue
                elif ch_num == 0: #skip lines before Chapters start
                    continue
                if line.replace(" ", "") != "":
                    text[ch_num_should_be].append(line)
        text = convertDictToArray(text)
        books[count] = text
    books = convertDictToArray(books)
    send_text = {
        "text": books,
        "language": "en",
        "versionTitle": "Wikisource",
        "versionSource": "https://en.wikisource.org/wiki/The_Antiquities_of_the_Jews"
    }
    post_index(index, server="http://proto.sefaria.org")

    post_text("The Antiquities of the Jews", send_text, server="http://proto.sefaria.org")


Пример #14
0
def check_all_mishnayot_present_and_post(text, sefer, file_path, post=False):
    """Split Mishnah Yomit chapters into their two components and post the second.

    Every chapter in ``text`` maps mishnah numbers to 2-item sequences. The
    second component of each entry is joined with single spaces and, when
    ``post`` is true, posted as the "Mishnah Yomit" English version under the
    index title of "Mishnah <sefer>". The first component is only written
    back into ``text``; this function does not post it.

    If posting the whole text raises UnicodeDecodeError, the text is re-posted
    piece by piece so that only the offending segments are lost; each segment
    that still fails is reported with ``print``.

    :param text: dict keyed by chapter (plus an optional "Introduction" key).
        Modified in place: "Introduction" is removed and every chapter is
        replaced by a list.
    :param sefer: tractate name; normalised with convert_spellings().
    :param file_path: accepted for interface compatibility; not used here.
    :param post: when false (the default) nothing is sent to the server.
    """
    versionTitle = "Mishnah Yomit"
    versionSource = "http://learn.conservativeyeshiva.org/mishnah/"
    sefer = convert_spellings(sefer)

    def build_version(content):
        # Version payload shared by the bulk post and the per-segment fallback.
        return {
            "language": "en",
            "text": content,
            "versionTitle": versionTitle,
            "versionSource": versionSource
        }

    def post_segment(ref, content):
        # Post one segment; report (rather than raise) if it cannot be encoded.
        try:
            post_text(ref, build_version(content), server=SERVER)
        except UnicodeDecodeError:
            print("Error posting {}".format(ref))

    def post_(text, path):
        if not post:
            return
        try:
            print(SERVER)
            post_text(path, build_version(text), server=SERVER)
        except UnicodeDecodeError:
            # Fall back to one request per segment.
            for ch_num, chapter in enumerate(text):
                for mishnah_num, mishnah in enumerate(chapter):
                    mishnah_ref = "{} {}:{}".format(path, ch_num + 1,
                                                    mishnah_num + 1)
                    if isinstance(mishnah, (list, tuple)):
                        # Three-level text: one post per comment.
                        for comm_num, comment in enumerate(mishnah):
                            post_segment(
                                "{}:{}".format(mishnah_ref, comm_num + 1),
                                comment)
                    else:
                        # Two-level text: the mishnah is a single string and
                        # must be posted whole, not character by character.
                        post_segment(mishnah_ref, mishnah)

    #first check that all chapters present
    # NOTE(review): second normalisation kept from the original; it is
    # redundant if convert_spellings() is idempotent - confirm and drop.
    sefer = convert_spellings(sefer)
    index = library.get_index("Mishnah " + sefer)
    translation = dict(text)
    # Iterate over a snapshot of the keys: "Introduction" is popped from
    # ``text`` inside the loop.
    for ch in list(text.keys()):
        if ch == "Introduction":
            text.pop(ch)
            translation.pop(ch)
            continue
        # The chapter is still looked up in the library although the result
        # is no longer compared with our mishnayot; presumably this fails for
        # a chapter the library does not know - verify before removing.
        Ref("Mishnah {} {}".format(sefer, ch)).all_segment_refs()
        # Transpose [(first, second), ...] into (firsts, seconds); gaps in
        # the numbering are filled with ("", "") by convertDictToArray.
        components = list(zip(*convertDictToArray(text[ch], empty=("", ""))))
        translation[ch] = list(components[1])
        # Empty-string placeholders in the first component become empty lists.
        text[ch] = [[] if entry == "" else entry for entry in components[0]]
    # Result unused (the first component is not posted); the call is kept in
    # case convertDictToArray has effects on its argument.
    text = convertDictToArray(text)
    translation = convertDictToArray(translation)
    for ch, chapter in enumerate(translation):
        for m, mishnah in enumerate(chapter):
            translation[ch][m] = " ".join(mishnah)
    if post:
        post_(translation, index.title)
Пример #15
0
                while not relevant_text(title).startswith("Chapter") and not relevant_text(title) == "Footnotes":
                    title = title.previous

                if title.startswith("Chapter"):
                    roman_num = title.split()[-1]
                    if not roman_num.isdigit():
                        roman_num = roman_to_int(roman_num.encode('utf-8'))
                    else:
                        roman_num = int(roman_num)
                    if roman_num != ch_num+1:
                        text[book_n] = "Error at Chapter {}".format(roman_num)
                        continue

                    lines = [check_for_footnote(line, footnotes[book_n]) for line_n, line in enumerate(chapter.contents) if line != "\n"]
                    lines = [(str(i+1) + ". " + line) for i, line in enumerate(lines)]
                    text[book_n][roman_num] = lines
                    first_line = text[book_n][roman_num][0]
                    text[book_n][roman_num][0] = u"<b>"+chapter_header+u"</b><br/><br/>"+first_line
            text[book_n] = convertDictToArray(text[book_n])

    create_index()
    text = convertDictToArray(text)

    body_text = {
        "text": text,
        "language": "en",
        "versionTitle": "The War of the Jews, translated by William Whiston",
        "versionSource": "https://en.wikisource.org/wiki/The_War_of_the_Jews"
    }
    post_text("The War of the Jews", body_text, server="http://proto.sefaria.org")