Exemplo n.º 1
0
def do_book_comparison(book_index_xml):
    """Compare one book of the parsed Leningrad Codex (WLC) with Sefaria's text.

    For the book described by ``book_index_xml`` this:

    * appends a line to ``results/length_comparison.txt`` when the four
      chapter counts (listed in the index, present in the parsed JSON,
      listed by Sefaria, present in the text returned by Sefaria) are not
      all the same;
    * appends a line to that same file for every indexed chapter whose verse
      counts (listed, parsed, Sefaria) disagree;
    * writes an HTML side-by-side diff of the two texts to
      ``results/<canonical name>_wlc_koren.html``.

    Args:
        book_index_xml: element supporting ``find``/``findall``. It is read
            for ``./names/name``, ``./cs`` and for ``c`` children that carry
            an ``n`` attribute and a ``vs`` child.

    Notes:
        Both output files are opened with ``with`` so they are closed even
        when a step raises. The diff file is only opened once the diff has
        been built, so a failure no longer leaves a truncated, empty file.
        A chapter number that falls outside one of the texts is counted as
        having 0 verses (and therefore reported) instead of raising
        ``IndexError``.
    """
    #check number of chapters.
    #compare to the parsed Leningrad itself
    #compare to the current sefaria versions.

    canonical_name = book_index_xml.find('./names/name').text
    print(canonical_name)

    wlc_chapter_count = int(book_index_xml.find(
        './cs').text)  #listed length of the leningrad chapters
    with open("preprocess_json/%s.json" % canonical_name, 'r') as filep:
        wlc_text = json.load(filep)['text']
    wlc_real_chapter_count = len(
        wlc_text)  #physical length of the parsed leningrad chapters

    sefaria_book = Helper.getKnownTexts(canonical_name)
    sefria_chapter_count = sefaria_book['lengths'][0]
    sefaria_text = Helper.api_get_text(
        "%s 1-%s" % (sefaria_book['title'], sefria_chapter_count), 'he',
        "Tanach with Ta'amei Hamikra")['he']
    if sefria_chapter_count == 1:
        # NOTE(review): presumably a one-chapter request comes back as a flat
        # list of verses; wrap it so it is indexed by chapter like wlc_text.
        sefaria_text = [sefaria_text]
    sefaria_real_chapter_count = len(sefaria_text)

    def verses_in(text, ch_num):
        # An out-of-range chapter is exactly the kind of mismatch this
        # function reports, so count it as empty rather than raising
        # IndexError (or, for chapter 0, wrapping around to the last one).
        if 1 <= ch_num <= len(text):
            return len(text[ch_num - 1])
        return 0

    with open("results/length_comparison.txt", 'ab+') as length_results:
        if not all_same([
                wlc_chapter_count, wlc_real_chapter_count,
                sefria_chapter_count, sefaria_real_chapter_count
        ]):
            ch_res_str = "%s: Leningrad has %s chapters listed and %s chapters in text. Sefaria version has %s chapters listed and %s chapters in text\n" % (
                canonical_name.encode('utf-8'), wlc_chapter_count,
                wlc_real_chapter_count, sefria_chapter_count,
                sefaria_real_chapter_count)
            length_results.write(ch_res_str)

        for chapter in book_index_xml.findall('c'):
            ch_num = int(chapter.get('n'))
            wlc_verse_count = int(chapter.find('vs').text)
            wlc_real_verse_count = verses_in(wlc_text, ch_num)
            sefaria_real_verse_count = verses_in(sefaria_text, ch_num)
            if not all_same([
                    wlc_verse_count, wlc_real_verse_count,
                    sefaria_real_verse_count
            ]):
                v_res_str = "%s:%s Leningrad has %s verses listed and %s verses in text. Sefaria version has %s verses\n" % (
                    canonical_name.encode('utf-8'), ch_num, wlc_verse_count,
                    wlc_real_verse_count, sefaria_real_verse_count)
                length_results.write(v_res_str)

    html_diff = difflib.HtmlDiff().make_file(
        flatten_text(make_consonantal_text(wlc_text)),
        flatten_text(make_consonantal_text(sefaria_text)), 'Leningrad Codex',
        'Sefaria/Koren')
    # The page is written as UTF-8 bytes below, so the charset declared in
    # the generated HTML has to say so as well.
    html_diff = html_diff.replace('charset=ISO-8859-1', 'charset=utf-8')
    with open("results/%s_wlc_koren.html" % canonical_name,
              'wb+') as diff_file:
        diff_file.write(html_diff.encode('utf-8'))
def run_post_to_api(sub_directory=None):
    """Hand the preprocessed JSON of every indexed book to ``Helper.postText``.

    Args:
        sub_directory: optional folder name under ``preprocess_json``. When
            it is falsy the JSON files are read from ``preprocess_json``
            itself.

    For each ``book`` element under ``tanach`` in ``source/TanachIndex.xml``
    the book name is printed, ``<directory>/<name>.json`` is read as raw
    text, and that text is passed to ``Helper.postText`` together with the
    title that ``Helper.getKnownTexts`` reports for the book.
    """
    if sub_directory:
        json_dir = "preprocess_json/%s" % sub_directory
    else:
        json_dir = "preprocess_json"

    # this lists num of chapters and verses for all books in the WLC
    index_root = ET.parse("source/TanachIndex.xml").getroot()
    tanach_node = index_root.find("tanach")

    for book_node in tanach_node.findall("book"):
        book_name = book_node.find("./names/name").text
        print(book_name)

        json_path = "%s/%s.json" % (json_dir, book_name)
        with open(json_path, "r") as json_file:
            payload = json_file.read()

        known_text = Helper.getKnownTexts(book_name)
        # NOTE(review): the meaning of the trailing False is defined by
        # Helper.postText, which is not visible here.
        Helper.postText(known_text["title"], payload, False)
Exemplo n.º 3
0
def run_post_to_api(sub_directory=None):
    """Read each indexed book's JSON file and pass it to ``Helper.postText``.

    Args:
        sub_directory: optional name of a folder inside ``preprocess_json``
            to read from; a falsy value means ``preprocess_json`` itself.

    The list of books comes from the ``book`` elements under ``tanach`` in
    ``source/TanachIndex.xml``. Each book name is printed before its file is
    read, and the file content is forwarded unparsed.
    """
    source_dir = "preprocess_json"
    if sub_directory:
        source_dir = "preprocess_json/%s" % sub_directory

    #this lists num of chapters and verses for all books in the WLC
    tanach = ET.parse('source/TanachIndex.xml').getroot().find('tanach')

    for entry in tanach.findall('book'):
        name = entry.find('./names/name').text
        print(name)

        with open("%s/%s.json" % (source_dir, name), 'r') as handle:
            raw_json = handle.read()

        # The title used for posting is the one Helper.getKnownTexts returns,
        # not the name taken from the index.
        title = Helper.getKnownTexts(name)['title']
        Helper.postText(title, raw_json, False)
def do_book_comparison(book_index_xml):
    """Compare one book of the parsed Leningrad Codex (WLC) with Sefaria's text.

    For the book described by ``book_index_xml`` this:

    * appends a line to ``results/length_comparison.txt`` when the four
      chapter counts (listed in the index, present in the parsed JSON,
      listed by Sefaria, present in the text returned by Sefaria) are not
      all the same;
    * appends a line to that same file for every indexed chapter whose verse
      counts (listed, parsed, Sefaria) disagree;
    * writes an HTML side-by-side diff of the two texts to
      ``results/<canonical name>_wlc_koren.html``.

    Args:
        book_index_xml: element supporting ``find``/``findall``. It is read
            for ``./names/name``, ``./cs`` and for ``c`` children that carry
            an ``n`` attribute and a ``vs`` child.

    Notes:
        Both output files are opened with ``with`` so they are closed even
        when a step raises. The diff file is only opened once the diff has
        been built, so a failure no longer leaves a truncated, empty file.
        A chapter number that falls outside one of the texts is counted as
        having 0 verses (and therefore reported) instead of raising
        ``IndexError``.
    """
    # check number of chapters.
    # compare to the parsed Leningrad itself
    # compare to the current sefaria versions.

    canonical_name = book_index_xml.find("./names/name").text
    print(canonical_name)

    wlc_chapter_count = int(book_index_xml.find("./cs").text)  # listed length of the leningrad chapters
    with open("preprocess_json/%s.json" % canonical_name, "r") as filep:
        wlc_text = json.load(filep)["text"]
    wlc_real_chapter_count = len(wlc_text)  # physical length of the parsed leningrad chapters

    sefaria_book = Helper.getKnownTexts(canonical_name)
    sefria_chapter_count = sefaria_book["lengths"][0]
    sefaria_text = Helper.api_get_text(
        "%s 1-%s" % (sefaria_book["title"], sefria_chapter_count), "he", "Tanach with Ta'amei Hamikra"
    )["he"]
    if sefria_chapter_count == 1:
        # NOTE(review): presumably a one-chapter request comes back as a flat
        # list of verses; wrap it so it is indexed by chapter like wlc_text.
        sefaria_text = [sefaria_text]
    sefaria_real_chapter_count = len(sefaria_text)

    def verses_in(text, ch_num):
        # An out-of-range chapter is exactly the kind of mismatch this
        # function reports, so count it as empty rather than raising
        # IndexError (or, for chapter 0, wrapping around to the last one).
        if 1 <= ch_num <= len(text):
            return len(text[ch_num - 1])
        return 0

    with open("results/length_comparison.txt", "ab+") as length_results:
        if not all_same([wlc_chapter_count, wlc_real_chapter_count, sefria_chapter_count, sefaria_real_chapter_count]):
            ch_res_str = (
                "%s: Leningrad has %s chapters listed and %s chapters in text. Sefaria version has %s chapters listed and %s chapters in text\n"
                % (
                    canonical_name.encode("utf-8"),
                    wlc_chapter_count,
                    wlc_real_chapter_count,
                    sefria_chapter_count,
                    sefaria_real_chapter_count,
                )
            )
            length_results.write(ch_res_str)

        for chapter in book_index_xml.findall("c"):
            ch_num = int(chapter.get("n"))
            wlc_verse_count = int(chapter.find("vs").text)
            wlc_real_verse_count = verses_in(wlc_text, ch_num)
            sefaria_real_verse_count = verses_in(sefaria_text, ch_num)
            if not all_same([wlc_verse_count, wlc_real_verse_count, sefaria_real_verse_count]):
                v_res_str = (
                    "%s:%s Leningrad has %s verses listed and %s verses in text. Sefaria version has %s verses\n"
                    % (
                        canonical_name.encode("utf-8"),
                        ch_num,
                        wlc_verse_count,
                        wlc_real_verse_count,
                        sefaria_real_verse_count,
                    )
                )
                length_results.write(v_res_str)

    html_diff = difflib.HtmlDiff().make_file(
        flatten_text(make_consonantal_text(wlc_text)),
        flatten_text(make_consonantal_text(sefaria_text)),
        "Leningrad Codex",
        "Sefaria/Koren",
    )
    # The page is written as UTF-8 bytes below, so the charset declared in
    # the generated HTML has to say so as well.
    html_diff = html_diff.replace("charset=ISO-8859-1", "charset=utf-8")
    with open("results/%s_wlc_koren.html" % canonical_name, "wb+") as diff_file:
        diff_file.write(html_diff.encode("utf-8"))