示例#1
0
def check_segments():

    segments = []

    infile = codecs.open(filename, 'r', 'utf-8')

    headers = TagTester(u'@30', infile, u'@30מצוה ([\u05d0-\u05ea"]{1,5})').grab_each_header()
    tester = TagTester(u'@44', infile, u'@44\(([\u05d0-\u05ea]{1,2})\)')

    while not tester.eof:

        segments.append(tester.grab_each_header(u'@30מצוה ([\u05d0-\u05ea"]{1,5})', 1))

    infile.close()

    for sec_number, section in enumerate(segments):

        index = 1

        for title in section:

            title = title.replace(u'"', u'')
            count = util.getGematria(title)

            if count != index:

                print headers[sec_number-1]
                print util.numToHeb(index)
                index = count
            index += 1
示例#2
0
def check_segments():

    segments = []

    infile = codecs.open(filename, 'r', 'utf-8')

    headers = TagTester(u'@30', infile,
                        u'@30מצוה ([\u05d0-\u05ea"]{1,5})').grab_each_header()
    tester = TagTester(u'@44', infile, u'@44\(([\u05d0-\u05ea]{1,2})\)')

    while not tester.eof:

        segments.append(
            tester.grab_each_header(u'@30מצוה ([\u05d0-\u05ea"]{1,5})', 1))

    infile.close()

    for sec_number, section in enumerate(segments):

        index = 1

        for title in section:

            title = title.replace(u'"', u'')
            count = util.getGematria(title)

            if count != index:

                print headers[sec_number - 1]
                print util.numToHeb(index)
                index = count
            index += 1
示例#3
0
def check_chapters():
    with codecs.open('Minchat_Chinuch.txt', 'r', 'utf-8') as chinuch:
        test = TagTester(u'@30', chinuch, u'@30מצוה ([\u05d0-\u05ea"]{1,5})')

        index = 1

        for header in test.grab_each_header(capture_group=1):

            header = header.replace(u'"', u'')
            count = util.getGematria(header)

            if count != index:
                print util.numToHeb(index)
                index = count
            index += 1
示例#4
0
def check_chapters():
    with codecs.open('Minchat_Chinuch.txt', 'r', 'utf-8') as chinuch:
        test = TagTester(u'@30', chinuch, u'@30מצוה ([\u05d0-\u05ea"]{1,5})')

        index = 1

        for header in test.grab_each_header(capture_group=1):

            header = header.replace(u'"', u'')
            count = util.getGematria(header)

            if count != index:
                print util.numToHeb(index)
                index = count
            index += 1
示例#5
0
def raph_alignment_report(ja_smk, letter_ja):
    csv_lst = []
    lst_raph = []
    smk_siman = 0
    smk_pages = map_semak_page_siman(ja_smk, to_print=False)
    for seg in traverse_ja(ja_smk):
        for raph_l_in_smk in re.finditer(u'@55([\u05d0-\u05ea]{1,3})', seg['data']):
            lst_raph.append((raph_l_in_smk.group(1),
                             seg['data'][raph_l_in_smk.span()[0] - 20: raph_l_in_smk.span()[1] + 20],
                             (seg['indices'][0] + 1)))
    raph_11 = []
    for raph in traverse_ja(letter_ja):
        raph_11.append(raph)  # re.search(u'@11([\u05d0-\u05ea]{1,3})', raph['data']).group(1))
    page = 21
    prob = 0
    for raph, smk_l in zip(raph_11, lst_raph):

        print re.search(u'@11([\u05d0-\u05ea]{1,3})', raph['data']).group(1), smk_l[0], numToHeb(smk_l[2])
        csv_dict = {u'smk letter': smk_l[0], u'raph letter': re.search(u'@11([\u05d0-\u05ea]{1,3})', raph['data']).group(1),
                    u'smk words': smk_l[1], u'raph line': raph['data'], u'siman': numToHeb(smk_l[2]), u'aprx page in scan': smk_pages[numToHeb(smk_l[2])]}
        if re.search(u'@77', smk_l[1]):
            page += 1
        if re.search(u'@11([\u05d0-\u05ea]{1,3})', raph['data']).group(1) != smk_l[0]:
            prob += 1
            print "*"
            csv_dict['problem'] = True
            # break
        csv_lst.append(csv_dict)
    print 'prob', prob
    print 'done'
    toCSV(u'testcsvreport', csv_lst, [u'smk letter', u'raph letter', u'smk words',
                                u'raph line', u'siman', u'aprx page in scan', u'problem'])
    return csv_lst
示例#6
0
def hagahot_alignment(ja_smk, ja_raph, ja_hagahot):
    ja_smk = JaggedArray(ja_smk)
    ja_raph = JaggedArray(ja_raph)
    ja_hagahot = JaggedArray(ja_hagahot)
    # for i, seg_smk, j, seg_raph in zip(enumerate(ja_smk.array()), enumerate(ja_raph.array())):
    dict_lst = []
    dict = {u'siman': [], u'smk': [], u'raph': []}
    for i, seg in enumerate(zip(ja_smk.array(), ja_raph.array())):
        # print numToHeb(i+1)
        dict['siman'] = numToHeb(i + 1)
        for i, smk_line in enumerate(seg[0]):
            hag_lett = re.findall(ur'@88\((?P<gim>[\u05d0-\u05ea]{1,3})\)',
                                  smk_line)
            if hag_lett:
                dict['smk'].extend([(hag_l, i + 1) for hag_l in hag_lett])
                # print [getGematria(lett) for lett in hag_lett]
        # print 'RAPH'
        for i, raph_line in enumerate(seg[1]):
            hag_lett = re.findall(ur'@88\((?P<gim>[\u05d0-\u05ea]{1,3})\)',
                                  raph_line)
            if hag_lett:
                dict['raph'].extend([(hag_l, i + 1) for hag_l in hag_lett])
                # print [getGematria(lett) for lett in hag_lett]
        dict_lst.append(dict)
        dict = {u'siman': [], u'smk': [], u'raph': []}
    return dict_lst
def chapter_seven(number):
    """
    Build the ``SchemaNode`` for a chapter that holds an intro node and a shorash node.

    :param number: chapter number, used in the English title/key and converted
        to a Hebrew numeral for the Hebrew title
    :return: the populated ``SchemaNode``
    """
    english_title = 'Chapter {}'.format(number)
    hebrew_title = u'{} {}'.format(u'סימן', util.numToHeb(number))

    node = SchemaNode()
    node.add_title(english_title, "en", primary=True)
    node.add_title(hebrew_title, "he", primary=True)
    node.key = english_title

    # Children are created and attached one at a time, intro first
    node.append(create_intro_nodes())
    node.append(create_shorash_node())
    return node
def chapter_nine(number):
    """
    Build a depth-2 ``JaggedArrayNode`` (Section / Mitzvah) for the given chapter.

    :param number: chapter number, used in the English title/key and converted
        to a Hebrew numeral for the Hebrew title
    :return: the configured ``JaggedArrayNode``
    """
    english_title = 'Chapter {}'.format(number)
    hebrew_title = u'{} {}'.format(u'סימן', util.numToHeb(number))

    node = JaggedArrayNode()
    node.add_title(english_title, "en", primary=True)
    node.add_title(hebrew_title, "he", primary=True)
    node.key = english_title

    # Two integer-addressed levels
    node.depth = 2
    node.addressTypes = ["Integer", "Integer"]
    node.sectionNames = ["Section", "Mitzvah"]
    return node
示例#9
0
def generate_URLs(books):
    """
    Build one he.wikisource URL per section ref of every book in ``books``.

    :param books: iterable of book titles that ``library.get_index`` can resolve
    :return: list of ``(url, book_title, chapter_number)`` tuples, with chapter
        numbers starting at 1 in the order ``all_section_refs()`` yields them
    """
    opening = u"""https://he.wikisource.org/wiki/מלבי"ם_על_"""
    collected = []
    for book_title in books:
        index = library.get_index(book_title)
        hebrew_title = index.get_title('he')
        # Only the position of each section ref is needed, not the ref itself
        collected.extend(
            (u"{}{}_{}".format(opening, hebrew_title, numToHeb(chapter)),
             book_title,
             chapter)
            for chapter, _ in enumerate(index.all_section_refs(), 1))
    return collected
    def output(self, filename=u'temp_result.txt'):
        full_text = []
        for com_siman, base_siman in zip(self.commentary_simanim, self.base_simanim):
            if not self.almost_equals(com_siman['total'], base_siman['total_refs']):
                print "Divergence in siman {}".format(base_siman['num'])

            full_text.append(u'@12{}\n'.format(numToHeb(base_siman['num'])))
            full_text.extend(self.source_lines[com_siman['start']:com_siman['end']+1])
        with codecs.open(filename, 'w', 'utf-8') as outfile:
            outfile.writelines(full_text)
def regular_chapter_nodes(number):
    """
    Build a depth-1 ``JaggedArrayNode`` of comments for the given chapter.

    :param number: chapter number, used in the English title/key and converted
        to a Hebrew numeral for the Hebrew title
    :return: the configured ``JaggedArrayNode``
    """
    english_title = 'Chapter {}'.format(number)
    hebrew_title = u'{} {}'.format(u'סימן', util.numToHeb(number))

    node = JaggedArrayNode()
    node.add_title(english_title, "en", primary=True)
    node.add_title(hebrew_title, "he", primary=True)
    node.key = english_title

    # A single integer-addressed level
    node.depth = 1
    node.addressTypes = ["Integer"]
    node.sectionNames = ["Comment"]
    return node
    def output(self, filename=u'temp_result.txt'):
        full_text = []
        for com_siman, base_siman in zip(self.commentary_simanim, self.base_simanim):
            if not self.almost_equals(com_siman['total'], base_siman['total_refs']):
                print "Divergence in siman {}. {} in base and {} in commentary".format\
                    (base_siman['num'], base_siman['total_refs'], com_siman['total'])

            full_text.append(u'@12{}\n'.format(numToHeb(base_siman['num'])))
            full_text.extend(self.source_lines[com_siman['start']:com_siman['end']+1])
        with codecs.open(filename, 'w', 'utf-8') as outfile:
            outfile.writelines(full_text)
示例#13
0
def insert_chapter_marker(filename, safe_mode=False):
    """
    Insert a numbered ``@00`` chapter line before every line that starts with
    ``@22`` followed by the letter alef and then a space or the end of the line.

    :param filename: path of the UTF-8 text file to process
    :param safe_mode: when true, write the result to ``filename + '.tmp'``
        instead of overwriting ``filename``
    """
    with codecs.open(filename, 'r', 'utf-8') as source:
        original = source.readlines()

    chapter = 0
    result = []
    for text in original:
        if re.search(u'^@22\u05d0( |$)', text):
            chapter += 1
            # The marker line and the original line are stored as one entry
            result.append(u'@00\u05e4\u05e8\u05e7 {}\n{}'.format(numToHeb(chapter), text))
            continue
        result.append(text)

    target = filename + '.tmp' if safe_mode else filename
    with codecs.open(target, 'w', 'utf-8') as sink:
        sink.writelines(result)
示例#14
0
def insert_chapter_marker(filename, safe_mode=False):
    """
    Add a numbered ``@00`` chapter line in front of each line that begins with
    ``@22`` plus the letter alef, followed by a space or the end of the line.

    :param filename: path of the UTF-8 text file to process
    :param safe_mode: when true, the output goes to ``filename + '.tmp'`` and
        the input file is left as it was
    """
    alef_line = u'^@22\u05d0( |$)'
    chapter_template = u'@00\u05e4\u05e8\u05e7 {}\n{}'

    with codecs.open(filename, 'r', 'utf-8') as reader:
        source_lines = reader.readlines()

    chapters_seen = 0
    rewritten = []
    for source_line in source_lines:
        entry = source_line
        if re.search(alef_line, source_line) is not None:
            chapters_seen += 1
            # Prefix the matching line with its chapter marker line
            entry = chapter_template.format(numToHeb(chapters_seen), source_line)
        rewritten.append(entry)

    if safe_mode:
        filename = filename + '.tmp'
    with codecs.open(filename, 'w', 'utf-8') as writer:
        writer.writelines(rewritten)
示例#15
0
def fix_file(filepath, start_siman, test_mode=False):
    output_list = []
    with codecs.open(filepath, 'r', 'utf-8') as fp:
        lines = fp.readlines()
    counter = 0
    for line in lines:
        match = re.match(u'^@11([\u05d0-\u05ea]{1,3})$', line)
        if match and getGematria(match.group(1)) == 1:
            output_list.append(u'@00{}\n'.format(
                numToHeb(counter + start_siman)))
            counter += 1
        output_list.append(line)
    if test_mode:
        filepath = re.sub(ur'\.txt$', u'_test.txt', filepath)
    with codecs.open(filepath, 'w', 'utf-8') as fp:
        fp.writelines(output_list)
示例#16
0
def map_semak_page_siman(smk_ja, to_print=True):
    '''
    Create a dictionary from key: siman, value: page(s) that the siman is on.

    Pages are counted from the `@77` markers found in the segment text; the
    counter starts at 21 and is increased by one for every marker.

    :param smk_ja: smk ja parsed according to simanim @22
    :param to_print: when true, print every siman with its page list before returning
    :return: OrderedDict. keys: siman (he letter), value: list of pages the siman spans over. (pages according to scan -
    starts on p. 21)
    '''
    siman_page = OrderedDict()
    page_count = 21
    # NOTE(review): start_page is only ever switched to True below; the one
    # `start_page = False` sits in the branch where it is already false.
    # Confirm whether a reset in the else-branch was intended.
    start_page = False
    # Previous segment. The empty placeholder makes the trailing-@77 checks fail on the first pass.
    lst_seg = {'data': '', 'indices': []}
    for seg in traverse_ja(smk_ja):
        # One iteration per @77 marker in this segment (i and page themselves are not used)
        for i, page in enumerate(re.finditer(u'@77', seg['data'])):
            page_count += 1
            try:
                siman_page[numToHeb(seg['indices'][0] + 1)].append(page_count)
            except KeyError:
                # First page entry for this siman
                if not start_page:
                    # Record the page before the marker as well as the new page
                    siman_page[numToHeb(seg['indices'][0] +
                                        1)] = [page_count - 1, page_count]
                    start_page = False
                else:
                    siman_page[numToHeb(seg['indices'][0] + 1)] = [page_count]
            # The previous segment ended with a @77 marker (optionally followed by one space):
            # take the current page off the previous segment's siman.
            # NOTE(review): unlike the branch further down, a ValueError from remove() is not caught here.
            if re.search(u'@77 ?$', lst_seg['data']):
                start_page = True
                siman_page[numToHeb(lst_seg['indices'][0] +
                                    1)].remove(page_count)
        # Segment without any @77 marker
        if not list(re.finditer(u'@77', seg['data'])):
            try:
                # Lookup only; used to detect a siman that has no entry yet
                siman_page[numToHeb(seg['indices'][0] + 1)]
            except KeyError:
                siman_page[numToHeb(seg['indices'][0] + 1)] = [page_count]
            if re.search(u'@77 ?$', lst_seg['data']):
                start_page = True
                try:
                    siman_page[numToHeb(lst_seg['indices'][0] +
                                        1)].remove(page_count)
                except ValueError:
                    # The page was not listed for the previous siman; nothing to remove
                    pass
        lst_seg = seg
    if to_print:
        for k in siman_page.keys():
            print k, siman_page[k]
    return siman_page
示例#17
0
def map_semak_page_siman(smk_ja, to_print = True):
    '''
    Create a dictionary from key: siman, value: page(s) that the siman is on.

    The page counter starts at 21 and advances by one for each `@77` marker
    encountered in the segment text.

    :param smk_ja: smk ja parsed according to simanim @22
    :param to_print: when true, print every siman with its page list before returning
    :return: OrderedDict. keys: siman (he letter), value: list of pages the siman spans over. (pages according to scan -
    starts on p. 21)
    '''
    siman_page = OrderedDict()
    page_count = 21
    # NOTE(review): once set to True, start_page is never reset; the `start_page = False`
    # below only runs when it is already false. Confirm the intended behaviour.
    start_page = False
    # Holds the previous segment; the placeholder has empty data so nothing matches on the first pass
    lst_seg = {'data': '', 'indices': []}
    for seg in traverse_ja(smk_ja):
        # Loop once per @77 marker in the segment; the loop variables are not read
        for i, page in enumerate(re.finditer(u'@77', seg['data'])):
            page_count += 1
            try:
                siman_page[numToHeb(seg['indices'][0]+1)].append(page_count)
            except KeyError:
                # No entry yet for this siman
                if not start_page:
                    # Include the page preceding the marker together with the new one
                    siman_page[numToHeb(seg['indices'][0] + 1)] = [page_count - 1, page_count]
                    start_page = False
                else:
                    siman_page[numToHeb(seg['indices'][0]+1)] = [page_count]
            # Previous segment ended on a @77 marker (an optional single space may follow it):
            # drop the current page from that segment's siman.
            # NOTE(review): remove() may raise ValueError here; only the later branch guards against it.
            if re.search(u'@77 ?$', lst_seg['data']):
                start_page = True
                siman_page[numToHeb(lst_seg['indices'][0] + 1)].remove(page_count)
        # No @77 marker anywhere in this segment
        if not list(re.finditer(u'@77', seg['data'])):
            try:
                # Bare lookup, used only to trigger KeyError for a siman without an entry
                siman_page[numToHeb(seg['indices'][0]+1)]
            except KeyError:
                siman_page[numToHeb(seg['indices'][0] + 1)] = [page_count]
            if re.search(u'@77 ?$', lst_seg['data']):
                start_page = True
                try:
                    siman_page[numToHeb(lst_seg['indices'][0] + 1)].remove(page_count)
                except ValueError:
                    # Current page is not in the previous siman's list; leave it unchanged
                    pass
        lst_seg = seg
    if to_print:
        for k in siman_page.keys():
            print k, siman_page[k]
    return siman_page
示例#18
0
def hagahot_alignment(ja_smk, ja_raph, ja_hagahot):
    ja_smk = JaggedArray(ja_smk)
    ja_raph = JaggedArray(ja_raph)
    ja_hagahot = JaggedArray(ja_hagahot)
    # for i, seg_smk, j, seg_raph in zip(enumerate(ja_smk.array()), enumerate(ja_raph.array())):
    dict_lst = []
    dict = {u'siman':[], u'smk':[], u'raph':[]}
    for i, seg in enumerate(zip(ja_smk.array(), ja_raph.array())):
        # print numToHeb(i+1)
        dict['siman'] = numToHeb(i+1)
        for i, smk_line in enumerate(seg[0]):
            hag_lett = re.findall(ur'@88\((?P<gim>[\u05d0-\u05ea]{1,3})\)', smk_line)
            if hag_lett:
                dict['smk'].extend([(hag_l, i+1) for hag_l in hag_lett])
                # print [getGematria(lett) for lett in hag_lett]
        # print 'RAPH'
        for i, raph_line in enumerate(seg[1]):
            hag_lett = re.findall(ur'@88\((?P<gim>[\u05d0-\u05ea]{1,3})\)', raph_line)
            if hag_lett:
                dict['raph'].extend([(hag_l, i+1) for hag_l in hag_lett])
                # print [getGematria(lett) for lett in hag_lett]
        dict_lst.append(dict)
        dict = {u'siman': [], u'smk': [], u'raph': []}
    return dict_lst
示例#19
0
def raph_alignment_report(ja_smk, letter_ja):
    csv_lst = []
    lst_raph = []
    smk_siman = 0
    smk_pages = map_semak_page_siman(ja_smk, to_print=False)
    for seg in traverse_ja(ja_smk):
        for raph_l_in_smk in re.finditer(u'@55([\u05d0-\u05ea]{1,3})', seg['data']):
            lst_raph.append((raph_l_in_smk.group(1),
                             seg['data'][raph_l_in_smk.span()[0] - 20: raph_l_in_smk.span()[1] + 20],
                             (seg['indices'][0] + 1)))
    raph_11 = []
    for raph in traverse_ja(letter_ja):
        raph_11.append(raph)  # re.search(u'@11([\u05d0-\u05ea]{1,3})', raph['data']).group(1))
    page = 21
    prob = 0
    i = 0
    for raph, smk_l in zip(letter_ja, lst_raph):  # zip(raph_11, lst_raph):

        # print re.search(u'@11([\u05d0-\u05ea]{1,3})', raph['data']).group(1), smk_l[0], numToHeb(smk_l[2])
        csv_dict = {u'smk letter': smk_l[0],  u'raph': raph[i], u'siman': numToHeb(smk_l[2]), u'aprx page in scan': smk_pages[numToHeb(smk_l[2])]}
        # u'raph letter': re.search(u'@11([\u05d0-\u05ea]{1,3})', raph['data']).group(1), u'raph line': raph['data']
        # u'smk words': smk_l[1],
        i += 0
        if re.search(u'@77', smk_l[1]):
            page += 1
        # if re.search(u'@11([\u05d0-\u05ea]{1,3})', raph['data']).group(1) != smk_l[0]:
        #     prob += 1
        #     print "*"
        #     csv_dict['problem'] = True
        #     # break
        csv_lst.append(csv_dict)
    print 'prob', prob
    print 'done'
    toCSV(u'testcsvreport', csv_lst, [u'smk letter', u'raph',
                                 u'siman', u'aprx page in scan'])  #, u'problem', u'smk words',u'raph line',
    return csv_lst
示例#20
0
def test_num_to_heb():
    """Check ``util.numToHeb`` against known number / Hebrew-numeral pairs."""
    expectations = (
        (16, 'טז'),
        (962, 'תתקסב'),
    )
    for number, hebrew in expectations:
        assert util.numToHeb(number) == hebrew
def mark_simanim(volume_number):
    """
    Split the lines of one source file into simanim and write them, each
    preceded by an ``@12<Hebrew numeral>`` line, to ``temp_result.txt``.

    The file is looked up in the module-level ``filenames`` under the key
    ``'part_<volume_number>'``.  Lines starting with ``@11`` plus a Hebrew
    letter are counted against the number of ``@44`` references located in the
    current siman of the base text.  When the count goes one past the expected
    number on a line marked with alef, the collected lines are flushed and the
    next siman becomes current.  Any inconsistency prints a message, writes
    what was processed plus the untouched remainder, and stops.

    :param volume_number: selects the input file
    :return: None
    """

    def transition(index_a, index_b):
        # True when the current marker (index_b) is 1 although it does not follow
        # index_a by one step modulo 22.
        # NOTE(review): 22 is presumably the size of the Hebrew alphabet - confirm.
        return (index_b - index_a) % 22 != 1 and index_b == 1

    def terminate():
        # Write out everything gathered so far: flushed simanim, the marker and lines of the
        # current siman, and all input lines from the current one onward (the current line
        # has not been added to current_siman_text yet when this is called).
        full_text.append(u'@12{}\n'.format(numToHeb(current_siman.num)))
        full_text.extend(current_siman_text)
        full_text.extend(lines[line_num:])
        with codecs.open('temp_result.txt', 'w', 'utf-8') as outfile:
            outfile.writelines(full_text)

    def get_next_siman(siman_list):
        # Advance the iterator to the next siman with at least one @44 reference.
        # StopIteration from .next() propagates to the caller when none is left.
        siman, total_refs = None, 0
        while total_refs == 0:
            siman = siman_list.next()
            total_refs = len(siman.locate_references(u'@44'))
        return siman, total_refs


    with codecs.open(filenames['part_{}'.format(volume_number)], 'r', 'utf-8') as infile:
        lines = infile.readlines()

    # NOTE(review): the volume index is hard-coded to 1 and does not depend on volume_number - confirm.
    volume = Root('../../Orach_Chaim.xml').get_base_text().get_volume(1)
    simanim = iter(volume.get_child())
    current_siman, expected_refs = get_next_siman(simanim)

    full_text, current_siman_text = [],[]
    count = 0
    # (previous, current) values of he_ord for the last two @11 letters seen
    seif_markers = (None, None)

    for line_num, line in enumerate(lines):
        match = re.search(u'^@11([\u05d0-\u05ea])', line)
        if match:
            count += 1
            seif_markers = (seif_markers[1], he_ord(match.group(1)))

            # One marker more than the current siman has references
            if count - expected_refs == 1:
                if match.group(1) == u'א':
                    # Flush the finished siman; this line is the first of the next one
                    full_text.append(u'@12{}\n'.format(numToHeb(current_siman.num)))
                    full_text.extend(current_siman_text)
                    current_siman_text = []
                    count = 1
                    try:
                        current_siman, expected_refs = get_next_siman(simanim)
                    except StopIteration:
                        # current_siman still refers to the siman flushed just above,
                        # so terminate() writes its marker a second time.
                        print "Ran out of Simanim"
                        terminate()
                        return
                else:
                    print "Siman {}: Completed refs before transition occurred".format(current_siman.num)
                    terminate()
                    return
            elif None not in seif_markers and transition(*seif_markers):
                print "Siman {}: Transition occurred before completing refs".format(current_siman.num)
                terminate()
                return
        current_siman_text.append(line)
    # Input exhausted without inconsistencies: flush the last siman and write the result
    full_text.append(u'@12{}\n'.format(numToHeb(current_siman.num)))
    full_text.extend(current_siman_text)
    with codecs.open('temp_result.txt', 'w', 'utf-8') as outfile:
        outfile.writelines(full_text)
 def terminate():
     """
     Write the accumulated text, the marker and lines of the current siman,
     and the input lines from ``line_num`` onward to temp_result.txt.

     All data is read from free variables that the surrounding scope (not
     visible in this fragment) must provide.
     """
     siman_marker = u'@12{}\n'.format(numToHeb(current_siman.num))
     for part in ([siman_marker], current_siman_text, lines[line_num:]):
         full_text.extend(part)
     with codecs.open('temp_result.txt', 'w', 'utf-8') as result_file:
         result_file.writelines(full_text)
示例#23
0
# Put the directory `p` (defined before this excerpt) at the front of the import
# path; presumably this is where local_settings lives - confirm.
sys.path.insert(0, p)
from local_settings import *
# SEFARIA_PROJECT_PATH is not defined in this excerpt; presumably it comes from
# the star import above.
sys.path.insert(0, SEFARIA_PROJECT_PATH)
os.environ['DJANGO_SETTINGS_MODULE'] = "local_settings"

# Imported only after the path setup above.
from data_utilities.util import numToHeb

# Python 2 only: reload(sys) makes setdefaultencoding available again (it is
# removed from sys at interpreter start-up) so the default can be set to UTF-8.
reload(sys)
sys.setdefaultencoding("utf-8")


def wikiGet(url, title):

    try:
        opener = urllib2.build_opener()
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        page = opener.open(url)
        print "got", title
        with open("./pages/{}".format(title), "w") as file:
            file.write(page.read())

    except:
        print "page doesn't exist", title


# Fetch the printable page of every siman; there are 696 simanim in O.C
for siman in range(1, 696 + 1):
    title = "Biur_Halacha.{}".format(siman)
    wikiGet(
        u"https://he.wikisource.org/w/index.php?title=ביאור_הלכה_על_אורח_חיים_%s&printable=yes"
        % numToHeb(siman),
        title)
示例#24
0
 def repl(m):
     """
     Return an ``@00`` marker for the numeral captured in group 1 of ``m``,
     lowered by one when its value is at or above ``increment_start`` (a free
     variable supplied by the surrounding scope).
     """
     value = getGematria(m.group(1))
     adjusted = value - 1 if value >= increment_start else value
     return u'@00{}'.format(numToHeb(adjusted))
def mark_simanim(volume_number):
    """
    Group the lines of one source file by siman and write them to
    ``temp_result.txt``, each group headed by an ``@12<Hebrew numeral>`` line.

    The input path is ``filenames['part_<volume_number>']``.  Every line that
    starts with ``@11`` and a Hebrew letter increments a counter, which is
    compared with the number of ``@44`` references found in the current siman
    of the base text.  A line marked with alef that arrives exactly one past
    the expected count closes the current siman and opens the next.  On any
    mismatch a message is printed, the processed text plus the unprocessed
    remainder is written, and the function returns.

    :param volume_number: selects the input file
    :return: None
    """
    def transition(index_a, index_b):
        # True when index_b is 1 but is not the successor of index_a modulo 22.
        # NOTE(review): 22 presumably stands for the number of Hebrew letters - confirm.
        return (index_b - index_a) % 22 != 1 and index_b == 1

    def terminate():
        # Dump the state as it is: text flushed so far, the current siman's marker and lines,
        # then every input line starting at the current one (which has not yet been appended
        # to current_siman_text when this runs).
        full_text.append(u'@12{}\n'.format(numToHeb(current_siman.num)))
        full_text.extend(current_siman_text)
        full_text.extend(lines[line_num:])
        with codecs.open('temp_result.txt', 'w', 'utf-8') as outfile:
            outfile.writelines(full_text)

    def get_next_siman(siman_list):
        # Skip simanim in which no @44 reference is located; lets StopIteration
        # from .next() escape when the iterator is exhausted.
        siman, total_refs = None, 0
        while total_refs == 0:
            siman = siman_list.next()
            total_refs = len(siman.locate_references(u'@44'))
        return siman, total_refs

    with codecs.open(filenames['part_{}'.format(volume_number)], 'r',
                     'utf-8') as infile:
        lines = infile.readlines()

    # NOTE(review): get_volume(1) ignores volume_number - confirm this is intended.
    volume = Root('../../Orach_Chaim.xml').get_base_text().get_volume(1)
    simanim = iter(volume.get_child())
    current_siman, expected_refs = get_next_siman(simanim)

    full_text, current_siman_text = [], []
    count = 0
    # Pair of he_ord values: (previous @11 letter, current @11 letter)
    seif_markers = (None, None)

    for line_num, line in enumerate(lines):
        match = re.search(u'^@11([\u05d0-\u05ea])', line)
        if match:
            count += 1
            seif_markers = (seif_markers[1], he_ord(match.group(1)))

            # The counter has just passed the number of references of the current siman
            if count - expected_refs == 1:
                if match.group(1) == u'א':
                    # Close the current siman; the matched line starts the next one
                    full_text.append(u'@12{}\n'.format(
                        numToHeb(current_siman.num)))
                    full_text.extend(current_siman_text)
                    current_siman_text = []
                    count = 1
                    try:
                        current_siman, expected_refs = get_next_siman(simanim)
                    except StopIteration:
                        # current_siman was not replaced, so terminate() repeats the
                        # marker of the siman that was just flushed.
                        print "Ran out of Simanim"
                        terminate()
                        return
                else:
                    print "Siman {}: Completed refs before transition occurred".format(
                        current_siman.num)
                    terminate()
                    return
            elif None not in seif_markers and transition(*seif_markers):
                print "Siman {}: Transition occurred before completing refs".format(
                    current_siman.num)
                terminate()
                return
        current_siman_text.append(line)
    # All lines consumed: flush the final siman and write the complete result
    full_text.append(u'@12{}\n'.format(numToHeb(current_siman.num)))
    full_text.extend(current_siman_text)
    with codecs.open('temp_result.txt', 'w', 'utf-8') as outfile:
        outfile.writelines(full_text)
 def terminate():
     """
     Append the current siman (marker line plus collected lines) and the input
     lines from ``line_num`` onward to ``full_text``, then write ``full_text``
     to temp_result.txt.

     Every name used here is a free variable expected from the surrounding
     scope, which is not visible in this fragment.
     """
     header = u'@12{}\n'.format(numToHeb(current_siman.num))
     full_text.extend([header])
     full_text.extend(current_siman_text)
     remainder = lines[line_num:]
     full_text.extend(remainder)
     with codecs.open('temp_result.txt', 'w', 'utf-8') as sink:
         sink.writelines(full_text)