def check_segments(): segments = [] infile = codecs.open(filename, 'r', 'utf-8') headers = TagTester(u'@30', infile, u'@30מצוה ([\u05d0-\u05ea"]{1,5})').grab_each_header() tester = TagTester(u'@44', infile, u'@44\(([\u05d0-\u05ea]{1,2})\)') while not tester.eof: segments.append(tester.grab_each_header(u'@30מצוה ([\u05d0-\u05ea"]{1,5})', 1)) infile.close() for sec_number, section in enumerate(segments): index = 1 for title in section: title = title.replace(u'"', u'') count = util.getGematria(title) if count != index: print headers[sec_number-1] print util.numToHeb(index) index = count index += 1
def check_segments():
    # Verify that the @44 segment letters inside each siman follow the
    # expected gematria sequence; print the siman header and expected
    # letter wherever the sequence breaks.
    segments = []
    infile = codecs.open(filename, 'r', 'utf-8')
    # Siman headers (@30) are collected up front; segment letters (@44)
    # are then grabbed siman by siman until EOF.
    headers = TagTester(u'@30', infile, u'@30מצוה ([\u05d0-\u05ea"]{1,5})').grab_each_header()
    tester = TagTester(u'@44', infile, u'@44\(([\u05d0-\u05ea]{1,2})\)')
    while not tester.eof:
        segments.append(
            tester.grab_each_header(u'@30מצוה ([\u05d0-\u05ea"]{1,5})', 1))
    infile.close()
    for sec_number, section in enumerate(segments):
        index = 1
        for title in section:
            title = title.replace(u'"', u'')  # strip gershayim before gematria
            count = util.getGematria(title)
            if count != index:
                # NOTE(review): when sec_number == 0 this indexes headers[-1]
                # (the LAST header) — looks like an off-by-one; confirm whether
                # headers[sec_number] was intended.
                print headers[sec_number - 1]
                print util.numToHeb(index)
                index = count  # resync to keep reporting later gaps
            index += 1
def check_chapters(): with codecs.open('Minchat_Chinuch.txt', 'r', 'utf-8') as chinuch: test = TagTester(u'@30', chinuch, u'@30מצוה ([\u05d0-\u05ea"]{1,5})') index = 1 for header in test.grab_each_header(capture_group=1): header = header.replace(u'"', u'') count = util.getGematria(header) if count != index: print util.numToHeb(index) index = count index += 1
def raph_alignment_report(ja_smk, letter_ja):
    # Align the @55 Raph reference letters found inside the Smk text against
    # the @11 letters of the Raph text itself, print each pairing, and write
    # a CSV report flagging mismatches. Returns the list of CSV row dicts.
    csv_lst = []
    lst_raph = []
    smk_siman = 0  # NOTE(review): assigned but never used
    smk_pages = map_semak_page_siman(ja_smk, to_print=False)
    # Collect every @55 reference in the Smk with ~20 chars of surrounding
    # context and the 1-based siman index it appears in.
    for seg in traverse_ja(ja_smk):
        for raph_l_in_smk in re.finditer(u'@55([\u05d0-\u05ea]{1,3})', seg['data']):
            lst_raph.append((raph_l_in_smk.group(1),
                             seg['data'][raph_l_in_smk.span()[0] - 20: raph_l_in_smk.span()[1] + 20],
                             (seg['indices'][0] + 1)))
    raph_11 = []
    for raph in traverse_ja(letter_ja):
        raph_11.append(raph)  # re.search(u'@11([\u05d0-\u05ea]{1,3})', raph['data']).group(1))
    page = 21  # scan pagination starts on p. 21 (see map_semak_page_siman)
    prob = 0  # count of letter mismatches
    for raph, smk_l in zip(raph_11, lst_raph):
        print re.search(u'@11([\u05d0-\u05ea]{1,3})', raph['data']).group(1), smk_l[0], numToHeb(smk_l[2])
        csv_dict = {u'smk letter': smk_l[0],
                    u'raph letter': re.search(u'@11([\u05d0-\u05ea]{1,3})', raph['data']).group(1),
                    u'smk words': smk_l[1],
                    u'raph line': raph['data'],
                    u'siman': numToHeb(smk_l[2]),
                    u'aprx page in scan': smk_pages[numToHeb(smk_l[2])]}
        if re.search(u'@77', smk_l[1]):
            page += 1  # @77 inside the context window marks a page break
        if re.search(u'@11([\u05d0-\u05ea]{1,3})', raph['data']).group(1) != smk_l[0]:
            prob += 1
            print "*"
            csv_dict['problem'] = True
            # break
        csv_lst.append(csv_dict)
    print 'prob', prob
    print 'done'
    toCSV(u'testcsvreport', csv_lst,
          [u'smk letter', u'raph letter', u'smk words', u'raph line', u'siman',
           u'aprx page in scan', u'problem'])
    return csv_lst
def hagahot_alignment(ja_smk, ja_raph, ja_hagahot): ja_smk = JaggedArray(ja_smk) ja_raph = JaggedArray(ja_raph) ja_hagahot = JaggedArray(ja_hagahot) # for i, seg_smk, j, seg_raph in zip(enumerate(ja_smk.array()), enumerate(ja_raph.array())): dict_lst = [] dict = {u'siman': [], u'smk': [], u'raph': []} for i, seg in enumerate(zip(ja_smk.array(), ja_raph.array())): # print numToHeb(i+1) dict['siman'] = numToHeb(i + 1) for i, smk_line in enumerate(seg[0]): hag_lett = re.findall(ur'@88\((?P<gim>[\u05d0-\u05ea]{1,3})\)', smk_line) if hag_lett: dict['smk'].extend([(hag_l, i + 1) for hag_l in hag_lett]) # print [getGematria(lett) for lett in hag_lett] # print 'RAPH' for i, raph_line in enumerate(seg[1]): hag_lett = re.findall(ur'@88\((?P<gim>[\u05d0-\u05ea]{1,3})\)', raph_line) if hag_lett: dict['raph'].extend([(hag_l, i + 1) for hag_l in hag_lett]) # print [getGematria(lett) for lett in hag_lett] dict_lst.append(dict) dict = {u'siman': [], u'smk': [], u'raph': []} return dict_lst
def chapter_seven(number):
    """Build the SchemaNode for chapter *number*: titled in English and
    Hebrew, holding the intro nodes followed by the shorash node."""
    english = 'Chapter {}'.format(number)
    node = SchemaNode()
    node.add_title(english, "en", primary=True)
    node.add_title(u'{} {}'.format(u'סימן', util.numToHeb(number)), "he", primary=True)
    node.key = english
    node.append(create_intro_nodes())
    node.append(create_shorash_node())
    return node
def chapter_nine(number):
    """Build a depth-2 JaggedArrayNode (Section x Mitzvah) for chapter *number*."""
    english = 'Chapter {}'.format(number)
    node = JaggedArrayNode()
    node.add_title(english, "en", primary=True)
    node.add_title(u'{} {}'.format(u'סימן', util.numToHeb(number)), "he", primary=True)
    node.key = english
    node.depth = 2
    node.addressTypes = ["Integer", "Integer"]
    node.sectionNames = ["Section", "Mitzvah"]
    return node
def generate_URLs(books):
    """Return a list of (url, book_title, perek_number) tuples — one Malbim
    wikisource URL per perek of every book title in *books*."""
    opening = u"""https://he.wikisource.org/wiki/מלבי"ם_על_"""
    urls = []
    for book_title in books:
        index_record = library.get_index(book_title)
        hebrew_name = index_record.get_title('he')
        for perek_n, perek in enumerate(index_record.all_section_refs()):
            chapter_num = perek_n + 1
            full_url = u"{}{}_{}".format(opening, hebrew_name, numToHeb(chapter_num))
            urls.append((full_url, book_title, chapter_num))
    return urls
def output(self, filename=u'temp_result.txt'): full_text = [] for com_siman, base_siman in zip(self.commentary_simanim, self.base_simanim): if not self.almost_equals(com_siman['total'], base_siman['total_refs']): print "Divergence in siman {}".format(base_siman['num']) full_text.append(u'@12{}\n'.format(numToHeb(base_siman['num']))) full_text.extend(self.source_lines[com_siman['start']:com_siman['end']+1]) with codecs.open(filename, 'w', 'utf-8') as outfile: outfile.writelines(full_text)
def regular_chapter_nodes(number):
    """Build a depth-1 JaggedArrayNode ("Comment" sections) for chapter *number*."""
    english = 'Chapter {}'.format(number)
    node = JaggedArrayNode()
    node.add_title(english, "en", primary=True)
    node.add_title(u'{} {}'.format(u'סימן', util.numToHeb(number)), "he", primary=True)
    node.key = english
    node.depth = 1
    node.addressTypes = ["Integer"]
    node.sectionNames = ["Comment"]
    return node
def output(self, filename=u'temp_result.txt'): full_text = [] for com_siman, base_siman in zip(self.commentary_simanim, self.base_simanim): if not self.almost_equals(com_siman['total'], base_siman['total_refs']): print "Divergence in siman {}. {} in base and {} in commentary".format\ (base_siman['num'], base_siman['total_refs'], com_siman['total']) full_text.append(u'@12{}\n'.format(numToHeb(base_siman['num']))) full_text.extend(self.source_lines[com_siman['start']:com_siman['end']+1]) with codecs.open(filename, 'w', 'utf-8') as outfile: outfile.writelines(full_text)
def insert_chapter_marker(filename, safe_mode=False):
    """Insert a numbered "@00 perek <n>" header before every chapter-opening
    line (one starting with @22 + aleph). With safe_mode, write to
    <filename>.tmp instead of overwriting."""
    with codecs.open(filename, 'r', 'utf-8') as source:
        original_lines = source.readlines()
    chapter_num = 0
    result = []
    for text_line in original_lines:
        if re.search(u'^@22\u05d0( |$)', text_line) is None:
            result.append(text_line)
        else:
            chapter_num += 1
            result.append(u'@00\u05e4\u05e8\u05e7 {}\n{}'.format(numToHeb(chapter_num), text_line))
    if safe_mode:
        filename += '.tmp'
    with codecs.open(filename, 'w', 'utf-8') as target:
        target.writelines(result)
def insert_chapter_marker(filename, safe_mode=False):
    # Insert a "@00 perek <n>" header line before every line that opens a
    # chapter — i.e. a line beginning with @22 + aleph followed by a space
    # or end-of-line.
    with codecs.open(filename, 'r', 'utf-8') as infile:
        lines = infile.readlines()
    count = 0
    new_lines = []
    for line in lines:
        if re.search(u'^@22\u05d0( |$)', line) is not None:
            count += 1
            # \u05e4\u05e8\u05e7 spells "perek"; the header is prepended to
            # the original line in a single list entry.
            new_lines.append(u'@00\u05e4\u05e8\u05e7 {}\n{}'.format(
                numToHeb(count), line))
        else:
            new_lines.append(line)
    if safe_mode:
        # safe_mode writes to a sibling .tmp file instead of overwriting.
        filename += '.tmp'
    with codecs.open(filename, 'w', 'utf-8') as outfile:
        outfile.writelines(new_lines)
def fix_file(filepath, start_siman, test_mode=False): output_list = [] with codecs.open(filepath, 'r', 'utf-8') as fp: lines = fp.readlines() counter = 0 for line in lines: match = re.match(u'^@11([\u05d0-\u05ea]{1,3})$', line) if match and getGematria(match.group(1)) == 1: output_list.append(u'@00{}\n'.format( numToHeb(counter + start_siman))) counter += 1 output_list.append(line) if test_mode: filepath = re.sub(ur'\.txt$', u'_test.txt', filepath) with codecs.open(filepath, 'w', 'utf-8') as fp: fp.writelines(output_list)
def map_semak_page_siman(smk_ja, to_print=True):
    '''
    create a dictionary from key: siman value: page(s) that the siman is on
    :param smk_ja: smk ja parsed according to simanim @22
    :return: dictionary. keys: siman (he letter), value: list of pages the
        siman spans over. (pages according to scan - starts on p. 21)
    '''
    siman_page = OrderedDict()  # Hebrew siman numeral -> list of scan pages
    page_count = 21  # scan pagination begins at p. 21
    # True when the previous segment ended exactly on a @77 page break,
    # i.e. the next siman starts at the top of a fresh page.
    start_page = False
    lst_seg = {'data': '', 'indices': []}  # the previously processed segment
    for seg in traverse_ja(smk_ja):
        # Each @77 token marks one page break inside this segment.
        for i, page in enumerate(re.finditer(u'@77', seg['data'])):
            page_count += 1
            try:
                siman_page[numToHeb(seg['indices'][0] + 1)].append(page_count)
            except KeyError:
                # First page entry for this siman.
                if not start_page:
                    # Siman began on the previous page and straddles the break.
                    siman_page[numToHeb(seg['indices'][0] + 1)] = [page_count - 1, page_count]
                    start_page = False
                else:
                    siman_page[numToHeb(seg['indices'][0] + 1)] = [page_count]
                if re.search(u'@77 ?$', lst_seg['data']):
                    # Previous segment ended right at a page break: its siman
                    # should not be credited with this page.
                    start_page = True
                    siman_page[numToHeb(lst_seg['indices'][0] + 1)].remove(page_count)
        if not list(re.finditer(u'@77', seg['data'])):
            # Segment with no page break: make sure its siman is registered
            # on the current page.
            try:
                siman_page[numToHeb(seg['indices'][0] + 1)]
            except KeyError:
                siman_page[numToHeb(seg['indices'][0] + 1)] = [page_count]
                if re.search(u'@77 ?$', lst_seg['data']):
                    start_page = True
                    try:
                        siman_page[numToHeb(lst_seg['indices'][0] + 1)].remove(page_count)
                    except ValueError:
                        # Page was never credited to the previous siman.
                        pass
        lst_seg = seg
    if to_print:
        for k in siman_page.keys():
            print k, siman_page[k]
    return siman_page
def map_semak_page_siman(smk_ja, to_print=True):
    '''
    create a dictionary from key: siman value: page(s) that the siman is on
    :param smk_ja: smk ja parsed according to simanim @22
    :return: dictionary. keys: siman (he letter), value: list of pages the
        siman spans over. (pages according to scan - starts on p. 21)
    '''
    siman_page = OrderedDict()  # Hebrew siman numeral -> list of scan pages
    page_count = 21  # scan pagination begins at p. 21
    # True when the previous segment ended exactly on a @77 page break,
    # meaning the next siman opens at the top of a fresh page.
    start_page = False
    lst_seg = {'data': '', 'indices': []}  # the previously processed segment
    for seg in traverse_ja(smk_ja):
        # Every @77 token is one page break inside this segment.
        for i, page in enumerate(re.finditer(u'@77', seg['data'])):
            page_count += 1
            try:
                siman_page[numToHeb(seg['indices'][0] + 1)].append(page_count)
            except KeyError:
                # First page recorded for this siman.
                if not start_page:
                    # Siman started on the previous page and spans the break.
                    siman_page[numToHeb(seg['indices'][0] + 1)] = [page_count - 1, page_count]
                    start_page = False
                else:
                    siman_page[numToHeb(seg['indices'][0] + 1)] = [page_count]
                if re.search(u'@77 ?$', lst_seg['data']):
                    # Previous segment ended at a page break: withdraw this
                    # page from its siman's list.
                    start_page = True
                    siman_page[numToHeb(lst_seg['indices'][0] + 1)].remove(page_count)
        if not list(re.finditer(u'@77', seg['data'])):
            # No page break in this segment: ensure its siman is registered
            # on the current page.
            try:
                siman_page[numToHeb(seg['indices'][0] + 1)]
            except KeyError:
                siman_page[numToHeb(seg['indices'][0] + 1)] = [page_count]
                if re.search(u'@77 ?$', lst_seg['data']):
                    start_page = True
                    try:
                        siman_page[numToHeb(lst_seg['indices'][0] + 1)].remove(page_count)
                    except ValueError:
                        # Page was never credited to the previous siman.
                        pass
        lst_seg = seg
    if to_print:
        for k in siman_page.keys():
            print k, siman_page[k]
    return siman_page
def hagahot_alignment(ja_smk, ja_raph, ja_hagahot): ja_smk = JaggedArray(ja_smk) ja_raph = JaggedArray(ja_raph) ja_hagahot = JaggedArray(ja_hagahot) # for i, seg_smk, j, seg_raph in zip(enumerate(ja_smk.array()), enumerate(ja_raph.array())): dict_lst = [] dict = {u'siman':[], u'smk':[], u'raph':[]} for i, seg in enumerate(zip(ja_smk.array(), ja_raph.array())): # print numToHeb(i+1) dict['siman'] = numToHeb(i+1) for i, smk_line in enumerate(seg[0]): hag_lett = re.findall(ur'@88\((?P<gim>[\u05d0-\u05ea]{1,3})\)', smk_line) if hag_lett: dict['smk'].extend([(hag_l, i+1) for hag_l in hag_lett]) # print [getGematria(lett) for lett in hag_lett] # print 'RAPH' for i, raph_line in enumerate(seg[1]): hag_lett = re.findall(ur'@88\((?P<gim>[\u05d0-\u05ea]{1,3})\)', raph_line) if hag_lett: dict['raph'].extend([(hag_l, i+1) for hag_l in hag_lett]) # print [getGematria(lett) for lett in hag_lett] dict_lst.append(dict) dict = {u'siman': [], u'smk': [], u'raph': []} return dict_lst
def raph_alignment_report(ja_smk, letter_ja):
    # Work-in-progress variant of the alignment report: pairs the raw
    # letter_ja entries directly with the @55 references collected from the
    # Smk, and writes a reduced CSV (no mismatch detection — that code is
    # commented out below).
    csv_lst = []
    lst_raph = []
    smk_siman = 0  # NOTE(review): assigned but never used
    smk_pages = map_semak_page_siman(ja_smk, to_print=False)
    # Collect each @55 reference with ~20 chars of context and its siman.
    for seg in traverse_ja(ja_smk):
        for raph_l_in_smk in re.finditer(u'@55([\u05d0-\u05ea]{1,3})', seg['data']):
            lst_raph.append((raph_l_in_smk.group(1),
                             seg['data'][raph_l_in_smk.span()[0] - 20: raph_l_in_smk.span()[1] + 20],
                             (seg['indices'][0] + 1)))
    raph_11 = []
    for raph in traverse_ja(letter_ja):
        raph_11.append(raph)  # re.search(u'@11([\u05d0-\u05ea]{1,3})', raph['data']).group(1))
    page = 21  # scan pagination starts on p. 21
    prob = 0  # NOTE(review): never incremented — the mismatch check is commented out
    i = 0
    for raph, smk_l in zip(letter_ja, lst_raph):  # zip(raph_11, lst_raph):
        # print re.search(u'@11([\u05d0-\u05ea]{1,3})', raph['data']).group(1), smk_l[0], numToHeb(smk_l[2])
        csv_dict = {u'smk letter': smk_l[0],
                    u'raph': raph[i],
                    u'siman': numToHeb(smk_l[2]),
                    u'aprx page in scan': smk_pages[numToHeb(smk_l[2])]}
        # u'raph letter': re.search(u'@11([\u05d0-\u05ea]{1,3})', raph['data']).group(1), u'raph line': raph['data']
        # u'smk words': smk_l[1],
        # NOTE(review): `i += 0` is a no-op, so raph[i] above is always
        # raph[0] — presumably `i += 1` was intended; confirm.
        i += 0
        if re.search(u'@77', smk_l[1]):
            page += 1
        # if re.search(u'@11([\u05d0-\u05ea]{1,3})', raph['data']).group(1) != smk_l[0]:
        #     prob += 1
        #     print "*"
        #     csv_dict['problem'] = True
        #     # break
        csv_lst.append(csv_dict)
    print 'prob', prob
    print 'done'
    toCSV(u'testcsvreport', csv_lst,
          [u'smk letter', u'raph', u'siman', u'aprx page in scan'])  # , u'problem', u'smk words',u'raph line',
    return csv_lst
def test_num_to_heb():
    # Expected values must be unicode literals: numToHeb is used with
    # u''-formatting throughout this codebase (i.e. it returns unicode),
    # and in Python 2 a UTF-8 byte-string literal never compares equal to
    # unicode, so the original plain-str expectations could not match.
    assert util.numToHeb(16) == u'טז'  # 15/16 use tet-vav, not yud forms
    assert util.numToHeb(962) == u'תתקסב'
def mark_simanim(volume_number):
    # Walk the raw text of one volume, counting @11 seif markers against the
    # expected @44 reference total of each siman (from the Orach Chaim XML),
    # and insert an @12 siman header wherever one siman ends and the next
    # begins. On any inconsistency, dump progress plus the untouched
    # remainder to temp_result.txt and stop.

    def transition(index_a, index_b):
        # True when the letter sequence resets to aleph (1) without being the
        # natural successor mod 22 — i.e. a new siman started.
        return (index_b - index_a) % 22 != 1 and index_b == 1

    def terminate():
        # Flush everything processed so far plus the unprocessed remainder.
        full_text.append(u'@12{}\n'.format(numToHeb(current_siman.num)))
        full_text.extend(current_siman_text)
        full_text.extend(lines[line_num:])
        with codecs.open('temp_result.txt', 'w', 'utf-8') as outfile:
            outfile.writelines(full_text)

    def get_next_siman(siman_list):
        # Skip simanim with zero @44 refs; may raise StopIteration.
        siman, total_refs = None, 0
        while total_refs == 0:
            siman = siman_list.next()
            total_refs = len(siman.locate_references(u'@44'))
        return siman, total_refs

    with codecs.open(filenames['part_{}'.format(volume_number)], 'r', 'utf-8') as infile:
        lines = infile.readlines()
    volume = Root('../../Orach_Chaim.xml').get_base_text().get_volume(1)
    simanim = iter(volume.get_child())
    current_siman, expected_refs = get_next_siman(simanim)
    full_text, current_siman_text = [], []
    count = 0
    seif_markers = (None, None)  # sliding pair of the last two seif ordinals
    for line_num, line in enumerate(lines):
        match = re.search(u'^@11([\u05d0-\u05ea])', line)
        if match:
            count += 1
            seif_markers = (seif_markers[1], he_ord(match.group(1)))
            if count - expected_refs == 1:
                # One ref past the expected total: the siman should have
                # rolled over — legal only if this marker is aleph.
                if match.group(1) == u'א':
                    full_text.append(u'@12{}\n'.format(numToHeb(current_siman.num)))
                    full_text.extend(current_siman_text)
                    current_siman_text = []
                    count = 1  # this marker already belongs to the new siman
                    try:
                        current_siman, expected_refs = get_next_siman(simanim)
                    except StopIteration:
                        print "Ran out of Simanim"
                        terminate()
                        return
                else:
                    print "Siman {}: Completed refs before transition occurred".format(current_siman.num)
                    terminate()
                    return
            elif None not in seif_markers and transition(*seif_markers):
                print "Siman {}: Transition occurred before completing refs".format(current_siman.num)
                terminate()
                return
        current_siman_text.append(line)
    # Normal end of file: flush the final siman.
    full_text.append(u'@12{}\n'.format(numToHeb(current_siman.num)))
    full_text.extend(current_siman_text)
    with codecs.open('temp_result.txt', 'w', 'utf-8') as outfile:
        outfile.writelines(full_text)
def terminate():
    """Flush all processed text plus the unprocessed remainder of the file
    to temp_result.txt (uses the enclosing scope's state)."""
    header_line = u'@12{}\n'.format(numToHeb(current_siman.num))
    remainder = lines[line_num:]
    full_text.append(header_line)
    full_text.extend(current_siman_text + remainder)
    with codecs.open('temp_result.txt', 'w', 'utf-8') as dump:
        dump.writelines(full_text)
sys.path.insert(0, p) from local_settings import * sys.path.insert(0, SEFARIA_PROJECT_PATH) os.environ['DJANGO_SETTINGS_MODULE'] = "local_settings" from data_utilities.util import numToHeb reload(sys) sys.setdefaultencoding("utf-8") def wikiGet(url, title): try: opener = urllib2.build_opener() opener.addheaders = [('User-agent', 'Mozilla/5.0')] page = opener.open(url) print "got", title with open("./pages/{}".format(title), "w") as file: file.write(page.read()) except: print "page doesn't exist", title for siman in range(1, 697): #696 simanim in O.C title = "Biur_Halacha." + str(siman) wikiGet( u"https://he.wikisource.org/w/index.php?title=ביאור_הלכה_על_אורח_חיים_%s&printable=yes" % (numToHeb(siman)), title)
def repl(m):
    """Regex-substitution callback: renumber a siman marker, closing the
    one-siman gap at/after `increment_start` (from the enclosing scope)."""
    siman_value = getGematria(m.group(1))
    siman_value -= 1 if siman_value >= increment_start else 0
    return u'@00{}'.format(numToHeb(siman_value))
def mark_simanim(volume_number):
    # Count @11 seif markers in one volume's raw text against each siman's
    # expected @44 reference total (taken from the Orach Chaim XML), writing
    # an @12 siman header at every siman boundary. Any inconsistency dumps
    # the progress plus the remaining lines to temp_result.txt and aborts.

    def transition(index_a, index_b):
        # True when the seif letter resets to aleph (1) without being the
        # natural mod-22 successor — i.e. a new siman has started.
        return (index_b - index_a) % 22 != 1 and index_b == 1

    def terminate():
        # Flush processed text plus the unprocessed remainder, then stop.
        full_text.append(u'@12{}\n'.format(numToHeb(current_siman.num)))
        full_text.extend(current_siman_text)
        full_text.extend(lines[line_num:])
        with codecs.open('temp_result.txt', 'w', 'utf-8') as outfile:
            outfile.writelines(full_text)

    def get_next_siman(siman_list):
        # Skip simanim with zero @44 refs; may raise StopIteration.
        siman, total_refs = None, 0
        while total_refs == 0:
            siman = siman_list.next()
            total_refs = len(siman.locate_references(u'@44'))
        return siman, total_refs

    with codecs.open(filenames['part_{}'.format(volume_number)], 'r', 'utf-8') as infile:
        lines = infile.readlines()
    volume = Root('../../Orach_Chaim.xml').get_base_text().get_volume(1)
    simanim = iter(volume.get_child())
    current_siman, expected_refs = get_next_siman(simanim)
    full_text, current_siman_text = [], []
    count = 0
    seif_markers = (None, None)  # sliding window of the last two seif ordinals
    for line_num, line in enumerate(lines):
        match = re.search(u'^@11([\u05d0-\u05ea])', line)
        if match:
            count += 1
            seif_markers = (seif_markers[1], he_ord(match.group(1)))
            if count - expected_refs == 1:
                # One past the expected total: a rollover is legal only if
                # this marker is aleph (a fresh siman's first seif).
                if match.group(1) == u'א':
                    full_text.append(u'@12{}\n'.format(
                        numToHeb(current_siman.num)))
                    full_text.extend(current_siman_text)
                    current_siman_text = []
                    count = 1  # the marker already belongs to the new siman
                    try:
                        current_siman, expected_refs = get_next_siman(simanim)
                    except StopIteration:
                        print "Ran out of Simanim"
                        terminate()
                        return
                else:
                    print "Siman {}: Completed refs before transition occurred".format(
                        current_siman.num)
                    terminate()
                    return
            elif None not in seif_markers and transition(*seif_markers):
                print "Siman {}: Transition occurred before completing refs".format(
                    current_siman.num)
                terminate()
                return
        current_siman_text.append(line)
    # Normal end of file: flush the final siman.
    full_text.append(u'@12{}\n'.format(numToHeb(current_siman.num)))
    full_text.extend(current_siman_text)
    with codecs.open('temp_result.txt', 'w', 'utf-8') as outfile:
        outfile.writelines(full_text)