def raph_alignment_report(ja_smk, letter_ja): csv_lst = [] lst_raph = [] smk_siman = 0 smk_pages = map_semak_page_siman(ja_smk, to_print=False) for seg in traverse_ja(ja_smk): for raph_l_in_smk in re.finditer(u'@55([\u05d0-\u05ea]{1,3})', seg['data']): lst_raph.append((raph_l_in_smk.group(1), seg['data'][raph_l_in_smk.span()[0] - 20: raph_l_in_smk.span()[1] + 20], (seg['indices'][0] + 1))) raph_11 = [] for raph in traverse_ja(letter_ja): raph_11.append(raph) # re.search(u'@11([\u05d0-\u05ea]{1,3})', raph['data']).group(1)) page = 21 prob = 0 for raph, smk_l in zip(raph_11, lst_raph): print re.search(u'@11([\u05d0-\u05ea]{1,3})', raph['data']).group(1), smk_l[0], numToHeb(smk_l[2]) csv_dict = {u'smk letter': smk_l[0], u'raph letter': re.search(u'@11([\u05d0-\u05ea]{1,3})', raph['data']).group(1), u'smk words': smk_l[1], u'raph line': raph['data'], u'siman': numToHeb(smk_l[2]), u'aprx page in scan': smk_pages[numToHeb(smk_l[2])]} if re.search(u'@77', smk_l[1]): page += 1 if re.search(u'@11([\u05d0-\u05ea]{1,3})', raph['data']).group(1) != smk_l[0]: prob += 1 print "*" csv_dict['problem'] = True # break csv_lst.append(csv_dict) print 'prob', prob print 'done' toCSV(u'testcsvreport', csv_lst, [u'smk letter', u'raph letter', u'smk words', u'raph line', u'siman', u'aprx page in scan', u'problem']) return csv_lst
def link_semak_raph(smk_ja, raph_ja): #if segment in smak_ja has a @55[\u05d0-\u05ea]{0,3} extract the letter and match it to the segment in the ja_raph #by running on the ja_raph segments smk_raph = [] raph_letter = [] for seg in traverse_ja(smk_ja): if re.search(u'@55[\u05d0-\u05ea]{0,3}', seg['data']): for letter in re.findall(u'@55([\u05d0-\u05ea]{0,3})', seg['data']): # smk_raph.append([seg['indices'][:], letter]) smk_raph.append([letter, seg['indices']]) last = [-1, -1] for seg in traverse_ja(raph_ja): if seg['indices'][0:2] == last[0:2]: continue else: raph_letter.append(seg) last = seg['indices'] problem_count = 0 for smk, raph in zip(smk_raph, raph_letter): if getGematria(smk[0]) == (raph['indices'][1]+1): print getGematria(smk[0]), raph['indices'][1]+1, \ [item+1 for item in smk[1]], [item +1 for item in raph['indices']] else: problem_count +=1 print 'problem:', getGematria(smk[0]), raph['indices'][1]+1,\ [item+1 for item in smk[1]], [item +1 for item in raph['indices']] print problem_count
def link_semak_raph(smk_ja, raph_ja): #if segment in smak_ja has a @55[\u05d0-\u05ea]{0,3} extract the letter and match it to the segment in the ja_raph #by running on the ja_raph segments smk_raph = [] raph_letter = [] for seg in traverse_ja(smk_ja): if re.search(u'@55[\u05d0-\u05ea]{0,3}', seg['data']): for letter in re.findall(u'@55([\u05d0-\u05ea]{0,3})', seg['data']): # smk_raph.append([seg['indices'][:], letter]) smk_raph.append([letter, seg['indices']]) last = [-1, -1] for seg in traverse_ja(raph_ja): if seg['indices'][0:2] == last[0:2]: continue else: raph_letter.append(seg) last = seg['indices'] problem_count = 0 for smk, raph in zip(smk_raph, raph_letter): if getGematria(smk[0]) == (raph['indices'][1] + 1): print getGematria(smk[0]), raph['indices'][1]+1, \ [item+1 for item in smk[1]], [item +1 for item in raph['indices']] else: problem_count += 1 print 'problem:', getGematria(smk[0]), raph['indices'][1]+1,\ [item+1 for item in smk[1]], [item +1 for item in raph['indices']] print problem_count
def link_raph(ja_smk, ja_raph_simanim):
    """
    Build one inline commentary link per ``@55`` marker found in the Semak.

    The n-th marker seen within a siman links the Semak segment holding it to
    comment n of that siman in Haggahot Rabbeinu Peretz; n restarts from 1
    whenever a marker belongs to a different siman than the previous marker.

    :param ja_smk: Semak jagged array, walked with traverse_ja.
    :param ja_raph_simanim: accepted but not read by this function.
    :return: list of link dicts.
    """
    collected = []
    order = 0
    current_siman = 1
    for seg in traverse_ja(ja_smk):
        for _ in re.finditer(u'@55', seg['data']):
            siman = seg['indices'][0] + 1
            if siman != current_siman:
                # first marker of another siman: restart the running order
                order, current_siman = 0, siman
            order += 1
            smk_ref = "Sefer Mitzvot Katan {}:{}".format(siman, seg['indices'][1] + 1)
            # really should be a ref link to the whole raph
            raph_ref = "Haggahot Rabbeinu Peretz on Sefer Mitzvot Katan {}:{}".format(siman, order)
            collected.append({
                "refs": [smk_ref, raph_ref],
                "type": "commentary",
                'inline_reference': {
                    'data-commentator': 'Haggahot Rabbeinu Peretz on Sefer Mitzvot Katan',
                    'data-order': order
                },
                "auto": True,
                "generated_by": "semak_parser"
            })
    return collected
def link_raavad(text_ja):
    """
    Create the link objects between each Raavad comment (dibur hamatchil) and
    the Sefer Yetzirah passage it comments on.

    :param text_ja: parsed Raavad text; traverse_ja must yield three-level
        indices for it (the refs are formatted with exactly three numbers).
    :return: list of link dicts, without a link for the final segment.
    """
    links = []
    # use a generator to go over the text and find the 3 level indices
    for dh in traverse_ja(text_ja):
        address = tuple(x + 1 for x in dh['indices'])
        links.append({
            "refs": [
                "Raavad on Sefer Yetzirah " + '%d:%d:%d' % address,
                "Sefer Yetzirah " + '%d:%d' % address[:2],
            ],
            "type": "commentary",
            "auto": True,
            "generated_by": "raavad_parse"
        })
    # shave off the last link: the closing "slik" line shouldn't be linked in.
    # Guarded so an empty text returns [] instead of raising IndexError.
    if links:
        links.pop()
    return links
def linker(parsed_commentary, commentator_name):
    """
    Build up a list of links for a text where the commentator follows the base text exactly
    :param parsed_commentary: parsed text to link
    :param commentator_name: Name commentator as appears on the JaggedArrayNode to be linked
    :return: list of links
    """
    def make_link(comment):
        # refs are 1-based; the base ref drops the last (comment) level
        address = [position + 1 for position in comment['indices']]
        base_ref = 'Sefer Yetzirah {}:{}'.format(*address[:-1])
        comment_ref = '{} {}:{}:{}'.format(commentator_name, *address)
        return {
            'refs': [base_ref, comment_ref],
            'type': 'commentary',
            'auto': True,
            'generated_by': 'Sefer Yetzirah Parse Script'
        }

    return [make_link(comment) for comment in traverse_ja(parsed_commentary)]
def test_traverse_ja():
    """Check that util.traverse_ja yields every leaf of a 2-level array with its indices, in order."""
    test_ja = [['foo', 'bar'], ['hello', 'world']]
    explicit_data = [
        {'data': 'foo', 'indices': [0, 0]},
        {'data': 'bar', 'indices': [0, 1]},
        {'data': 'hello', 'indices': [1, 0]},
        {'data': 'world', 'indices': [1, 1]}
    ]
    # Compare whole lists rather than zip()-ing the two sequences: zip stops at
    # the shorter one, so a traversal that yields too few items (or none at
    # all) would otherwise pass without a single assertion running.
    assert list(util.traverse_ja(test_ja)) == explicit_data
def build_table(old_ja):
    """
    Number the segments of old_ja in traversal order.

    :param old_ja: jagged array walked with traverse_ja; the first three
        indices of every segment are read.
    :return: list of [running number, index 0, index 1, index 2] rows, where
        the running number starts at 0.
    """
    return [
        [row_number, entry['indices'][0], entry['indices'][1], entry['indices'][2]]
        for row_number, entry in enumerate(traverse_ja(old_ja))
    ]
def build_links(parsed_data):
    """
    Link every parsed Tafsir Rasag segment to the verse it translates.

    :param parsed_data: dict keyed by the names returned from
        library.get_indexes_in_category('Torah'); values are jagged arrays.
    :return: list of 'targum' link dicts, one per segment.
    """
    generated = []
    for book in library.get_indexes_in_category('Torah'):
        for segment in traverse_ja(parsed_data[book]):
            address = [i + 1 for i in segment['indices']]  # refs are 1-based
            verse_ref = '{} {}:{}'.format(book, *address)
            generated.append({
                'refs': [verse_ref, 'Tafsir Rasag, {}'.format(verse_ref)],
                'type': 'targum',
                'auto': False,
                'generated_by': 'Tafsir Rasag Parse script'
            })
    return generated
def build_links(parsed):
    """
    Link every segment of the parsed Targum to its verse in Chronicles.

    :param parsed: sequence of jagged arrays; item 0 is linked as
        'I Chronicles', item 1 as 'II Chronicles', and so on.
    :return: list of 'targum' link dicts, one per segment.
    """
    links = []
    for book_num, book in enumerate(parsed):
        numeral = 'I' * (book_num + 1)
        for line in traverse_ja(book):
            address = [i + 1 for i in line['indices']]  # refs are 1-based
            verse_ref = '{} Chronicles {}:{}'.format(numeral, *address)
            links.append({
                'refs': [verse_ref, 'Aramaic Targum to {}'.format(verse_ref)],
                'type': 'targum',
                'auto': False,
                'generated_by': 'Chronicles parse script'
            })
    return links
def raph_alignment_report(ja_smk, letter_ja): csv_lst = [] lst_raph = [] smk_siman = 0 smk_pages = map_semak_page_siman(ja_smk, to_print=False) for seg in traverse_ja(ja_smk): for raph_l_in_smk in re.finditer(u'@55([\u05d0-\u05ea]{1,3})', seg['data']): lst_raph.append((raph_l_in_smk.group(1), seg['data'][raph_l_in_smk.span()[0] - 20: raph_l_in_smk.span()[1] + 20], (seg['indices'][0] + 1))) raph_11 = [] for raph in traverse_ja(letter_ja): raph_11.append(raph) # re.search(u'@11([\u05d0-\u05ea]{1,3})', raph['data']).group(1)) page = 21 prob = 0 i = 0 for raph, smk_l in zip(letter_ja, lst_raph): # zip(raph_11, lst_raph): # print re.search(u'@11([\u05d0-\u05ea]{1,3})', raph['data']).group(1), smk_l[0], numToHeb(smk_l[2]) csv_dict = {u'smk letter': smk_l[0], u'raph': raph[i], u'siman': numToHeb(smk_l[2]), u'aprx page in scan': smk_pages[numToHeb(smk_l[2])]} # u'raph letter': re.search(u'@11([\u05d0-\u05ea]{1,3})', raph['data']).group(1), u'raph line': raph['data'] # u'smk words': smk_l[1], i += 0 if re.search(u'@77', smk_l[1]): page += 1 # if re.search(u'@11([\u05d0-\u05ea]{1,3})', raph['data']).group(1) != smk_l[0]: # prob += 1 # print "*" # csv_dict['problem'] = True # # break csv_lst.append(csv_dict) print 'prob', prob print 'done' toCSV(u'testcsvreport', csv_lst, [u'smk letter', u'raph', u'siman', u'aprx page in scan']) #, u'problem', u'smk words',u'raph line', return csv_lst
def build_links(parsed_data):
    """
    Produce the 'targum' links between Tafsir Rasag and the Torah.

    :param parsed_data: dict keyed by the names returned from
        library.get_indexes_in_category('Torah'); values are jagged arrays.
    :return: list of link dicts, one per traversed segment.
    """
    def to_link(base):
        return {
            'refs': [base, 'Tafsir Rasag, {}'.format(base)],
            'type': 'targum',
            'auto': False,
            'generated_by': 'Tafsir Rasag Parse script'
        }

    result = []
    for book in library.get_indexes_in_category('Torah'):
        for segment in traverse_ja(parsed_data[book]):
            one_based = [i + 1 for i in segment['indices']]
            result.append(to_link('{} {}:{}'.format(book, *one_based)))
    return result
def linker(dict_of_ja):
    """
    Link every Baal HaTurim comment to the verse it comments on.

    :param dict_of_ja: dict keyed by the names returned from
        library.get_indexes_in_category('Torah'); values are jagged arrays.
    :return: list of 'commentary' link dicts, one per traversed segment.
    """
    links = []
    for book in library.get_indexes_in_category('Torah'):
        for segment in util.traverse_ja(dict_of_ja[book]):
            comment_address = [x + 1 for x in segment['indices']]
            # the verse ref uses every index level except the last
            verse_address = [x + 1 for x in segment['indices'][:-1]]
            links.append({
                'refs': [
                    u'{}.{}.{}'.format(book, *verse_address),
                    u'Baal HaTurim, {}.{}.{}.{}'.format(book, *comment_address)
                ],
                'type': 'commentary',
                'auto': False,
                'generated_by': 'Baal HaTurim parse script'
            })
    return links
def map_semak_page_siman(smk_ja, to_print=True):
    '''
    Create a dictionary mapping each siman to the scan page(s) it is on.

    Every '@77' found in a segment's text advances the running page counter.

    :param smk_ja: smk ja parsed according to simanim @22
    :param to_print: when true, print every siman with its page list before returning
    :return: OrderedDict. keys: siman (he letter, as produced by numToHeb),
        value: list of pages the siman spans over.
        (pages according to scan - starts on p. 21)
    '''
    siman_page = OrderedDict()
    # the scan's page numbering starts at 21 (see docstring)
    page_count = 21
    # set once the previous segment was found to end with '@77'
    start_page = False
    # previous segment; the placeholder has empty text so the first '@77 ?$' test fails
    lst_seg = {'data': '', 'indices': []}
    for seg in traverse_ja(smk_ja):
        # one pass per '@77' in this segment (i and page themselves are not used)
        for i, page in enumerate(re.finditer(u'@77', seg['data'])):
            page_count += 1
            try:
                # siman already recorded: it simply extends onto the new page
                siman_page[numToHeb(seg['indices'][0] + 1)].append(page_count)
            except KeyError:
                # first time this siman is seen
                if not start_page:
                    siman_page[numToHeb(seg['indices'][0] + 1)] = [page_count - 1, page_count]
                    start_page = False
                else:
                    siman_page[numToHeb(seg['indices'][0] + 1)] = [page_count]
                # NOTE(review): nesting of this check under the KeyError branch is
                # assumed - confirm. Unlike the branch further down, remove() is
                # not guarded here and raises ValueError when page_count is absent.
                if re.search(u'@77 ?$', lst_seg['data']):
                    start_page = True
                    siman_page[numToHeb(lst_seg['indices'][0] + 1)].remove(page_count)
        # segment without any '@77'
        if not list(re.finditer(u'@77', seg['data'])):
            try:
                # bare lookup: only probes whether the siman already has an entry
                siman_page[numToHeb(seg['indices'][0] + 1)]
            except KeyError:
                siman_page[numToHeb(seg['indices'][0] + 1)] = [page_count]
                # NOTE(review): nesting under the KeyError branch is assumed - confirm.
                # previous segment ended with '@77': take the current page off
                # the previous segment's siman, if it is listed there
                if re.search(u'@77 ?$', lst_seg['data']):
                    start_page = True
                    try:
                        siman_page[numToHeb(lst_seg['indices'][0] + 1)].remove(page_count)
                    except ValueError:
                        pass
        lst_seg = seg
    if to_print:
        for k in siman_page.keys():
            print k, siman_page[k]
    return siman_page
def linker(parsed_commentary, commentator_name):
    """
    Build up a list of links for a text where the commentator follows the base text exactly
    :param parsed_commentary: parsed text to link
    :param commentator_name: Name commentator as appears on the JaggedArrayNode to be linked
    :return: list of links
    """
    result = []
    for comment in traverse_ja(parsed_commentary):
        one_based = [n + 1 for n in comment['indices']]
        # the base text ref omits the final (comment) level
        base = 'Sefer Yetzirah {}:{}'.format(*one_based[:-1])
        commentary = '{} {}:{}:{}'.format(commentator_name, *one_based)
        link = {
            'refs': [base, commentary],
            'type': 'commentary',
            'auto': True,
            'generated_by': 'Sefer Yetzirah Parse Script'
        }
        result.append(link)
    return result
def map_semak_page_siman(smk_ja, to_print = True):
    '''
    Create a dictionary mapping each siman to the scan page(s) it is on.

    Every '@77' found in a segment's text advances the running page counter.

    :param smk_ja: smk ja parsed according to simanim @22
    :param to_print: when true, print every siman with its page list before returning
    :return: OrderedDict. keys: siman (he letter, as produced by numToHeb),
        value: list of pages the siman spans over.
        (pages according to scan - starts on p. 21)
    '''
    siman_page = OrderedDict()
    # the scan's page numbering starts at 21 (see docstring)
    page_count = 21
    # set once the previous segment was found to end with '@77'
    start_page = False
    # previous segment; the placeholder has empty text so the first '@77 ?$' test fails
    lst_seg = {'data': '', 'indices': []}
    for seg in traverse_ja(smk_ja):
        # one pass per '@77' in this segment (i and page themselves are not used)
        for i, page in enumerate(re.finditer(u'@77', seg['data'])):
            page_count += 1
            try:
                # siman already recorded: it simply extends onto the new page
                siman_page[numToHeb(seg['indices'][0]+1)].append(page_count)
            except KeyError:
                # first time this siman is seen
                if not start_page:
                    siman_page[numToHeb(seg['indices'][0] + 1)] = [page_count - 1, page_count]
                    start_page = False
                else:
                    siman_page[numToHeb(seg['indices'][0]+1)] = [page_count]
                # NOTE(review): nesting of this check under the KeyError branch is
                # assumed - confirm. Unlike the branch further down, remove() is
                # not guarded here and raises ValueError when page_count is absent.
                if re.search(u'@77 ?$', lst_seg['data']):
                    start_page = True
                    siman_page[numToHeb(lst_seg['indices'][0] + 1)].remove(page_count)
        # segment without any '@77'
        if not list(re.finditer(u'@77', seg['data'])):
            try:
                # bare lookup: only probes whether the siman already has an entry
                siman_page[numToHeb(seg['indices'][0]+1)]
            except KeyError:
                siman_page[numToHeb(seg['indices'][0] + 1)] = [page_count]
                # NOTE(review): nesting under the KeyError branch is assumed - confirm.
                # previous segment ended with '@77': take the current page off
                # the previous segment's siman, if it is listed there
                if re.search(u'@77 ?$', lst_seg['data']):
                    start_page = True
                    try:
                        siman_page[numToHeb(lst_seg['indices'][0] + 1)].remove(page_count)
                    except ValueError:
                        pass
        lst_seg = seg
    if to_print:
        for k in siman_page.keys():
            print k, siman_page[k]
    return siman_page
def link_bs(text_dict):
    """
    Link every Bekhor Shor comment to the verse it comments on.

    :param text_dict: dict of jagged arrays. The book name is taken from each
        key: the text up to the first whitespace character.
    :return: list of 'commentary' link dicts, one per traversed segment.
    """
    links = []
    for text in text_dict:
        book = re.match(u'(.*?)\s', text).group().strip()
        for dh in traverse_ja(text_dict[text]):
            # 1-based chapter, verse and comment numbers
            perek, pasuk, comment = [dh['indices'][level] + 1 for level in range(3)]
            commentary_ref = 'Bekhor Shor, {} {}:{}:{}'.format(book, perek, pasuk, comment)
            verse_ref = '{} {}:{}'.format(book, perek, pasuk)
            links.append({
                "refs": [commentary_ref, verse_ref],
                "type": "commentary",
                "auto": True,
                "generated_by": "bekhor_shor_parser"
            })
    return links
def link_raavad(text_ja):
    """
    Create the link objects between each Raavad comment (dibur hamatchil) and
    the Sefer Yetzirah passage it comments on.

    :param text_ja: parsed Raavad text; traverse_ja must yield three-level
        indices for it (the refs are formatted with exactly three numbers).
    :return: list of link dicts, without a link for the final segment.
    """
    links = []
    # use a generator to go over the text and find the 3 level indices
    for dh in traverse_ja(text_ja):
        one_based = tuple(x + 1 for x in dh['indices'])
        raavad_ref = "Raavad on Sefer Yetzirah " + '%d:%d:%d' % one_based
        base_ref = "Sefer Yetzirah " + '%d:%d' % one_based[:2]
        links.append({
            "refs": [raavad_ref, base_ref],
            "type": "commentary",
            "auto": True,
            "generated_by": "raavad_parse"
        })
    # The last segment is the closing "slik" line and shouldn't be linked in.
    # Only pop when something was collected: an empty text used to raise
    # IndexError here.
    if links:
        links.pop()
    return links
def link_raph(ja_smk, ja_raph_simanim ):
    """
    Create an inline commentary link for each ``@55`` marker in the Semak.

    Markers are numbered 1, 2, 3... within a siman; the numbering restarts
    when a marker falls in a different siman than the marker before it. The
    link joins the Semak segment containing the marker with the equally
    numbered comment of Haggahot Rabbeinu Peretz on that siman.

    :param ja_smk: Semak jagged array, walked with traverse_ja.
    :param ja_raph_simanim: accepted but not read by this function.
    :return: list of link dicts.
    """
    commentator = 'Haggahot Rabbeinu Peretz on Sefer Mitzvot Katan'
    links = []
    counter = 0
    last_siman = 1
    for seg in traverse_ja(ja_smk):
        marker_count = len(re.findall(u'@55', seg['data']))
        for _ in range(marker_count):
            siman = seg['indices'][0] + 1
            if siman != last_siman:
                counter = 0
                last_siman = siman
            segment = seg['indices'][1] + 1
            counter += 1
            links.append({
                "refs": [
                    "Sefer Mitzvot Katan {}:{}".format(siman, segment),
                    # really should be a ref link to the whole raph
                    "Haggahot Rabbeinu Peretz on Sefer Mitzvot Katan {}:{}".format(siman, counter),
                ],
                "type": "commentary",
                'inline_reference': {
                    'data-commentator': commentator,
                    'data-order': counter
                },
                "auto": True,
                "generated_by": "semak_parser"
            })
    return links
def links(text_dict):
    """
    Link every Tur HaAroch comment to the verse it comments on.

    :param text_dict: dict mapping a book name to its parsed jagged array.
    :return: list of 'commentary' link dicts, one per traversed segment.
    """
    collected = []
    for book in text_dict:
        for dh in traverse_ja(text_dict[book]):
            # 1-based chapter, verse and comment numbers
            perek = dh['indices'][0] + 1
            pasuk = dh['indices'][1] + 1
            comment = dh['indices'][2] + 1
            commentary_ref = 'Tur HaAroch, {} {}:{}:{}'.format(book, perek, pasuk, comment)
            verse_ref = '{} {}:{}'.format(book, perek, pasuk)
            collected.append({
                "refs": [commentary_ref, verse_ref],
                "type": "commentary",
                "auto": True,
                "generated_by": "tur_torah_parser"
            })
    return collected
# Index record for the Gra's commentary; it is posted before the text itself.
index_dict = {
    'title': 'HaGra on Sefer Yetzirah Gra Version',
    'categories': ['Commentary2','Kabbalah','Gra'],
    'schema': schema.serialize()  # This line converts the schema into json
}
post_index(index_dict)
post_text('HaGra on Sefer Yetzirah Gra Version', text_version, index_count='on')
# post with the post function
post_this()
# create the link objects between the dibur hamatchil of the GRA and the main text
gra_links = []
# use a generator to go over the text and find the 3 level indices
for dh in traverse_ja(gra):
    # refs are 1-based: the commentary ref uses all three index levels,
    # the base-text ref only the first two
    link = (
        {
        "refs": [
            "HaGra on Sefer Yetzirah Gra Version " + '%d:%d:%d' %tuple(x+1 for x in dh['indices']),
            "Sefer Yetzirah Gra Version " + '%d:%d' %tuple(x+1 for x in dh['indices'][:2]),
        ],
        "type": "commentary",
        "auto": True,
        "generated_by": "gra_parse"
    })
    # NOTE(review): dh_text is not read anywhere in the visible part of the script
    dh_text = dh['data']
    # append to links list
    gra_links.append(link)
# shave off the last link: the closing "slik" line shouldn't be linked in
def generate_links(parsed_data, link_filename='fixed_links.xml', error_file='errors.csv'):
    """
    Using an xml of data from daat and parsed text, generate all links
    :param parsed_data: Dictionary keys are books of Torah, values are parsed text into ja.
    :param link_filename: Filename of xml file that holds link data.
    :param error_file: Filename of csv file which contains all comments that could not be linked.
    :return: List of link objects
    """
    links = []
    errors = []
    root = ET.parse(link_filename).getroot()

    for book in library.get_indexes_in_category('Torah'):
        book_element = root.find(book)

        for comment in util.traverse_ja(parsed_data[book], bottom=basestring):
            chapter = comment['indices'][0]
            verse = comment['indices'][1]

            # get the verse from the xml
            verse_path = "./chapter[@chap_index='{}']/verse[@verse_index='{}']".format(chapter + 1, verse + 1)
            verse_element = book_element.find(verse_path)

            rashis = Ref('Rashi on {}.{}.{}'.format(book, chapter + 1, verse + 1))
            total_rashis = len(rashis.all_subrefs())

            linkable = verse_element is not None
            if linkable:
                # compare number of Rashis on daat and sefaria.
                # If only one Rashi exists, a link can be made regardless
                if total_rashis != int(verse_element.find('total_rashis').text) and total_rashis != 1:
                    linkable = False
                # compare number of siftei chakhmim on daat and Torat Emet
                elif len(parsed_data[book][chapter][verse]) != len(verse_element.findall('comment')):
                    linkable = False

            one_based = [x + 1 for x in comment['indices']]

            if not linkable:
                # remember the comment so it can be reviewed by hand
                url = 'draft.sefaria.org/Siftei_Hakhamim,_{}.{}.{}.{}'.format(book, *one_based)
                errors.append([book] + one_based + [url])
                continue

            # grab the exact Rashi comment number to link to
            comment_element = verse_element.findall('comment')[comment['indices'][2]]
            if total_rashis == 1:
                rashi_value = 1
            else:
                rashi_value = int(comment_element.attrib['rashi_comment'])

            # build the link object
            links.append({
                'refs': [
                    u'Siftei Hakhamim, {}.{}.{}.{}'.format(book, *one_based),
                    u'Rashi on {}.{}.{}.{}'.format(book, chapter + 1, verse + 1, rashi_value)
                ],
                'type': 'commentary',
                'auto': False,
                'generated_by': 'Siftei Hakhamim parse script'
            })

    # write errors to csv file
    with open(error_file, 'w') as outfile:
        writer = csv.writer(outfile, delimiter=';')
        writer.writerow(['Book', 'Chapter', 'Verse', 'Comment', 'url'])
        writer.writerows(errors)

    return links
soup = soupAndOpen("./pages/%s" % (filename)) if siman_num is 3 or siman_num is 4 or siman_num is 7: #siman numbers that did not conform to be able to parse print "outlier", siman_num outlierParse(soup, siman_num) else: print "regular", siman_num regularParse(soup, siman_num) ja_to_xml(simanim_ja.array(), ["siman", "seif", "comment"]) links = [] for comment in traverse_ja(simanim_ja.array()): links.append({ 'refs': [ 'Shulchan_Arukh, Orach_Chayim.{}.{}'.format( comment['indices'][0] - 1, comment['indices'][1] - 1), 'Biur Halacha.{}.{}.{}'.format( *[i - 1 for i in comment['indices']]) ], 'type': 'commentary', 'auto': True, 'generated_by': 'Biur Halacha linker' })
def generate_links(parsed_data, link_filename='fixed_links.xml', error_file='errors.csv'):
    """
    Using an xml of data from daat and parsed text, generate all links
    :param parsed_data: Dictionary keys are books of Torah, values are parsed text into ja.
    :param link_filename: Filename of xml file that holds link data.
    :param error_file: Filename of csv file which contains all comments that could not be linked.
    :return: List of link objects
    """
    good_links = []
    unlinked = []
    xml_root = ET.parse(link_filename).getroot()

    for book in library.get_indexes_in_category('Torah'):
        book_element = xml_root.find(book)

        for comment in util.traverse_ja(parsed_data[book], bottom=basestring):
            chapter = comment['indices'][0]
            verse = comment['indices'][1]

            # get the verse from the xml
            verse_element = book_element.find(
                "./chapter[@chap_index='{}']/verse[@verse_index='{}']".format(chapter + 1, verse + 1))

            rashis = Ref('Rashi on {}.{}.{}'.format(book, chapter + 1, verse + 1))
            total_rashis = len(rashis.all_subrefs())

            can_link = verse_element is not None
            if can_link:
                # compare number of Rashis on daat and sefaria.
                # If only one Rashi exists, a link can be made regardless
                if total_rashis != int(verse_element.find('total_rashis').text) and total_rashis != 1:
                    can_link = False
                # compare number of siftei chakhmim on daat and Torat Emet
                elif len(parsed_data[book][chapter][verse]) != len(verse_element.findall('comment')):
                    can_link = False

            address = [x + 1 for x in comment['indices']]

            if can_link:
                # grab the exact Rashi comment number to link to
                comment_element = verse_element.findall('comment')[comment['indices'][2]]
                if total_rashis == 1:
                    rashi_value = 1
                else:
                    rashi_value = int(comment_element.attrib['rashi_comment'])

                siftei_ref = u'Siftei Hakhamim, {}.{}.{}.{}'.format(book, *address)
                rashi_ref = u'Rashi on {}.{}.{}.{}'.format(book, chapter + 1, verse + 1, rashi_value)

                # build the link object
                good_links.append({
                    'refs': [siftei_ref, rashi_ref],
                    'type': 'commentary',
                    'auto': False,
                    'generated_by': 'Siftei Hakhamim parse script'
                })
            else:
                # keep a row describing the comment that could not be linked
                url = 'draft.sefaria.org/Siftei_Hakhamim,_{}.{}.{}.{}'.format(book, *address)
                unlinked.append([book] + address + [url])

    # write errors to csv file
    with open(error_file, 'w') as outfile:
        writer = csv.writer(outfile, delimiter=';')
        writer.writerow(['Book', 'Chapter', 'Verse', 'Comment', 'url'])
        writer.writerows(unlinked)

    return good_links
'schema': schema.serialize() # This line converts the schema into json } post_index(index_dict) post_text('HaGra on Sefer Yetzirah Gra Version', text_version, index_count='on') # post with the post function post_this() # create the link objects btween the dibur HaMatchil of the GRA and the main text gra_links = [] # use a generator to go over the text and find the 3 level indices for dh in traverse_ja(gra): link = ({ "refs": [ "HaGra on Sefer Yetzirah Gra Version " + '%d:%d:%d' % tuple(x + 1 for x in dh['indices']), "Sefer Yetzirah Gra Version " + '%d:%d' % tuple(x + 1 for x in dh['indices'][:2]), ], "type": "commentary", "auto": True, "generated_by": "gra_parse" }) dh_text = dh['data']