def count_foot(books, regexes):
    """
    Count the chapters in which the given regexes disagree on their number of matches.

    Every chapter is flattened into one string (footnotes may span several lines and verses) and each
    regex is counted against it. A chapter is reported, and counted once, when any regex yields a
    different number of matches than the first regex in the list.

    :param books: dictionary containing entire jps 1985 translation, indexed as
        books[<book_name>][chapter_index][verse_index]
    :param regexes: list of regular expressions to compare
    :return: number of chapters in which the match counts of the regexes are not all equal
    """
    # with nothing to compare no chapter can disagree (this used to crash on counts[0])
    if not regexes:
        return 0

    all_books = library.get_indexes_in_category('Tanach')
    total = 0

    # line breaks are removed so that a footnote split across lines can still be matched
    replacements = {
        u'\n': u'',
        u'\r': u'',
        u'\n\r': u'',
    }

    for book in all_books:
        for num, chapter in enumerate(books[book]):
            # make chapter into single string (to account for footnotes spanning multiple lines and verses)
            text = ' '.join([functions.multiple_replace(verse, replacements) for verse in chapter])

            counts = [len(re.findall(reg, text)) for reg in regexes]

            # the first regex sets the reference count; report the first regex that deviates from it
            canon = counts[0]
            for index, count in enumerate(counts):
                if count != canon:
                    # single-argument print() behaves identically on Python 2 and 3
                    print('{}: {}; {}'.format(book, num + 1, index))
                    total += 1
                    break

    return total
def count_foot(books, regexes):
    """
    Count the chapters in which the given regexes disagree on their number of matches.

    Every chapter is flattened into one string (footnotes may span several lines and verses) and each
    regex is counted against it. A chapter is reported, and counted once, when any regex yields a
    different number of matches than the first regex in the list.

    :param books: dictionary containing entire jps 1985 translation, indexed as
        books[<book_name>][chapter_index][verse_index]
    :param regexes: list of regular expressions to compare
    :return: number of chapters in which the match counts of the regexes are not all equal
    """
    # with nothing to compare no chapter can disagree (this used to crash on counts[0])
    if not regexes:
        return 0

    all_books = library.get_indexes_in_category('Tanach')
    total = 0

    # line breaks are removed so that a footnote split across lines can still be matched
    replacements = {
        u'\n': u'',
        u'\r': u'',
        u'\n\r': u'',
    }

    for book in all_books:
        for num, chapter in enumerate(books[book]):
            # make chapter into single string (to account for footnotes spanning multiple lines and verses)
            text = ' '.join([functions.multiple_replace(verse, replacements) for verse in chapter])

            counts = [len(re.findall(reg, text)) for reg in regexes]

            # the first regex sets the reference count; report the first regex that deviates from it
            canon = counts[0]
            for index, count in enumerate(counts):
                if count != canon:
                    # single-argument print() behaves identically on Python 2 and 3
                    print('{}: {}; {}'.format(book, num + 1, index))
                    total += 1
                    break

    return total
def footnote_linker(jps, jps_footnotes):
    """
    Create a list of link objects with the anchorText field set to the corresponding words in the verse.

    For every verse a footnote links to, three marker layouts are handled:
      * enclosed text inside one verse, ``[a]-some words-[a]``: the enclosed words become the anchor;
      * enclosed text that opens in this verse and closes in the following verse of the same chapter;
      * a plain ``[a]`` marker: the word preceding the marker becomes the anchor.
    Once the corresponding text has been found, the jps text is edited so that the footnote mark is
    replaced with an <i> tag. Trailing footnote markers at the end of an enclosed text fragment are
    scrubbed. Links whose marker cannot be found in the verse are written to footnote_errors.txt and
    produce no link object.

    :param jps: jps Tanach data structure - jps[<book_name>][chapter_index][verse_index]
    :param jps_footnotes: jps footnotes data struct - footnotes[<book_name>][chap_index][footnote]. Note that
        this returns a dictionary with the keys [footnote], which gives the footnote text, and [links], which
        give the verses to which the footnote needs to link to.
    :return: A list of link objects. This function will edit jps to replace inline footnote markers with
        <i> tags, as well as remove the beginning footnote characters used to match the note to printed text.
    """
    # get books of Tanach
    books = library.get_indexes_in_category('Tanach')
    links = []

    # the context manager closes the error file even if a lookup below raises
    with codecs.open('footnote_errors.txt', 'w', 'utf-8') as errors:
        for book in books:
            for chap_num, chapter in enumerate(jps_footnotes[book]):
                for index, note in enumerate(chapter):

                    # get tag to identify footnote in main text. Account for case where tag is 'aa'.
                    if note['footnote'].startswith(u'aa'):
                        tag = u'aa'
                    else:
                        tag = u'{}'.format(note['footnote'][0])

                    # first word of footnote is the tag. That data has been saved - now strip from the footnote
                    note['footnote'] = u' '.join(note['footnote'].split()[1:])

                    bare_marker = u'[{}]'.format(tag)
                    open_marker = u'[{}]-'.format(tag)
                    close_marker = u'-[{}]'.format(tag)
                    # length of an opening or closing marker: the tag plus '[', ']' and '-'
                    marker_len = len(tag) + 3

                    # non-greedy so that only the text up to the first closing marker is captured
                    enclosed_reg = re.compile(u'\\[{0}\\]-.*?-\\[{0}\\]'.format(re.escape(tag)))

                    # <i> tag to replace footnote marker in text: a -> 1 ... z -> 26, aa -> 27
                    if tag == u'aa':
                        data_order = 27
                    else:
                        data_order = ord(tag) - 96
                    itag = u'<i data-commentator="JPS 1985 Footnotes" data-order="{}"></i>'.format(data_order)

                    for link in note['links']:
                        # get verse in main text (links are 1-based verse numbers)
                        try:
                            verse = jps[book][chap_num][link - 1]
                        except IndexError:
                            print(u'{},{},{}'.format(book, chap_num + 1, link))
                            continue

                        anchor = u''
                        # becomes True once one of the enclosed-text layouts has been resolved
                        handled = False

                        enclosed = enclosed_reg.search(verse)
                        if enclosed:
                            # catch enclosed text, dropping the markers on both sides
                            anchor = enclosed.group()[marker_len:-marker_len]

                            # replace leading footnote tags with <i> tag, strip out trailing tag
                            replace = {open_marker: itag, close_marker: u''}
                            cleaned = functions.multiple_replace(verse, replace)

                            # sanity check - scrub out any remaining footnote tags
                            jps[book][chap_num][link - 1] = cleaned.replace(bare_marker, u'')
                            handled = True

                        elif open_marker in verse:
                            # check if anomaly can be resolved by looking at the next verse; the last
                            # verse of a chapter has none, in which case the default handling applies
                            try:
                                next_verse = jps[book][chap_num][link]
                            except IndexError:
                                next_verse = None

                            if next_verse is not None:
                                enclosed = enclosed_reg.search(u' '.join([verse, next_verse]))
                                if enclosed:
                                    anchor = enclosed.group()[marker_len:-marker_len]
                                    opened = verse.replace(open_marker, itag)
                                    jps[book][chap_num][link] = next_verse.replace(close_marker, u'')
                                    jps[book][chap_num][link - 1] = opened.replace(bare_marker, u'')
                                    handled = True

                        if not handled:
                            # sanity check, make sure tag is in verse
                            position = verse.find(bare_marker)
                            if position == -1:
                                errors.write(u'tag not found\n')
                                errors.write(u'{}, {}, {}, {}\n'.format(book, chap_num + 1, link, note['footnote']))
                                continue

                            # replace footnote tag in main text
                            jps[book][chap_num][link - 1] = verse.replace(bare_marker, itag)

                            # the anchor is the word preceding the first marker, if there is one
                            words = verse[:position].split()
                            anchor = words[-1] if words else u''

                        # create link object
                        links.append({
                            'refs': [u'{}.{}.{}'.format(book, chap_num + 1, link),
                                     u'JPS 1985 Footnotes, {}.{}.{}'.format(book, chap_num + 1, index + 1)],
                            'type': 'commentary',
                            'auto': True,
                            'generated_by': 'JPS parse script',
                            'anchorText': anchor,
                        })

    return links
def align_footnotes(books):
    """
    Structure the footnotes by book and chapter. Each footnote may refer to multiple verses.

    JPS1985_footnotes.txt is read sequentially, one footnote per line, each line beginning with the
    footnote's tag. A footnote tagged "a" (but not "aa") is taken as the first footnote of a new
    chapter. For every footnote the chapter is searched for verses containing the marker ``[<tag>]``.
    If a footnote line is too short to decide whether it opens a new chapter, or the pending footnote
    can neither be placed in the current chapter nor opens a new one, diagnostics are printed and the
    script exits with status 1.

    :param books: Dictionary, containing the entire JPS 1985 translation
    :return: Dictionary, with books as keys and chapters as values. Each chapter is a list of
        dictionaries, with the key "footnote" set to the footnote and the key "links" being a list of
        verses (1-based) where the footnote appears.
    """
    jps_footnotes = {}

    # define replacement dictionary
    replacements = {
        u'@': u'\u1e63',
        u'h%': u'\u1e25',
        u'H%': u'\u1e24',
        u'\n': u'',
        u'\r': u'',
        u'\n\r': u'',
    }

    # get list of book in tanach
    all_books = library.get_indexes_in_category('Tanach')

    # the context manager closes the footnote file on every exit path, including sys.exit()
    with codecs.open('JPS1985_footnotes.txt', 'r', 'utf-8') as input_file:

        def next_note():
            # readline() returns u'' only at end of file. Lines that are empty after the replacements
            # carry no footnote and are skipped, so an empty footnote always means end of file.
            while True:
                raw = input_file.readline()
                if raw == u'':
                    return {'footnote': u''}
                text = functions.multiple_replace(raw, replacements)
                if text != u'':
                    return {'footnote': text}

        def abort(book, chap_num, note):
            # report the footnote that could not be processed and stop the script
            print('error')
            print(note['footnote'])
            print(u'{}, chapter {}'.format(book, chap_num + 1))
            sys.exit(1)

        # retrieve first footnote
        note = next_note()

        for book in all_books:
            # dictionary with keys set to chapter indices
            footnote_chaps = {}

            for chap_num, chapter in enumerate(books[book]):
                # set once any footnote marker has been found in this chapter; from then on every
                # footnote up to the next "a" footnote is attributed to this chapter
                found_note = False
                chap_notes = []

                while True:
                    # footnote file exhausted: the remaining chapters have no footnotes
                    if note['footnote'] == u'':
                        footnote_chaps[chap_num] = chap_notes
                        break

                    # account for the case where a footnote is tagged "aa"
                    if note['footnote'].startswith(u'aa'):
                        tag = u'aa'
                    else:
                        tag = note['footnote'][0]
                    marker = u'[{}]'.format(tag)

                    found = [verse_num + 1 for verse_num, verse in enumerate(chapter) if marker in verse]
                    if found:
                        found_note = True
                    note['links'] = found

                    # if footnote markers were found, store the footnote and get the next one
                    if found_note:
                        chap_notes.append(note)
                        note = next_note()

                    if note['footnote'] == u'':
                        footnote_chaps[chap_num] = chap_notes
                        break

                    # if footnote begins with "a" (and not "aa"), it belongs to a new chapter
                    try:
                        new_chapter = note['footnote'][0] == u'a' and note['footnote'][1] != u'a'
                    except IndexError:
                        # a single character line cannot be classified
                        abort(book, chap_num, note)

                    if new_chapter:
                        footnote_chaps[chap_num] = chap_notes
                        break

                    if not found_note:
                        # nothing was consumed and nothing will change on the next pass; fail loudly
                        # instead of looping forever
                        abort(book, chap_num, note)

            jps_footnotes[book] = functions.convertDictToArray(footnote_chaps)

    return jps_footnotes
def parse():
    """
    Parse JPSTanakhMaster.txt into a dictionary of books.

    A line beginning with a chapter number starts a new chapter; chapter number 1 starts a new book,
    whose name is the line read just before it. The lines collected for a chapter are joined and
    split into verses by process_verses, then validated by check_verses. Chapter numbers that do not
    increase by one are recorded as errors and a placeholder chapter [u'error'] is inserted.

    Side effects: the parsed books are written to output.txt via write_to_file, the collected errors
    are written to errors.txt, and the number of errors is printed.

    :return: Dictionary with book names as keys and, for each book, a list of chapters as produced by
        process_verses.
    """
    # declare variables
    Books, Chapters, Verses = {}, [], []
    previous_line = u''
    book_name = u''
    errors = []

    # regular expressions
    chapter_reg = re.compile(u'\\d{1,3}\\s')
    verse_reg = re.compile(u'\\d{1,3}[a-zA-Z\\-"“\\[‘(—]')

    # define replacement dictionary
    replacements = {
        u'H%': u'\u1e24',
        u'h%': u'\u1e25',
        u'\n': u'',
        u'\r': u'',
    }

    # the context manager closes the input file even if parsing raises
    with codecs.open('JPSTanakhMaster.txt', 'r', 'utf-8') as input_file:
        for line in input_file:

            # if this line is a parsha name - do nothing
            if line == line.upper():
                continue

            # make necessary replacements
            line = functions.multiple_replace(line, replacements)

            # check if line is beginning of new chapter
            new_chap = chapter_reg.match(line)

            if not new_chap:
                # lines are buffered one step behind, so that a book name (the line preceding
                # chapter 1) is never added to the verses of the previous book
                Verses.append(previous_line)
                previous_line = line
                continue

            # get chapter num
            chap_number = int(new_chap.group())

            if chap_number == 1:
                # save previous book; previous_line holds the new book's name and is not a verse
                if book_name != u'':
                    Chapters.append(process_verses(u''.join(Verses), verse_reg))
                    check_verses(Chapters[-1], (book_name.replace(u'\r', u';'), len(Chapters)), errors)
                    Books[book_name] = Chapters

                Chapters = []
                Verses = []
                book_name = previous_line

            else:
                # close the previous chapter of the current book
                Verses.append(previous_line)
                Chapters.append(process_verses(u''.join(Verses), verse_reg))
                check_verses(Chapters[-1], (book_name.replace(u'\r', u';'), chap_number - 1), errors)
                Verses = []

            # check that chapters are incrementing correctly
            if chap_number - len(Chapters) != 1:
                errors.append((book_name.replace(u'\r', u';'), chap_number - 1))
                Chapters.append([u'error'])

            # copy line into previous_line placeholder, excluding the chapter number itself
            previous_line = line[new_chap.end():]

    # add last book. The final line of the file is still waiting in previous_line and belongs to
    # the last chapter, exactly as when a chapter is closed inside the loop.
    Verses.append(previous_line)
    Chapters.append(process_verses(u''.join(Verses), verse_reg))
    check_verses(Chapters[-1], (book_name.replace(u'\r', u';'), len(Chapters)), errors)
    Books[book_name] = Chapters

    with codecs.open('output.txt', 'w', 'utf-8') as out:
        write_to_file(Books, out)

    # write errors; a unicode template avoids an implicit ascii encode of the book name on Python 2
    with codecs.open('errors.txt', 'w', 'utf-8') as out:
        for error in errors:
            out.write(u'Book: {0} Chapter: {1}\n'.format(*error))

    print(len(errors))
    return Books
def footnote_linker(jps, jps_footnotes):
    """
    Create a list of link objects with the anchorText field set to the corresponding words in the verse.

    For every verse a footnote links to, three marker layouts are handled:
      * enclosed text inside one verse, ``[a]-some words-[a]``: the enclosed words become the anchor;
      * enclosed text that opens in this verse and closes in the following verse of the same chapter;
      * a plain ``[a]`` marker: the word preceding the marker becomes the anchor.
    Once the corresponding text has been found, the jps text is edited so that the footnote mark is
    replaced with an <i> tag. Trailing footnote markers at the end of an enclosed text fragment are
    scrubbed. Links whose marker cannot be found in the verse are written to footnote_errors.txt and
    produce no link object.

    :param jps: jps Tanach data structure - jps[<book_name>][chapter_index][verse_index]
    :param jps_footnotes: jps footnotes data struct - footnotes[<book_name>][chap_index][footnote]. Note that
        this returns a dictionary with the keys [footnote], which gives the footnote text, and [links], which
        give the verses to which the footnote needs to link to.
    :return: A list of link objects. This function will edit jps to replace inline footnote markers with
        <i> tags, as well as remove the beginning footnote characters used to match the note to printed text.
    """
    # get books of Tanach
    books = library.get_indexes_in_category('Tanach')
    links = []

    # the context manager closes the error file even if a lookup below raises
    with codecs.open('footnote_errors.txt', 'w', 'utf-8') as errors:
        for book in books:
            for chap_num, chapter in enumerate(jps_footnotes[book]):
                for index, note in enumerate(chapter):

                    # get tag to identify footnote in main text. Account for case where tag is 'aa'.
                    if note['footnote'].startswith(u'aa'):
                        tag = u'aa'
                    else:
                        tag = u'{}'.format(note['footnote'][0])

                    # first word of footnote is the tag. That data has been saved - now strip from the footnote
                    note['footnote'] = u' '.join(note['footnote'].split()[1:])

                    bare_marker = u'[{}]'.format(tag)
                    open_marker = u'[{}]-'.format(tag)
                    close_marker = u'-[{}]'.format(tag)
                    # length of an opening or closing marker: the tag plus '[', ']' and '-'
                    marker_len = len(tag) + 3

                    # non-greedy so that only the text up to the first closing marker is captured
                    enclosed_reg = re.compile(u'\\[{0}\\]-.*?-\\[{0}\\]'.format(re.escape(tag)))

                    # <i> tag to replace footnote marker in text: a -> 1 ... z -> 26, aa -> 27
                    if tag == u'aa':
                        data_order = 27
                    else:
                        data_order = ord(tag) - 96
                    itag = u'<i data-commentator="JPS 1985 Footnotes" data-order="{}"></i>'.format(data_order)

                    for link in note['links']:
                        # get verse in main text (links are 1-based verse numbers)
                        try:
                            verse = jps[book][chap_num][link - 1]
                        except IndexError:
                            print(u'{},{},{}'.format(book, chap_num + 1, link))
                            continue

                        anchor = u''
                        # becomes True once one of the enclosed-text layouts has been resolved
                        handled = False

                        enclosed = enclosed_reg.search(verse)
                        if enclosed:
                            # catch enclosed text, dropping the markers on both sides
                            anchor = enclosed.group()[marker_len:-marker_len]

                            # replace leading footnote tags with <i> tag, strip out trailing tag
                            replace = {open_marker: itag, close_marker: u''}
                            cleaned = functions.multiple_replace(verse, replace)

                            # sanity check - scrub out any remaining footnote tags
                            jps[book][chap_num][link - 1] = cleaned.replace(bare_marker, u'')
                            handled = True

                        elif open_marker in verse:
                            # check if anomaly can be resolved by looking at the next verse; the last
                            # verse of a chapter has none, in which case the default handling applies
                            try:
                                next_verse = jps[book][chap_num][link]
                            except IndexError:
                                next_verse = None

                            if next_verse is not None:
                                enclosed = enclosed_reg.search(u' '.join([verse, next_verse]))
                                if enclosed:
                                    anchor = enclosed.group()[marker_len:-marker_len]
                                    opened = verse.replace(open_marker, itag)
                                    jps[book][chap_num][link] = next_verse.replace(close_marker, u'')
                                    jps[book][chap_num][link - 1] = opened.replace(bare_marker, u'')
                                    handled = True

                        if not handled:
                            # sanity check, make sure tag is in verse
                            position = verse.find(bare_marker)
                            if position == -1:
                                errors.write(u'tag not found\n')
                                errors.write(u'{}, {}, {}, {}\n'.format(book, chap_num + 1, link, note['footnote']))
                                continue

                            # replace footnote tag in main text
                            jps[book][chap_num][link - 1] = verse.replace(bare_marker, itag)

                            # the anchor is the word preceding the first marker, if there is one
                            words = verse[:position].split()
                            anchor = words[-1] if words else u''

                        # create link object
                        links.append({
                            'refs': [u'{}.{}.{}'.format(book, chap_num + 1, link),
                                     u'JPS 1985 Footnotes, {}.{}.{}'.format(book, chap_num + 1, index + 1)],
                            'type': 'commentary',
                            'auto': True,
                            'generated_by': 'JPS parse script',
                            'anchorText': anchor,
                        })

    return links
def align_footnotes(books):
    """
    Structure the footnotes by book and chapter. Each footnote may refer to multiple verses.

    JPS1985_footnotes.txt is read sequentially, one footnote per line, each line beginning with the
    footnote's tag. A footnote tagged "a" (but not "aa") is taken as the first footnote of a new
    chapter. For every footnote the chapter is searched for verses containing the marker ``[<tag>]``.
    If a footnote line is too short to decide whether it opens a new chapter, or the pending footnote
    can neither be placed in the current chapter nor opens a new one, diagnostics are printed and the
    script exits with status 1.

    :param books: Dictionary, containing the entire JPS 1985 translation
    :return: Dictionary, with books as keys and chapters as values. Each chapter is a list of
        dictionaries, with the key "footnote" set to the footnote and the key "links" being a list of
        verses (1-based) where the footnote appears.
    """
    jps_footnotes = {}

    # define replacement dictionary
    replacements = {
        u'@': u'\u1e63',
        u'h%': u'\u1e25',
        u'H%': u'\u1e24',
        u'\n': u'',
        u'\r': u'',
        u'\n\r': u'',
    }

    # get list of book in tanach
    all_books = library.get_indexes_in_category('Tanach')

    # the context manager closes the footnote file on every exit path, including sys.exit()
    with codecs.open('JPS1985_footnotes.txt', 'r', 'utf-8') as input_file:

        def next_note():
            # readline() returns u'' only at end of file. Lines that are empty after the replacements
            # carry no footnote and are skipped, so an empty footnote always means end of file.
            while True:
                raw = input_file.readline()
                if raw == u'':
                    return {'footnote': u''}
                text = functions.multiple_replace(raw, replacements)
                if text != u'':
                    return {'footnote': text}

        def abort(book, chap_num, note):
            # report the footnote that could not be processed and stop the script
            print('error')
            print(note['footnote'])
            print(u'{}, chapter {}'.format(book, chap_num + 1))
            sys.exit(1)

        # retrieve first footnote
        note = next_note()

        for book in all_books:
            # dictionary with keys set to chapter indices
            footnote_chaps = {}

            for chap_num, chapter in enumerate(books[book]):
                # set once any footnote marker has been found in this chapter; from then on every
                # footnote up to the next "a" footnote is attributed to this chapter
                found_note = False
                chap_notes = []

                while True:
                    # footnote file exhausted: the remaining chapters have no footnotes
                    if note['footnote'] == u'':
                        footnote_chaps[chap_num] = chap_notes
                        break

                    # account for the case where a footnote is tagged "aa"
                    if note['footnote'].startswith(u'aa'):
                        tag = u'aa'
                    else:
                        tag = note['footnote'][0]
                    marker = u'[{}]'.format(tag)

                    found = [verse_num + 1 for verse_num, verse in enumerate(chapter) if marker in verse]
                    if found:
                        found_note = True
                    note['links'] = found

                    # if footnote markers were found, store the footnote and get the next one
                    if found_note:
                        chap_notes.append(note)
                        note = next_note()

                    if note['footnote'] == u'':
                        footnote_chaps[chap_num] = chap_notes
                        break

                    # if footnote begins with "a" (and not "aa"), it belongs to a new chapter
                    try:
                        new_chapter = note['footnote'][0] == u'a' and note['footnote'][1] != u'a'
                    except IndexError:
                        # a single character line cannot be classified
                        abort(book, chap_num, note)

                    if new_chapter:
                        footnote_chaps[chap_num] = chap_notes
                        break

                    if not found_note:
                        # nothing was consumed and nothing will change on the next pass; fail loudly
                        # instead of looping forever
                        abort(book, chap_num, note)

            jps_footnotes[book] = functions.convertDictToArray(footnote_chaps)

    return jps_footnotes
def parse():
    """
    Parse JPSTanakhMaster.txt into a dictionary of books.

    A line beginning with a chapter number starts a new chapter; chapter number 1 starts a new book,
    whose name is the line read just before it. The lines collected for a chapter are joined and
    split into verses by process_verses, then validated by check_verses. Chapter numbers that do not
    increase by one are recorded as errors and a placeholder chapter [u'error'] is inserted.

    Side effects: the parsed books are written to output.txt via write_to_file, the collected errors
    are written to errors.txt, and the number of errors is printed.

    :return: Dictionary with book names as keys and, for each book, a list of chapters as produced by
        process_verses.
    """
    # declare variables
    Books, Chapters, Verses = {}, [], []
    previous_line = u''
    book_name = u''
    errors = []

    # regular expressions
    chapter_reg = re.compile(u'\\d{1,3}\\s')
    verse_reg = re.compile(u'\\d{1,3}[a-zA-Z\\-"“\\[‘(—]')

    # define replacement dictionary
    replacements = {
        u'H%': u'\u1e24',
        u'h%': u'\u1e25',
        u'\n': u'',
        u'\r': u'',
    }

    # the context manager closes the input file even if parsing raises
    with codecs.open('JPSTanakhMaster.txt', 'r', 'utf-8') as input_file:
        for line in input_file:

            # if this line is a parsha name - do nothing
            if line == line.upper():
                continue

            # make necessary replacements
            line = functions.multiple_replace(line, replacements)

            # check if line is beginning of new chapter
            new_chap = chapter_reg.match(line)

            if not new_chap:
                # lines are buffered one step behind, so that a book name (the line preceding
                # chapter 1) is never added to the verses of the previous book
                Verses.append(previous_line)
                previous_line = line
                continue

            # get chapter num
            chap_number = int(new_chap.group())

            if chap_number == 1:
                # save previous book; previous_line holds the new book's name and is not a verse
                if book_name != u'':
                    Chapters.append(process_verses(u''.join(Verses), verse_reg))
                    check_verses(Chapters[-1], (book_name.replace(u'\r', u';'), len(Chapters)), errors)
                    Books[book_name] = Chapters

                Chapters = []
                Verses = []
                book_name = previous_line

            else:
                # close the previous chapter of the current book
                Verses.append(previous_line)
                Chapters.append(process_verses(u''.join(Verses), verse_reg))
                check_verses(Chapters[-1], (book_name.replace(u'\r', u';'), chap_number - 1), errors)
                Verses = []

            # check that chapters are incrementing correctly
            if chap_number - len(Chapters) != 1:
                errors.append((book_name.replace(u'\r', u';'), chap_number - 1))
                Chapters.append([u'error'])

            # copy line into previous_line placeholder, excluding the chapter number itself
            previous_line = line[new_chap.end():]

    # add last book. The final line of the file is still waiting in previous_line and belongs to
    # the last chapter, exactly as when a chapter is closed inside the loop.
    Verses.append(previous_line)
    Chapters.append(process_verses(u''.join(Verses), verse_reg))
    check_verses(Chapters[-1], (book_name.replace(u'\r', u';'), len(Chapters)), errors)
    Books[book_name] = Chapters

    with codecs.open('output.txt', 'w', 'utf-8') as out:
        write_to_file(Books, out)

    # write errors; a unicode template avoids an implicit ascii encode of the book name on Python 2
    with codecs.open('errors.txt', 'w', 'utf-8') as out:
        for error in errors:
            out.write(u'Book: {0} Chapter: {1}\n'.format(*error))

    print(len(errors))
    return Books