def parse_the_text(file_name_teshuvot, file_name_footnotes, dictionary):
    """Parse a teshuvot file and its footnotes file, then post both.

    Builds indices, links, and cleaned texts for one part of Teshuvot
    haRashba and uploads everything via the ``functions`` helpers.
    """
    responsa_ja = function.parse(file_name_teshuvot)
    notes_ja = function.parse(file_name_footnotes)
    links = function.create_links(responsa_ja, dictionary)
    index_teshuvot = function.create_index(dictionary)
    # NOTE(review): `footnotes` and `footnotes_hebrew` are not defined in this
    # scope -- presumably module-level globals elsewhere in the file; confirm.
    index_footnotes = function.create_index(dictionary, footnotes, footnotes_hebrew)
    # Strip paragraph numbers and plus-sign markers left over from the parse.
    responsa_ja = util.clean_jagged_array(responsa_ja, ['\d+', '\+'])
    notes_ja = util.clean_jagged_array(notes_ja, ['\d+', '\+'])
    text_teshuvot = function.create_text(dictionary, responsa_ja)
    text_footnotes = function.create_text(dictionary, notes_ja)
    functions.post_index(index_teshuvot)
    functions.post_index(index_footnotes)
    functions.post_text_weak_connection(
        'Teshuvot haRashba part {}'.format(dictionary['roman numeral']),
        text_teshuvot)
    functions.post_text_weak_connection(
        'Footnotes to Teshuvot haRashba part {}'.format(dictionary['roman numeral']),
        text_footnotes)
    functions.post_link_weak_connection(links)
def post():
    """Parse Minchat Chinuch, link it to Sefer HaChinukh, and upload all of it.

    Side effects: writes 'links.txt' and 'parsed.txt' locally, then posts the
    index, the full text, and the links to the server.
    """
    minchat = {'name': 'Minchat Chinuch', 'text': produce_parsed_data(filename)}
    sefer = {'name': 'Sefer HaChinukh', 'text': Ref('Sefer HaChinukh').text('he').text}
    chinukh_links = find_links(minchat, sefer, grab_dh, u'<b>', u'</b>')
    # Dump the discovered refs for manual inspection.
    with codecs.open('links.txt', 'w', 'utf-8') as link_file:
        for link in chinukh_links:
            link_file.write(u'{}\n'.format(link['refs']))
    alt = construct_alt_struct('Chinukh_by_Parsha.csv', 'Chinukh Mitzva names.csv')
    cleaned = util.clean_jagged_array(
        minchat['text'],
        [m_pattern, comment_pattern, u'@[0-9]{2}', u'\n', u'\r'])
    # Dump the cleaned jagged array for manual inspection.
    with codecs.open('parsed.txt', 'w', 'utf-8') as parsed_file:
        util.jagged_array_to_file(parsed_file, cleaned, [u'Mitzva', u'Seif', u'Paragraph'])
    full_text = {
        'versionTitle': 'Minchat Chinuch, Piotrków, 1902',
        'versionSource': 'http://primo.nli.org.il/primo_library/libweb/action/dlDisplay.do?vid=NLI&docId=NNL_ALEPH001175092',
        'language': 'he',
        'text': cleaned,
    }
    index = construct_index(alt)
    functions.post_index(index)
    functions.post_text('Minchat Chinuch', full_text)
    functions.post_link(chinukh_links)
def parse_the_text(file_name_teshuvot, file_name_footnotes, dictionary):
    """Parse and post one part of Teshuvot haRashba together with its footnotes."""
    teshuvot_ja = function.parse(file_name_teshuvot)
    footnotes_ja = function.parse(file_name_footnotes)

    links = function.create_links(teshuvot_ja, dictionary)
    index_teshuvot = function.create_index(dictionary)
    # NOTE(review): `footnotes` and `footnotes_hebrew` are undefined here --
    # they look like globals defined elsewhere in the file; verify before use.
    index_footnotes = function.create_index(dictionary, footnotes, footnotes_hebrew)

    # Remove residual numbering and '+' markers from both jagged arrays.
    cleanup_patterns = ['\d+', '\+']
    teshuvot_ja = util.clean_jagged_array(teshuvot_ja, cleanup_patterns)
    footnotes_ja = util.clean_jagged_array(footnotes_ja, cleanup_patterns)

    text_teshuvot = function.create_text(dictionary, teshuvot_ja)
    text_footnotes = function.create_text(dictionary, footnotes_ja)

    functions.post_index(index_teshuvot)
    functions.post_index(index_footnotes)
    functions.post_text_weak_connection(
        'Teshuvot haRashba part {}'.format(dictionary['roman numeral']),
        text_teshuvot)
    functions.post_text_weak_connection(
        'Footnotes to Teshuvot haRashba part {}'.format(dictionary['roman numeral']),
        text_footnotes)
    functions.post_link_weak_connection(links)
def parse():
    """Parse each Torah book's source file into a nested (complex) structure.

    Returns a dict mapping book name -> jagged array keyed by parsha, then
    by the section names supplied by node_names().
    """
    book_names = library.get_indexes_in_category('Torah')
    names = node_names()
    parsed = {}
    for book_num, filename in enumerate(filenames()):
        with codecs.open(filename, 'r', 'utf-8') as infile:
            raw = util.file_to_ja(
                [[[]]], infile, [u'@88', u'@44'], sefat_parse_helper).array()
        # Strip remaining @NN control codes and stray question marks.
        parsed[book_names[book_num]] = util.clean_jagged_array(
            raw, [u'@[0-9]{2}', u'\?'])
    # Convert each flat array into the complex parsha/section structure.
    for book in book_names:
        parashot = names[book].keys()
        parsed[book] = util.simple_to_complex(parashot, parsed[book])
        for parsha in parashot:
            parsed[book][parsha] = util.simple_to_complex(
                names[book][parsha], parsed[book][parsha])
    return parsed
def parse():
    """Parse every Torah book file and group each book by its parashot.

    Returns a dict mapping book name -> complex jagged array keyed by parsha.
    """
    book_names = library.get_indexes_in_category('Torah')
    names = node_names()
    parsed = {}
    # Pair each book name with its source file in order.
    for book_name, filename in zip(book_names, filenames()):
        with codecs.open(filename, 'r', 'utf-8') as source:
            depth_two = util.file_to_ja(
                2, source, [u'@88'], sefat_parse_helper).array()
        # Drop leftover @NN control codes and stray question marks.
        parsed[book_name] = util.clean_jagged_array(
            depth_two, [u'@[0-9]{2}', u'\?'])
    for book in book_names:
        parsed[book] = util.simple_to_complex(names[book].keys(), parsed[book])
    return parsed
def parse_and_post(filename, index_key):
    """Parse *filename* into a jagged array, align Boaz chapters, and post it.

    :param filename: path to the @00-delimited UTF-8 source file.
    :param index_key: index title under which the text version is posted.
    """
    with codecs.open(filename, 'r', 'utf-8') as source_file:
        data = util.file_to_ja([[]], source_file, [u'@00'], structure_boaz)
        data = util.clean_jagged_array(data.array(), strip_list)
        # Second pass over the same file to align the Boaz chapter divisions.
        source_file.seek(0)
        data = align_boaz_chapters(source_file, data)
    text_version = {
        'versionTitle': u'Mishnah, ed. Romm, Vilna 1913',
        # Bug fix: the scheme was duplicated ("http://http://..."), which
        # produced an invalid versionSource URL.
        'versionSource': 'http://primo.nli.org.il/primo_library/libweb/action/dlDisplay.do?vid=NLI&docId=NNL_ALEPH001741739',
        'language': 'he',
        'text': data,
    }
    functions.post_text(index_key, text_version)
def post():
    """Upload Minchat Chinuch: index, full Hebrew text, and Chinukh links.

    Also writes two local debug files ('links.txt' and 'parsed.txt').
    """
    minchat = {
        'name': 'Minchat Chinuch',
        'text': produce_parsed_data(filename),
    }
    sefer = {
        'name': 'Sefer HaChinukh',
        'text': Ref('Sefer HaChinukh').text('he').text,
    }
    chinukh_links = find_links(minchat, sefer, grab_dh, u'<b>', u'</b>')

    # Record every discovered link ref for review.
    with codecs.open('links.txt', 'w', 'utf-8') as refs_out:
        for each_link in chinukh_links:
            refs_out.write(u'{}\n'.format(each_link['refs']))

    alt = construct_alt_struct('Chinukh_by_Parsha.csv', 'Chinukh Mitzva names.csv')
    cleaned = util.clean_jagged_array(
        minchat['text'],
        [m_pattern, comment_pattern, u'@[0-9]{2}', u'\n', u'\r'])

    # Record the cleaned parse for review.
    with codecs.open('parsed.txt', 'w', 'utf-8') as parse_out:
        util.jagged_array_to_file(
            parse_out, cleaned, [u'Mitzva', u'Seif', u'Paragraph'])

    full_text = {
        'versionTitle': 'Minchat Chinuch, Piotrków, 1902',
        'versionSource': 'http://primo.nli.org.il/primo_library/libweb/action/dlDisplay.do?vid=NLI&docId=NNL_ALEPH001175092',
        'language': 'he',
        'text': cleaned,
    }
    functions.post_index(construct_index(alt))
    functions.post_text('Minchat Chinuch', full_text)
    functions.post_link(chinukh_links)
""" tractate = { "versionTitle": "Vilna Mishna", "versionSource": "http://www.daat.ac.il/encyclopedia/value.asp?id1=836", "language": "he", "text": text, } print 'uploading {}'.format(text_name) functions.post_text(text_name, tractate) trello = open('trello_board.json') tracs = get_cards_from_trello('Parse Mishnah', trello) trello.close() for trac in tracs: name = re.search(u'[\u05d0-\u05ea].+', trac) name = name.group().replace(u'משנה', u'משניות') infile = codecs.open(u'{}.txt'.format(name), 'r', 'utf-8') jagged = jaggedarray_from_file(infile, u'@00(?:\u05e4\u05e8\u05e7 |\u05e4)([\u05d0-\u05ea"]{1,3})', u'@22[\u05d0-\u05ea]{1,2}', u'@99') parsed = util.clean_jagged_array(jagged, clean_list()) infile.close() en_name = re.search(u'[a-zA-Z ]+', trac).group().rstrip() if en_name not in tractates: print '{} not a valid ref'.format(en_name) sys.exit(1) upload(parsed, en_name)
def test_clean_jagged_array():
    """clean_jagged_array strips '&' and returns a new array (no mutation)."""
    dirty = [['foo&', 'bar&'], ['hello&', 'world&']]
    expected = [['foo', 'bar'], ['hello', 'world']]

    cleaned = util.clean_jagged_array(dirty, ['&'])

    assert cleaned == expected
    # The input must not have been cleaned in place.
    assert cleaned != dirty
""" parse the text create links create index clean the parse create text post """ import codecs import regex from sefaria.model import * from data_utilities import util from sources.Yad_Ramah import function from sources import functions sanhedrin_ja = TextChunk(Ref('Sanhedrin'), 'he').text yad_ramah = function.parse('yad_ramah.txt') yad_ramah = util.clean_jagged_array(yad_ramah, ['(@22)', '(@100)']) index = function.create_index() text = function.create_text(yad_ramah) links = function.create_links(sanhedrin_ja, yad_ramah) functions.post_index(index) functions.post_text('Yad Ramah on Sanhedrin', text) functions.post_link(links) # hello = codecs.open("hello.txt", 'w', 'utf-8') # util.jagged_array_to_file(hello, yad_ramah,['Page', 'Comment']) # hello.close()
tractate = { "versionTitle": "Vilna Mishna", "versionSource": "http://www.daat.ac.il/encyclopedia/value.asp?id1=836", "language": "he", "text": text, } print 'uploading {}'.format(text_name) functions.post_text(text_name, tractate) trello = open('trello_board.json') tracs = get_cards_from_trello('Parse Mishnah', trello) trello.close() for trac in tracs: name = re.search(u'[\u05d0-\u05ea].+', trac) name = name.group().replace(u'משנה', u'משניות') infile = codecs.open(u'{}.txt'.format(name), 'r', 'utf-8') jagged = jaggedarray_from_file( infile, u'@00(?:\u05e4\u05e8\u05e7 |\u05e4)([\u05d0-\u05ea"]{1,3})', u'@22[\u05d0-\u05ea]{1,2}', u'@99') parsed = util.clean_jagged_array(jagged, clean_list()) infile.close() en_name = re.search(u'[a-zA-Z ]+', trac).group().rstrip() if en_name not in tractates: print '{} not a valid ref'.format(en_name) sys.exit(1) upload(parsed, en_name)