# Python 2 imports assumed by this test (the originals were elided from the excerpt).
from StringIO import StringIO

import util


def test_file_to_ja():
    data = StringIO('''@22\nfoo\nbar\n@22\nhello\nworld''')
    ja = util.file_to_ja(2, data, ['@22'], lambda x: [c.rstrip() for c in x])
    assert ja.array() == [
        ['foo', 'bar'],
        ['hello', 'world']
    ]
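# A hedged companion sketch at depth 3: if file_to_ja consumes one regex per
# level above the segments, as the later parsers suggest, a @00 chapter marker
# plus a @22 section marker should nest one level deeper. The markers and the
# expected output here are illustrative assumptions, not a documented contract.
def test_file_to_ja_depth_3():
    data = StringIO('@00\n@22\nfoo\n@22\nbar\n@00\n@22\nbaz')
    ja = util.file_to_ja(3, data, ['@00', '@22'], lambda x: [c.rstrip() for c in x])
    assert ja.array() == [
        [['foo'], ['bar']],
        [['baz']]
    ]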
def parse_raph(filename, smk_ja):
    '''
    :param filename: raph source txt file
    :param smk_ja: JA obj smk parsed [siman, segment]
    :return: JA obj parsed [siman, letter]; some simanim will be empty
    '''
    def cleaner(my_text):
        replace_dict = {u'@(?:11|77)[\u05d0-\u05ea]{0,3}': u'', u'@(33|22)': u''}
        new = []
        for line in my_text:
            line = multiple_replace(line, replace_dict, using_regex=True)
            new.append(line)
        return new

    regs = [ur'@11(?P<gim>[\u05d0-\u05ea]{1,3})']
    with codecs.open(filename, 'r', 'utf-8') as fp:
        lines = fp.readlines()
    starting = None
    # find the end of the legend: the first blank line marks where the text starts
    for line_num, line in enumerate(lines):
        if line == u'\n':
            starting = line_num + 1
            break
    # drop header lines that start with @00, split the rest on @11 letter markers
    cleaned = []
    for line_num, line in enumerate(lines[starting:]):
        if not re.search(u'@00', line) and not line.isspace():
            line = re.split(u'(@11[\u05d0-\u05ea]{0,3})', line)
            if isinstance(line, basestring):
                cleaned.append(line)
            else:
                [cleaned.append(st.strip()) for st in line if st]
    try:
        ja = file_to_ja(2, cleaned, regs, cleaner, grab_all=False).array()
    except AttributeError:
        print 'there are more regs than levels...'
    ja_to_xml(ja, ['letter', 'segments'], 'raph_letters.xml')
    d1 = 0
    aligned = []
    siman = []
    segment = []
    for letter in smk_ja.array():
        for seg in letter:
            for ff in re.finditer(u'@55[\u05d0-\u05ea]{0,3}', seg):
                # segment.append(ja[d1])
                siman.append(ja[d1])
                d1 += 1
        # segment stays empty while the append above is commented out
        if segment != []:
            siman.extend(segment)  # extend rather than append
            # segment = []
        aligned.append(siman)
        siman = []
    ja_to_xml(aligned, ['siman', 'letter', 'segment'], 'raph_simanim_24.xml')
    return JaggedArray(aligned)
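# A minimal driver sketch for parse_raph. It assumes the Smk source was already
# parsed into a [siman, segment] JaggedArray whose segments carry the @55
# markers that parse_raph counts; parse_smk and both filenames are hypothetical
# placeholders, not functions or files from this repo.
if __name__ == '__main__':
    smk_ja = JaggedArray(parse_smk('smk.txt'))  # hypothetical helper
    raph_by_siman = parse_raph('raph.txt', smk_ja)
    print '%d simanim aligned' % len(raph_by_siman.array())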
def parse_yitzira():
    def cleaner(my_text):
        return filter(None, [re.sub(u'@[0-9]{2}', u'', line) if re.search(u'@11', line) else None
                             for line in my_text])

    with codecs.open('yitzira_mishna.txt', 'r', 'utf-8') as infile:
        return file_to_ja(2, infile, [u'@00\u05e4\u05e8\u05e7 [\u05d0-\u05ea]{1,2}'], cleaner).array()
def parse_body():
    def cleaner(section):
        bleached = [bleach.clean(segment, tags=[], strip=True) for segment in section]
        return filter(lambda x: None if len(x) == 0 else x, bleached)

    my_text = get_text().splitlines()[:1795]  # A new part begins at this line
    expressions = [u'<b>\u05d4?\u05de\u05e2\u05d9\u05d9?\u05df.*:</b>',
                   u'\u05de\u05e2\u05d9\u05df ([\u05d0-\u05ea]{1,2}) - \u05e0\u05d4\u05e8 (- )?([\u05d0-\u05ea]{1,2})']
    parsed = file_to_ja(3, my_text, expressions, cleaner)
    return parsed.array()
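# For reference, the bleach call in the cleaner above strips every HTML tag and
# keeps the inner text (tags=[] allows nothing; strip=True removes rather than
# escapes disallowed tags), e.g.:
#   bleach.clean(u'<b>title:</b> text', tags=[], strip=True)  ->  u'title: text'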
def parse_general(filename):
    def cleaner(my_text):
        result = []
        for line in my_text:
            new_line = multiple_replace(line, {u'@31': u'<b>', u'@32': u'</b>'})
            new_line = re.sub(u'@[0-9]{2}', u'', new_line)
            result.append(new_line)
        return result

    regs = [u'@00\u05e4\u05e8\u05e7 [\u05d0-\u05ea]{1,2}', u'@22[\u05d0-\u05ea]{1,2}']
    with codecs.open(filename, 'r', 'utf-8') as infile:
        return file_to_ja(3, infile, regs, cleaner).array()
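# Example use of parse_general, as a sketch: @31/@32 pairs become <b>...</b>,
# any other @NN code is stripped, and the result is a [chapter, siman, segment]
# array. The filename below is a placeholder.
if __name__ == '__main__':
    chapters = parse_general('some_source.txt')  # placeholder filename
    print '%d chapters parsed' % len(chapters)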
def produce_parsed_data(filename):
    with codecs.open(filename, 'r', 'utf-8') as datafile:
        parsed = util.file_to_ja(3, datafile, (m_pattern, comment_pattern), nothing)
        datafile.seek(0)
        names = util.grab_section_names(m_pattern, datafile, 1)
    names = [int(util.getGematria(name)) for name in names]
    comp_text = util.simple_to_complex(names, parsed.array())
    parsed = util.convert_dict_to_array(comp_text)
    return parsed
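# Sketch of what the simple_to_complex / convert_dict_to_array pair above is
# for, assuming simple_to_complex keys each parsed section by its
# gematria-derived number and convert_dict_to_array re-expands that dict into a
# dense array with empty placeholders for skipped sections. Values invented:
#   names = [1, 3]; sections = [['a'], ['b']]
#   util.simple_to_complex(names, sections)  ->  {1: ['a'], 3: ['b']}
#   util.convert_dict_to_array(...)          ->  [['a'], [], ['b']]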
def parse():
    book_names = library.get_indexes_in_category('Torah')
    names = node_names()
    parsed = {}
    for book_num, filename in enumerate(filenames()):
        with codecs.open(filename, 'r', 'utf-8') as infile:
            current = util.file_to_ja([[[]]], infile, [u'@88', u'@44'], sefat_parse_helper).array()
        parsed[book_names[book_num]] = util.clean_jagged_array(current, [u'@[0-9]{2}', u'\?'])
    for book in book_names:
        parashot = names[book].keys()
        parsed[book] = util.simple_to_complex(parashot, parsed[book])
        for parsha in parashot:
            parsed[book][parsha] = util.simple_to_complex(names[book][parsha], parsed[book][parsha])
    return parsed
def parse():
    book_names = library.get_indexes_in_category('Torah')
    names = node_names()
    parsed = {}
    for book_name, filename in zip(book_names, filenames()):
        with codecs.open(filename, 'r', 'utf-8') as infile:
            current = util.file_to_ja(2, infile, [u'@88'], sefat_parse_helper).array()
        parsed[book_name] = util.clean_jagged_array(current, [u'@[0-9]{2}', u'\?'])
    for book in book_names:
        parashot = names[book].keys()
        parsed[book] = util.simple_to_complex(parashot, parsed[book])
    return parsed
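# Note: this second parse() differs from the variant above. It reads a 2-deep
# structure keyed only on @88 book markers (no @44 parsha markers) and skips
# the per-parsha re-nesting; presumably the two target different stages of the
# same source.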
def produce_parsed_data(filename):
    with codecs.open(filename, 'r', 'utf-8') as datafile:
        parsed = util.file_to_ja([[[]]], datafile, (m_pattern, comment_pattern), nothing)
        datafile.seek(0)
        names = util.grab_section_names(m_pattern, datafile, 1)
    names = [int(util.getGematria(name)) for name in names]
    comp_text = util.simple_to_complex(names, parsed.array())
    parsed = util.convert_dict_to_array(comp_text)
    return parsed
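# Note: this variant passes [[[]]] where the version above passes 3; both seem
# to describe a depth-3 jagged array (a nested-list template versus an integer
# depth), so util.file_to_ja presumably accepts either form.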
def parse_and_post(filename, index_key):
    with codecs.open(filename, 'r', 'utf-8') as source_file:
        data = util.file_to_ja([[]], source_file, [u'@00'], structure_boaz)
        data = util.clean_jagged_array(data.array(), strip_list)
        source_file.seek(0)
        data = align_boaz_chapters(source_file, data)
    text_version = {
        'versionTitle': u'Mishnah, ed. Romm, Vilna 1913',
        'versionSource': 'http://primo.nli.org.il/primo_library/libweb/action/dlDisplay.do?vid=NLI&docId=NNL_ALEPH001741739',
        'language': 'he',
        'text': data
    }
    functions.post_text(index_key, text_version)
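# Illustrative call, as a sketch: it assumes the target index already exists on
# the server and that the source file follows the @00 chapter scheme. Both
# arguments are placeholders, so the call is left commented out.
# parse_and_post('boaz_berachot.txt', 'Mishnah Berachot')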
def parse_Raph_by_letter(filename):
    '''parsing according to the letters; this is the main ja to post for the raph'''
    def cleaner(my_text):
        replace_dict = {u'@(?:11|77)[\u05d0-\u05ea]{0,3}': u'', u'@33': u''}
        # {u'@11(.*?)@12': ur'<b>\1</b>', u'@33(.*?)@34': ur'<b>\1</b>', u'@66(.*?)@67': ur'\1'}
        new = []
        for line in my_text:
            line = multiple_replace(line, replace_dict, using_regex=True)
            new.append(line)
        return new

    regs = [ur'@11(?P<gim>[\u05d0-\u05ea]{1,3})']
    with codecs.open(filename, 'r', 'utf-8') as fp:
        lines = fp.readlines()
    starting = None
    # find the end of the legend: the first blank line marks where the text starts
    for line_num, line in enumerate(lines):
        if line == u'\n':
            starting = line_num + 1
            break
    # drop header lines that start with @00, split the rest on @11 letter markers
    cleaned = []
    for line_num, line in enumerate(lines[starting:]):
        if not re.search(u'@00', line) and not line.isspace():
            line = re.split(u'(@11[\u05d0-\u05ea]{0,3})', line)
            if isinstance(line, basestring):
                cleaned.append(line)
            else:
                [cleaned.append(st.strip()) for st in line if st]
    # new_ja is only used by the commented-out XML dump below
    new_ja = regs_devide(cleaned, regs)
    try:
        # ja = file_to_ja_g(2, cleaned, regs, cleaner, gimatria=True, grab_all=[True, True], group_name='gim').array()
        ja = file_to_ja(2, cleaned, regs, cleaner, grab_all=False).array()
    except AttributeError:
        print 'there are more regs than levels...'
    # ja_to_xml(new_ja, ['Alef', 'letter', 'segments'], 'raph_letters.xml')
    ja_to_xml(ja, ['letter', 'segments'], 'raph_letters.xml')
    return ja
def post_text_and_index(text_struct, section_names):
    index = build_index(section_names)
    functions.post_index(index)
    for section_num, section in enumerate(section_names):
        new_text = {
            "versionTitle": 'Noda BeYehuda Warsaw 1880',
            "versionSource": 'http://primo.nli.org.il/primo_library/libweb/action/dlDisplay.do?vid=NLI&docId=NNL_ALEPH001983501',
            "language": 'he',
            "text": text_struct[section_num]
        }
        functions.post_text('Noda BeYehuda, {}'.format(section), new_text)


patterns = [u'@00', u'@22']
names = [u'חלק', u'סימן', u'טקסט']
section_names = ['Orach Chaim', 'Yoreh Deah', 'Even HaEzer', 'Choshen Mishpat']
parsed = util.file_to_ja([[[]]], noda_file, patterns, clean_and_align)
with codecs.open('testfile.txt', 'w', 'utf-8') as check_parse:
    util.jagged_array_to_file(check_parse, parsed.array(), names)
post_text_and_index(parsed.array(), section_names)
noda_file.close()
os.remove('errors.html')