def parse_hagahot_by_letter(filename):

    def cleaner(my_text):
        replace_dict = {u'@11\([\u05d0-\u05ea]{0,3}\)': u''}
        new = []
        for line in my_text:
            line = multiple_replace(line, replace_dict, using_regex=True)
            new.append(line)
        return new

    regs = [ur'@11\((?P<gim>[\u05d0-\u05ea]{1,3})\)']
    with codecs.open(filename, 'r', 'utf-8') as fp:
        lines = fp.readlines()
    # 'starting' marks the first content line after the legend at the top of the file
    starting = None
    cleaned = []
    for line_num, line in enumerate(lines):
        if line == u'\n':
            starting = line_num + 1
            break
    # drop header lines (those containing '$') and blank lines, and split on the @11(...) letter tags
    for line_num, line in enumerate(lines[starting:]):
        if not re.search(u'\$', line) and not line.isspace():
            line = re.split(u'(@11\([\u05d0-\u05ea]{0,3}\))', line)
            if isinstance(line, basestring):
                cleaned.append(line)
            else:
                [cleaned.append(st.strip()) for st in line if st]
    try:
        ja = file_to_ja_g(2, cleaned, regs, cleaner, gimatria=True, grab_all=[True, True], group_name='gim').array()
    except AttributeError:
        print 'there are more regs than levels...'
    new_ja = regs_devide(cleaned, regs, u'(נשלם מלאכת שבעת הימים)')
    ja_to_xml(new_ja, ['siman', 'letter', 'segments'], 'hagahot_letters.xml')
    return new_ja

# variant of parse_hagahot_by_letter whose cleaner also strips @77 tags
def parse_hagahot_by_letter(filename):

    def cleaner(my_text):
        replace_dict = {u'@11\([\u05d0-\u05ea]{0,3}\)': u'', u'@77': u''}
        new = []
        for line in my_text:
            line = multiple_replace(line, replace_dict, using_regex=True)
            new.append(line)
        return new

    regs = [ur'@11\((?P<gim>[\u05d0-\u05ea]{1,3})\)']
    with codecs.open(filename, 'r', 'utf-8') as fp:
        lines = fp.readlines()
    # 'starting' marks the first content line after the legend at the top of the file
    starting = None
    cleaned = []
    for line_num, line in enumerate(lines):
        if line == u'\n':
            starting = line_num + 1
            break
    # drop header lines (those containing '$') and blank lines, and split on the @11(...) letter tags
    for line_num, line in enumerate(lines[starting:]):
        if not re.search(u'\$', line) and not line.isspace():
            line = re.split(u'(@11\([\u05d0-\u05ea]{0,3}\))', line)
            if isinstance(line, basestring):
                cleaned.append(line)
            else:
                [cleaned.append(st.strip()) for st in line if st]
    try:
        ja = file_to_ja_g(2, cleaned, regs, cleaner, gimatria=True, grab_all=[True, True], group_name='gim').array()
    except AttributeError:
        print 'there are more regs than levels...'
    new_ja = regs_devide(cleaned, regs, u'(נשלם מלאכת שבעת הימים)')
    ja_to_xml(new_ja, ['siman', 'letter', 'segments'], 'hagahot_letters.xml')
    return new_ja

def parse_he(filename):
    """
    :return: a dictionary, key: name of book, value: the parsed ja (nested list) for that book
    """
    replace_dict = {u'@(11|44|99)': u'<b>', u'@(33|55)': u'</b>',
                    ur'@22\(([\u05d0-\u05ea]{1,3})\)': u'', ur'@(22|77)': u''}

    def cleaner(my_text):
        new = []
        for line in my_text:
            line = multiple_replace(line, replace_dict, using_regex=True)
            new.append(line)
        return new

    regs = [ur'@00(?P<gim>)', ur'@02(?P<gim>[\u05d0-\u05ea]{1,3})',
            ur'@22\((?P<gim>[\u05d0-\u05ea]{1,3})\)']  # , ur'@77'
    with codecs.open(filename, 'r', 'utf-8') as fp:
        lines = fp.readlines()
    # 'starting' is set once we reach the blank line that ends the legend at the top of the file
    starting = None
    cleaned = []
    dh_list = []
    # skip the @01 Parasha header lines and blank lines; wrap @11...@33 dh spans
    # in <b> tags and split them off into their own segments
    for line_num, line in enumerate(lines):
        if line == u'\n':
            starting = line_num + 1
        if starting and not re.search(u'@01', line) and not line.isspace():
            dh_recognize = re.compile(ur'@11(.*?)@33')
            if dh_recognize.search(line):
                dh_list.append(dh_recognize.search(line).group(1))
                line = re.sub(dh_recognize, ur'#<b>\1</b>', line)
            line = re.split(ur'#', line)
            if isinstance(line, basestring):
                cleaned.append(line)
            else:
                cleaned.extend(line)
    tt_ja = file_to_ja_g(4, cleaned, regs, cleaner, gimatria=True, group_name='gim',
                         grab_all=[False, False, False]).array()
    Pentateuch = ['Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy']
    parsed_texts = {book: ja for book, ja in zip(Pentateuch, tt_ja)}
    for book, ja in zip(Pentateuch, tt_ja):
        ja_to_xml(ja, ['perek', 'pasuk', 'comment'], 'tur_{}.xml'.format(book))
    # for str in dh_list:
    #     print str
    return parsed_texts

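# Usage sketch (not from the source): a hypothetical driver for parse_he. The file name
# 'tur_on_the_torah.txt' and the function name 'example_run_parse_he' are assumptions;
# parse_he itself writes tur_<book>.xml for each book as a side effect.
def example_run_parse_he(filename='tur_on_the_torah.txt'):
    parsed_texts = parse_he(filename)
    for book in ['Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy']:
        print book, len(parsed_texts[book])  # number of perek-level entries parsed per book
    return parsed_texts
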
def test_file_to_ja_g():
    data = StringIO(u'''@22א\nfoo\nbar\n@22ג\nhello\nworld''')
    ja = util.file_to_ja_g(2, data, [ur'@22(?P<gim>[\u05d0-\u05ea])'], lambda x: [c.rstrip() for c in x], True)
    # gimatria placement: א -> position 1, ג -> position 3, so position 2 stays an empty list
    assert ja.array() == [
        ['foo', 'bar'],
        [],
        ['hello', 'world']
    ]

def parse_semak(filename):

    def cleaner(my_text):
        replace_dict = {u'@11(.*?)@12': ur'<b>\1</b>', u'@33(.*?)@34': ur'<b>\1</b>',
                        u'@66(.*?)@67': ur'\1', u'@44': u''}
        new = []
        for line in my_text:
            line = multiple_replace(line, replace_dict, using_regex=True)
            new.append(line)
        return new

    regs = [ur'@22(?P<gim>[\u05d0-\u05ea]{1,3})']  # , u'@(11|23|33)(?P<gim>)']
    with codecs.open(filename, 'r', 'utf-8') as fp:
        lines = fp.readlines()
    # 'starting' marks the first content line after the legend at the top of the file
    starting = None
    cleaned = []
    letter_section = []
    alt_day = []
    for line_num, line in enumerate(lines):
        if line == u'\n':
            starting = line_num + 1
            break
    # record the @00 day-header positions, drop those lines and blank lines,
    # and split the remaining lines on the @22 siman tags
    for line_num, line in enumerate(lines[starting:]):
        if re.search(u'@00', line):
            alt_day.append(line_num)
        if not re.search(u'@00', line) and not line.isspace():
            if re.search(u'@22', line):
                line = re.split(u'(@22[\u05d0-\u05ea]{1,3})', line)
                if isinstance(line, basestring):
                    cleaned.append(line)
                else:
                    [cleaned.append(st) for st in line if st]
            else:
                cleaned.append(line)
    alt_day.append(len(lines))
    print alt_day
    try:
        smk_ja = file_to_ja_g(2, cleaned, regs, cleaner, gimatria=True, grab_all=[False, True, True], group_name='gim').array()
    except AttributeError:
        print 'there are more regs than levels...'
    ja_to_xml(smk_ja, ['letter', 'segments'], 'smk.xml')
    return smk_ja

def parse_smk(filename):
    '''
    :param filename: smk source txt file
    :return: JA obj, smk parsed to depth 2 [siman, segment]
        (including a citation segment at the top of each siman)
    '''

    def cleaner(my_text):
        replace_dict = {u'@11(.*?)@12': ur'<b>\1</b>', u'@33(.*?)@34': ur'<b>\1</b>',
                        u'@66(.*?)@67': ur'\1'}  # , u'@55[\u05d0-\u05ea]{1,3}': u'<i-tags = >'}
        new = []
        for line in my_text:
            line = multiple_replace(line, replace_dict, using_regex=True)
            new.append(line)
        return new

    regs = [ur'@22(?P<gim>[\u05d0-\u05ea]{1,3})']  # , u'@(11|23|33)(?P<gim>)']
    with codecs.open(filename, 'r', 'utf-8') as fp:
        lines = fp.readlines()
    # 'starting' marks the first content line after the legend at the top of the file
    starting = None
    cleaned = []
    letter_section = []
    alt_day = []
    for line_num, line in enumerate(lines):
        if line == u'\n':
            starting = line_num + 1
            break
    # record the @00 day-header positions, drop those lines and blank lines,
    # and split the remaining lines on the @22 siman tags
    for line_num, line in enumerate(lines[starting:]):
        if re.search(u'@00', line):
            alt_day.append(line_num)
        if not re.search(u'@00', line) and not line.isspace():
            if re.search(u'@22', line):
                line = re.split(u'(@22[\u05d0-\u05ea]{1,3})', line)
                if isinstance(line, basestring):
                    cleaned.append(line)
                else:
                    [cleaned.append(st) for st in line if st]
            else:
                cleaned.append(line)
    alt_day.append(len(lines))
    print alt_day
    try:
        smk_ja = file_to_ja_g(2, cleaned, regs, cleaner, gimatria=True, grab_all=[False, True, True], group_name='gim').array()
    except AttributeError:
        print 'there are more regs than levels...'
    ja_to_xml(smk_ja, ['letter', 'segments'], 'smk.xml')
    return JaggedArray(smk_ja)

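# Usage sketch (not from the source): a hypothetical driver for parse_smk. The file name
# 'smk.txt' is an assumption; parse_smk also writes smk.xml as a side effect.
def example_run_parse_smk(filename='smk.txt'):
    smk_ja = parse_smk(filename)  # JaggedArray, depth 2: [siman, segment]
    print len(smk_ja.array())     # number of simanim parsed
    return smk_ja
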
def parse_Raph(filename):

    def cleaner(my_text):
        replace_dict = {u'@(?:11|77)[\u05d0-\u05ea]{0,3}': u'', u'@33': u''}
        # earlier variant: {u'@11(.*?)@12': ur'<b>\1</b>', u'@33(.*?)@34': ur'<b>\1</b>', u'@66(.*?)@67': ur'\1'}
        new = []
        for line in my_text:
            line = multiple_replace(line, replace_dict, using_regex=True)
            new.append(line)
        return new

    regs = [ur'@77(?P<gim>[\u05d0-\u05ea]{0,3})', ur'@11(?P<gim>[\u05d0-\u05ea]{1,3})']
    with codecs.open(filename, 'r', 'utf-8') as fp:
        lines = fp.readlines()
    # 'starting' marks the first content line after the legend at the top of the file
    starting = None
    cleaned = []
    letter_section = []
    for line_num, line in enumerate(lines):
        if line == u'\n':
            starting = line_num + 1
            break
    # drop the @00 day-header lines and blank lines, and split on the @77 page / @11 letter tags
    for line_num, line in enumerate(lines[starting:]):
        if not re.search(u'@00', line) and not line.isspace():
            line = re.split(u'(@(?:77|11)[\u05d0-\u05ea]{0,3})', line)
            if isinstance(line, basestring):
                cleaned.append(line)
            else:
                [cleaned.append(st.strip()) for st in line if st]  # (st and not re.search(u'@(77)', st))]
        # else:
        #     cleaned.append(line)
    try:
        ja = file_to_ja_g(3, cleaned, regs, cleaner, gimatria=True, grab_all=[False, False, True], group_name='gim').array()
    except AttributeError:
        print 'there are more regs than levels...'
    ja_to_xml(ja, ['page', 'letter', 'segments'], 'raph.xml')
    return ja

def parse_hagahot(filename, smk_ja, raph_ja):
    '''
    :param filename: hagahot source txt file
    :param smk_ja: smk JA obj [siman, segment]
    :param raph_ja: raph JA obj [siman, letter]
    :return: JA obj
    '''
    ja_hagahot = []

    def cleaner(my_text):
        # todo: deal with @44 and @00 (@00 maybe should be only in smk base text? - ask Shmuel)
        replace_dict = {u'@11\([\u05d0-\u05ea]{0,3}\)': u'', u'@(33|77|88|99)': u'',
                        u'@55(.*?)@66': ur'<b>\1</b>'}
        new = []
        for line in my_text:
            line = multiple_replace(line, replace_dict, using_regex=True)
            new.append(line)
        return new

    regs = [ur'@11\((?P<gim>[\u05d0-\u05ea]{1,3})\)']
    with codecs.open(filename, 'r', 'utf-8') as fp:
        lines = fp.readlines()
    # 'starting' marks the first content line after the legend at the top of the file
    starting = None
    cleaned = []
    for line_num, line in enumerate(lines):
        if line == u'\n':
            starting = line_num + 1
            break
    # drop the @00 day-header lines and split on the @11(...) letter tags
    for line_num, line in enumerate(lines[starting:]):
        if not re.search(u'@00', line):
            line = re.split(u'(@11\([\u05d0-\u05ea]{0,3}\))', line)
            if isinstance(line, basestring) and line != u'':
                cleaned.append(line)
            else:
                [cleaned.append(st.strip()) for st in line if st]
    try:
        ja = file_to_ja_g(2, cleaned, regs, cleaner, gimatria=True, grab_all=[False], group_name='gim').array()
    except AttributeError:
        print 'there are more regs than levels...'
    ja_to_xml(ja, ['siman', 'letter'], 'hagahot_letters_25.xml')  # , 'segments'
    # for hghds in
    return JaggedArray(ja_hagahot)

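# Usage sketch (not from the source): a hypothetical driver chaining the three parsers.
# The file names are assumptions. Note that parse_hagahot currently only writes
# hagahot_letters_25.xml and returns an empty JaggedArray; linking the hagahot back into
# the Smk and Raph texts is still to be implemented.
def example_run_all(smk_file='smk.txt', raph_file='raph.txt', hagahot_file='hagahot.txt'):
    smk_ja = parse_smk(smk_file)                  # JA obj [siman, segment]
    raph_ja = JaggedArray(parse_Raph(raph_file))  # parse_Raph returns a nested list
    return parse_hagahot(hagahot_file, smk_ja, raph_ja)
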
for the_file in [x for x in os.listdir(folder) if "xml" not in x]:
    name = the_file.replace(".txt", "").split("-")[2]
    sefer = the_file.split("-")[1]
    if sefer == u"ספר קרבנות":
        sefer = u"ספר קורבנות"
    if sefer == u"ספר קנין":
        sefer = u"ספר קניין"
    sefarim.add(sefer)
    if name in processed:  # Skip second version of Nashim
        continue
    file_path = os.path.join(folder, the_file)
    with codecs.open(file_path, "r", "utf-8") as infile:
        j = file_to_ja_g(3, infile, regexes, clean_segments, gimatria=True)
    processed[name] = {"cat": sefer, "text": j.array()}
    ja_to_xml(j.array(), ["Chapter", "Halacha", "Comment"], file_path.replace("txt", "xml"))

processed[u"הלכות תפילה וברכת כהנים"] = {
    "cat": processed[u"הלכות תפלה"]["cat"],
    "text": processed[u"הלכות תפלה"]["text"][:13] + processed[u"הלכות נשיאת כפים"]["text"][13:]
}
del processed[u"הלכות תפלה"]
del processed[u"הלכות נשיאת כפים"]
processed[u"הלכות תפילין ומזוזה וספר תורה"] = {
    # (tail of the clean_segments helper that is passed to file_to_ja_g below)
    return [clean_segment(s) for s in segments if clean_segment(s)]

processed = {}
sefarim = set()
regexes = [chapter_regex, halacha_regex]

for the_file in [x for x in os.listdir(folder) if "xml" not in x]:
    name = the_file.replace(".txt", "").split("-")[2]
    sefer = the_file.split("-")[1]
    sefarim.add(sefer)
    if name in processed:  # Skip second version of Nashim
        continue
    file_path = os.path.join(folder, the_file)
    with codecs.open(file_path, "r", "utf-8") as infile:
        j = file_to_ja_g(3, infile, regexes, clean_segments, gimatria=True)
    processed[name] = {"cat": sefer, "text": j.array()}
    ja_to_xml(j.array(), ["Chapter", "Halacha", "Comment"], file_path.replace("txt", "xml"))

processed[u"הלכות שופר וסוכה ולולב"] = {
    "cat": processed[u"הלכות שופר"]["cat"],
    "text": processed[u"הלכות שופר"]["text"][:3] + processed[u"הלכות סוכה"]["text"][3:6] + processed[u"הלכות לולב"]["text"][6:]
}
del processed[u"הלכות שופר"]
del processed[u"הלכות סוכה"]
del processed[u"הלכות לולב"]
processed[u"הלכות מגילה וחנוכה"] = {
    "cat": processed[u"הלכות מגילה"]["cat"],