def jaggedarray_from_file(input_file, perek_tag, mishna_tag, skip_tag): """ :param input_file: File to parse :param perek_tag: Used to identify the start of a new perek. :param mishna_tag: Identify next mishna. :return: A 2D jaggedArray to match Sefaria's format. Rough, will require more processing. """ chapters, mishnayot, current = [], [], [] found_first_chapter = False for line in input_file: # look for skip_tag if re.search(skip_tag, line): continue # look for tags new_chapter, new_mishna = re.search(perek_tag, line), re.search(mishna_tag, line) # make sure perek and mishna don't appear on the same line if new_chapter and new_mishna: print 'Mishna starts on same line as chapter\n' print '{}\n\n'.format(new_chapter.group()) input_file.close() sys.exit(1) # found chapter tag. if new_chapter: if found_first_chapter: if current != []: mishnayot.append(u' '.join(current).lstrip()) current = [] chapters.append(mishnayot) mishnayot = [] else: found_first_chapter = True continue if found_first_chapter: if new_mishna: if current != []: mishnayot.append(u' '.join(current).lstrip()) current = [util.multiple_replace(line, {u'\n': u'', u'\r': u'', new_mishna.group(): u''})] else: current.append(util.multiple_replace(line, {u'\n': u'', })) # add next line else: mishnayot.append(u''.join(current).lstrip()) chapters.append(mishnayot) return chapters
def clean_line(line):
    """Normalize a Hebrew citation line.

    Strips nikkud and punctuation, normalizes quote marks, removes bracketed
    Alfas references, truncates at 'see the Tur' phrases, removes the
    'lo manu' clause, and unwraps parentheses/brackets (brackets win when
    both are present).
    """
    line = strip_nikkud(line)
    # Inside a character class '.' is literal, so '[.:\?]' correctly removes
    # dots, colons and question marks (no escaping needed).
    replace_dict = {u'[.:\?]': u'', u'[”״]': u'"', u'[’׳]': u"'"}
    line = multiple_replace(line, replace_dict, using_regex=True)
    # line = re.sub(u'[:\?]', '', line)
    # line = re.sub(u'”', u'"', line)
    reg_parentheses = re.compile(u'\((.*?)\)')
    reg_brackets = re.compile(u'\[(.*?)\]')
    # Test for parens/brackets BEFORE the substitutions below mutate the line.
    in_per = reg_parentheses.search(line)
    in_bra = reg_brackets.search(line)
    reg_ayyen_tur = re.compile(u'''ו?(עיין|עי'|ע"ש) בטור''')
    reg_lo_manu = re.compile(u'''(?P<a>(\u05d0\u05da )?\u05dc\u05d0 \u05de\u05e0(.*?))(\u05e1\u05de"?\u05d2|\u05e8\u05de\u05d1"?\u05dd|\u05d8\u05d5\u05e8|\n)''')
    line = re.sub(u'\[.*?אלפס.*?\]', u'', line)
    line = re.sub(u'טור ו?שו"ע', u'טוש"ע', line)
    f_ayyen = re.search(reg_ayyen_tur, line)
    f_lo_manu = re.search(reg_lo_manu, line)
    if f_ayyen:
        # Drop everything from the 'see the Tur' phrase onward.
        line = line[:f_ayyen.start()]
    if f_lo_manu:
        # NOTE(review): the matched text is re-used as a regex pattern here; if
        # it ever contains metacharacters the removal may misfire — confirm.
        line = re.sub(f_lo_manu.group('a'), u"", line)
    if in_per:
        if in_bra:
            clean = re.sub(reg_brackets, ur'\1', line)  # brackets are always correct
            clean = re.sub(reg_parentheses, '', clean)
        else:
            clean = re.sub(reg_parentheses, ur'\1', line)
    elif in_bra:
        clean = re.sub(reg_brackets, ur'\1', line)  # brackets are always correct
    else:
        clean = line
    return clean
def cleaner(my_text): replace_dict = {u'@11(.*?)@12': ur'<b>\1</b>', u'@33(.*?)@34': ur'<b>\1</b>', u'@66(.*?)@67': ur'\1'}#, u'@55[\u05d0-\u05ea]{1,3}' : u'<i-tags = >'} new = [] for line in my_text: line = multiple_replace(line, replace_dict, using_regex=True) new.append(line) return new
def parse_en(filename): with codecs.open(filename, 'r', 'utf-8') as fp: lines = fp.readlines() ja = JaggedArray([[[[]]]]) placing = u'(\s*[0-9]{1,2}),([0-9]{1,2})-?[0-9]*\.' # the regex to find the indexing on Monk # q1, q2 = ur'“', ur'”' # Rabbi Monk uses these to enclose translation of a pasuk # dh_reg = ur'([\u05d0 - \u05ea]*), *({}.*?{})'.format(q1, q2) replace_dict = {placing: u'', u'@': ''} temp = [] indices = [0] * 3 for line in lines: pasuk_dh = re.match(placing, line) reg_dh = re.search( ur'@([\u05d0-\u05ea|\\s]*)', line) # reg_dh = re.search(ur'([\u05d0-\u05ea]+, *“.*?”)',line) line = multiple_replace(line, replace_dict, using_regex=True) if pasuk_dh or reg_dh: temp = ' '.join(temp) ja.set_element(indices, temp, []) temp = [] if pasuk_dh: indices = [ int(pasuk_dh.group(1)) - 1, int(pasuk_dh.group(2)) - 1, indices[2] ] indices[2] = 0 elif reg_dh: indices[2] += 1 if not line.isspace() and not re.match( ur' *Parshat *(\S+) *(\S+)? *', line): # don't put into array names of Parasha or empty lines temp.append(line)
def cleaner(my_text):
    """Apply the ``replace_dict`` regex substitutions to every line.

    NOTE(review): ``replace_dict`` is not defined in this function — it is
    read from enclosing/module scope; confirm it exists wherever this is used.
    """
    return [multiple_replace(entry, replace_dict, using_regex=True)
            for entry in my_text]
def cleaner(my_text):
    """Delete OCR tags from each line: @11 followed by a parenthesized group
    of up to three Hebrew letters, and bare @77 markers."""
    ocr_tags = {u'@11\([\u05d0-\u05ea]{0,3}\)': u'', u'@77': u''}
    return [multiple_replace(raw, ocr_tags, using_regex=True) for raw in my_text]
def cleaner(my_text):
    """Bold the @31/@32-delimited spans, then strip every remaining two-digit
    @NN tag from each line."""
    bold_map = {u'@31': u'<b>', u'@32': u'</b>'}
    return [re.sub(u'@[0-9]{2}', u'', multiple_replace(raw, bold_map))
            for raw in my_text]
def join_singlet_tags(infile, infile_name, tag):
    """
    Certain tags may appear on their own line when they need to be inline
    with the text. This function fixes this.
    :param infile: Input file to be edited
    :param infile_name: Path to file to be edited
    :param tag: tag to search for
    :return: The updated file
    """
    infile.seek(0)
    temp_file_name = '{}.tmp'.format(infile_name)
    temp_file = codecs.open(temp_file_name, 'w', 'utf-8')
    # Turning the trailing newline into a space glues a lone-tag line onto the
    # line that follows it when the file is rewritten.
    replacements = {u'\r': u' ', u'\n': u' '}
    # clean up problematic lines then write them to temp file
    for line in infile:
        # A "singlet" is a line whose only whitespace-delimited token matches the tag.
        if re.match(tag, line) and len(line.split()) == 1:
            line = util.multiple_replace(line, replacements)
            line = re.sub(u' +', u' ', line)
        temp_file.write(line)
    infile.close(), temp_file.close()
    # Atomically-ish replace the original file with the rewritten copy,
    # then hand back a fresh read handle on it.
    os.remove(infile_name)
    os.rename(temp_file_name, infile_name)
    return codecs.open(infile_name, 'r', 'utf-8')
def cleaner(my_text):
    """Erase OCR tags per line: @11/@77 with up to three trailing Hebrew
    letters, plus bare @33/@22 markers."""
    drop_tags = {u'@(?:11|77)[\u05d0-\u05ea]{0,3}': u'', u'@(33|22)': u''}
    cleaned = []
    for raw in my_text:
        cleaned.append(multiple_replace(raw, drop_tags, using_regex=True))
    return cleaned
def cleaner(my_text):
    """Erase OCR tags per line: @11/@77 with up to three trailing Hebrew
    letters, plus bare @33 markers."""
    drop_tags = {u'@(?:11|77)[\u05d0-\u05ea]{0,3}': u'', u'@33': u''}
    return [multiple_replace(raw, drop_tags, using_regex=True)
            for raw in my_text]
def fifty_parse(lines, replace_dict):
    """
    Parse the "fifty" part into a perek -> piska jagged array.

    :param lines: Iterable of raw text lines (composite lines are pre-split).
    :param replace_dict: Regex replacement mapping applied to every kept line.
    :return: Nested list [perek][piska][line]; also dumps raavad_50.xml.
    """
    arr = []
    perek = []
    peska = []
    # Pre-split composite lines before scanning for tags.
    new_lines = []
    for line in lines:
        new_lines.extend(split_lines(line))
    for line in new_lines:
        # Bug fix: the original used `is not -1`, an identity test against an
        # int literal; it only happens to work via CPython small-int caching.
        if line.find(u'@05') != -1:
            # @05 opens a new perek: close out the one in progress, if any.
            if perek:
                perek.append(peska)
                peska = []
                arr.append(perek)
                perek = []
        else:
            # @13 opens a new piska within the current perek.
            if line.find(u'@13') != -1 and peska:
                perek.append(peska)
                peska = []
            line = multiple_replace(line, replace_dict, using_regex=True)
            peska.append(line)
    # Close out the final piska/perek at EOF.
    perek.append(peska)
    arr.append(perek)
    ja_to_xml(arr, ['perek', 'piska', 'break'], 'raavad_50.xml')
    return arr
def raavad_perush_parse(lines, replace_dict):
    """
    Parse the Raavad commentary into a perek -> mishna -> dibur structure.

    Assumes @00 (perek) and @22 (mishna) tags sit on lines with no other text.
    :param lines: Iterable of raw text lines.
    :param replace_dict: Regex replacement mapping applied to body lines.
    :return: Nested list [perek][mishna][dibur]; also dumps raavad_text.xml.
    """
    arr = []
    first_p = True
    first_m = True
    first_d = True
    perek = []
    mishna = []
    dibur = []
    for line in lines:
        # Bug fix throughout: `!= -1` replaces the original `is not -1`
        # int-identity test (only worked via CPython small-int caching).
        if line.find(u'@00') != -1:  # perek
            if first_p:
                first_p = False
            else:
                # Close the open dibur, mishna and perek before starting anew.
                dibur = ' '.join(dibur)
                mishna.append(dibur)
                dibur = []
                perek.append(mishna)
                mishna = []
                arr.append(perek)
                perek = []
            first_m = True  # since this is opening a new perek
            first_d = True
        elif line.find(u'@22') != -1:
            # mishna (no text shares a line with @22 or @00)
            if first_m:
                first_m = False
            else:
                dibur = ' '.join(dibur)
                mishna.append(dibur)
                dibur = []
                perek.append(mishna)
                mishna = []
            first_d = True  # since this is opening a new mishna
        else:
            # Body line: @31/@98 marks the start of a new dibur hamatchil.
            if re.search(u'@(31|98)', line) and (not first_d):
                dibur = ' '.join(dibur)
                mishna.append(dibur)
                dibur = []
            else:
                if first_d:
                    first_d = False
            # segment ocr tag fixing
            line = multiple_replace(line, replace_dict, using_regex=True)
            dibur.append(line)
    # Close whatever is still open at EOF.
    dibur = ' '.join(dibur)
    mishna.append(dibur)
    perek.append(mishna)
    arr.append(perek)
    ja_to_xml(arr, ['perek', 'mishna', 'dibur'], 'raavad_text.xml')
    return arr
def cleaner(my_text): #todo: deal with @44 and @00 (@00 maybe should be only in smk base text? - ask Shmuel) replace_dict = {u'@11\([\u05d0-\u05ea]{0,3}\)': u'', u'@(33|77|88|99)': u'', u'@55(.*?)@66': u'<b>\1</b>'} new = [] for line in my_text: line = multiple_replace(line, replace_dict, using_regex=True) new.append(line) return new
def rewrtie_csv(fromcsv, newcsv, readColumnHeader, toWriteHeaders=None):
    """
    Re-extract citation targets (Rambam / Tur / Smg) from a citation CSV and
    write the enriched rows to a new CSV.

    :param fromcsv: Path of the source CSV.
    :param newcsv: Path of the CSV to write.
    :param readColumnHeader: Column to read; also names the 'fixed_' variant.
    :param toWriteHeaders: Headers for the output; defaults to the input's.
    """
    headerNames, lines = fromCSV(fromcsv, u'fixed_{}'.format(readColumnHeader), readColumnHeader)
    if not toWriteHeaders:
        toWriteHeaders = headerNames
    regs = {
        u'rambam': re.compile(
            u'(\u05e8\u05de\u05d1"\u05dd.*?)(?:\.|\u05d5?\u05d8\u05d5\u05e8|\u05d5?\u05e1\u05de"?\u05d2|\n)'
        ),
        u'smg': re.compile(
            u'(\u05e1\u05de"?\u05d2.*?)(?:\.|\u05d5?\u05d8\u05d5\u05e8|\u05d5?\u05e8\u05de\u05d1"\u05dd|\n)'
        ),
        u'tur': re.compile(u'\u05d8\u05d5\u05e8(.*?)(?:\.|:|\n|@)')
    }
    # Abbreviation expansions for Tur citations. Perf fix: hoisted out of the
    # loop — the original rebuilt this constant dict on every row.
    repdict = {
        u'טוא"ח': u'טור אורח חיים',
        u'טא"ח': u'טור אורח חיים',
        u'טי"ד': u'טור יורה דעה',
        u'טוי"ד': u'טור יורה דעה',
        u'טח"מ': u'טור חושן משפט',
        u'טוח"מ': u'טור חושן משפט'
    }
    rows = []
    siman_cit_lines = 1  # running count of citation lines in the current siman
    prv_siman = 1
    for line_dict in lines:
        line = multiple_replace(line_dict[u'full'], repdict)
        row_dict = {u'siman': line_dict[u'siman'], u'full': line}
        if line_dict[u'siman'] == prv_siman:
            siman_cit_lines += 1
        else:
            siman_cit_lines = 1
        rambam = re.search(regs[u'rambam'], line)
        if rambam:
            rambam = sarsehu(rambam.group(1).strip())
            rambam = get_a_Ref_from_chopped_txt(rambam, VERBOSE=False)
            row_dict[u'rambam'] = rambam
        tur = re.search(regs[u'tur'], line)
        if tur:
            tur = tur.group(1).strip()
            tur = get_a_Ref_from_chopped_txt(u'טור, {}'.format(tur))
            row_dict[u'tur'] = tur
        rows.append(row_dict)
        prv_siman = line_dict[u'siman']
    links, smgs = link_smg(u'fixed_{}'.format(readColumnHeader))
    for i, (smk_siman, seg, smg) in enumerate(smgs):
        if smg:
            # SECURITY: eval() on CSV-derived text executes arbitrary code if
            # the file is untrusted; prefer ast.literal_eval. Left as-is here
            # to avoid changing the set of accepted inputs.
            rows[i][u'smg'] = eval(smg)
            rows[i][u'smk_segment'] = seg
    toCSV(newcsv, rows, toWriteHeaders)
def cleaner(my_text): replace_dict = {u'@11(.*?)@12': ur'<b>\1</b>', u'@33(.*?)@34': ur'<b>\1</b>', u'@66(.*?)@67': ur'\1', u"@44": u""} new = [] for line in my_text: line = multiple_replace(line, replace_dict, using_regex=True) new.append(line) return new
def cleaner(my_text):
    """Replace @31/@32 with <b>/</b>, then erase any other two-digit @ tag."""
    out = []
    for raw in my_text:
        bolded = multiple_replace(raw, {u'@31': u'<b>', u'@32': u'</b>'})
        out.append(re.sub(u'@[0-9]{2}', u'', bolded))
    return out
def raavad_perush_parse(lines, replace_dict):
    """
    Parse the Raavad commentary into a perek -> mishna -> dibur structure.

    Assumes @00 (perek) and @22 (mishna) tags sit on lines with no other text.
    :param lines: Iterable of raw text lines.
    :param replace_dict: Regex replacement mapping applied to body lines.
    :return: Nested list [perek][mishna][dibur]; also dumps raavad_text.xml.
    """
    arr = []
    first_p = True
    first_m = True
    first_d = True
    perek = []
    mishna = []
    dibur = []
    for line in lines:
        # Bug fix throughout: `!= -1` replaces the original `is not -1`
        # int-identity test (only worked via CPython small-int caching).
        if line.find(u'@00') != -1:  # perek
            if first_p:
                first_p = False
            else:
                # Close the open dibur, mishna and perek before starting anew.
                dibur = ' '.join(dibur)
                mishna.append(dibur)
                dibur = []
                perek.append(mishna)
                mishna = []
                arr.append(perek)
                perek = []
            first_m = True  # since this is opening a new perek
            first_d = True
        elif line.find(u'@22') != -1:
            # mishna (no text shares a line with @22 or @00)
            if first_m:
                first_m = False
            else:
                dibur = ' '.join(dibur)
                mishna.append(dibur)
                dibur = []
                perek.append(mishna)
                mishna = []
            first_d = True  # since this is opening a new mishna
        else:
            # Body line: @31/@98 marks the start of a new dibur hamatchil.
            if re.search(u'@(31|98)', line) and (not first_d):
                dibur = ' '.join(dibur)
                mishna.append(dibur)
                dibur = []
            else:
                if first_d:
                    first_d = False
            # segment ocr tag fixing
            line = multiple_replace(line, replace_dict, using_regex=True)
            dibur.append(line)
    # Close whatever is still open at EOF.
    dibur = ' '.join(dibur)
    mishna.append(dibur)
    perek.append(mishna)
    arr.append(perek)
    ja_to_xml(arr, ['perek', 'mishna', 'dibur'], 'raavad_text.xml')
    return arr
def before_post_cleaner(ja, replace_dict):
    """
    Run regex replacements over every segment of a 2-level jagged array,
    dropping segments left containing an empty <small></small> pair.

    :param ja: 2-level nested list of text segments.
    :param replace_dict: Regex replacement mapping for multiple_replace.
    :return: New 2-level nested list, simanim in original order.
    """
    new_ja = []
    # Idiom fix: the original enumerate() indices (i, seg_number) were unused.
    for siman in ja:
        new_siman = []
        for seg in siman:
            seg = multiple_replace(seg, replace_dict, using_regex=True)
            # An empty <small></small> means the replacements gutted the segment.
            if re.search(u'<small></small>', seg):
                continue
            new_siman.append(seg)
        new_ja.append(new_siman)
    return new_ja
def convert_smg(smg_str):
    """Rewrite legacy 'Volume One/Two' Sefer Mitzvot Gadol titles to the
    Negative/Positive/Rabbinic Commandments naming and return the result."""
    conv_table = {
        u'Sefer Mitzvot Gadol, Volume One ':
            u'Sefer Mitzvot Gadol, Negative Commandments ',
        u'Sefer Mitzvot Gadol, Volume Two ':
            u'Sefer Mitzvot Gadol, Positive Commandments ',
        u'Sefer Mitzvot Gadol, Volume Two, Laws of Eruvin ':
            u'Sefer Mitzvot Gadol, Rabbinic Commandments, Laws of Eruvin ',
        u'Sefer Mitzvot Gadol, Volume Two, Laws of Mourning ':
            u'Sefer Mitzvot Gadol, Rabbinic Commandments, Laws of Mourning ',
        u"Sefer Mitzvot Gadol, Volume Two, Laws of Tisha B'Av ":
            u"Sefer Mitzvot Gadol, Rabbinic Commandments, Laws of Tisha B'Av ",
        u'Sefer Mitzvot Gadol, Volume Two, Laws of Megillah ':
            u'Sefer Mitzvot Gadol, Rabbinic Commandments, Laws of Megillah ',
        u'Sefer Mitzvot Gadol, Volume Two, Laws of Chanukah ':
            u'Sefer Mitzvot Gadol, Rabbinic Commandments, Laws of Chanukah '
    }
    return multiple_replace(smg_str, conv_table, using_regex=True)
def cleaner(my_text): replace_dict = { u'@11(.*?)@12': ur'<b>\1</b>', u'@33(.*?)@34': ur'<b>\1</b>', u'@66(.*?)@67': ur'\1', u"@44": u"" } new = [] for line in my_text: line = multiple_replace(line, replace_dict, using_regex=True) new.append(line) return new
def clean_and_align(section):
    """
    Clean tags and stray characters out of a raw parse section.

    :param section: List of strings representing a raw text segment.
    :return: List of cleaned strings, one per input line, order preserved.
    """
    def _scrub(text):
        # two-digit @ tags, stray !/* marks, runs of spaces, then line endings
        text = re.sub(u'@[0-9]{2}', u'', text)
        text = re.sub(u'[!*]', u'', text)
        text = re.sub(u' +', u' ', text)
        return util.multiple_replace(text, {u'\n': u'', u'\r': u''})

    return [_scrub(entry) for entry in section]
def rewrtie_csv(fromcsv, newcsv, readColumnHeader, toWriteHeaders=None):
    """
    Re-extract citation targets (Rambam / Tur / Smg) from a citation CSV and
    write the enriched rows to a new CSV.

    :param fromcsv: Path of the source CSV.
    :param newcsv: Path of the CSV to write.
    :param readColumnHeader: Column to read; also names the 'fixed_' variant.
    :param toWriteHeaders: Headers for the output; defaults to the input's.
    """
    headerNames, lines = fromCSV(fromcsv, u'fixed_{}'.format(readColumnHeader), readColumnHeader)
    if not toWriteHeaders:
        toWriteHeaders = headerNames
    regs = {
        u'rambam': re.compile(
            u'(\u05e8\u05de\u05d1"\u05dd.*?)(?:\.|\u05d5?\u05d8\u05d5\u05e8|\u05d5?\u05e1\u05de"?\u05d2|\n)'
        ),
        u'smg': re.compile(
            u'(\u05e1\u05de"?\u05d2.*?)(?:\.|\u05d5?\u05d8\u05d5\u05e8|\u05d5?\u05e8\u05de\u05d1"\u05dd|\n)'
        ),
        u'tur': re.compile(u'\u05d8\u05d5\u05e8(.*?)(?:\.|:|\n|@)')
    }
    # Abbreviation expansions for Tur citations. Perf fix: hoisted out of the
    # loop — the original rebuilt this constant dict on every row.
    repdict = {
        u'טוא"ח': u'טור אורח חיים',
        u'טא"ח': u'טור אורח חיים',
        u'טי"ד': u'טור יורה דעה',
        u'טוי"ד': u'טור יורה דעה',
        u'טח"מ': u'טור חושן משפט',
        u'טוח"מ': u'טור חושן משפט'
    }
    rows = []
    siman_cit_lines = 1  # running count of citation lines in the current siman
    prv_siman = 1
    for line_dict in lines:
        line = multiple_replace(line_dict[u'full'], repdict)
        row_dict = {u'siman': line_dict[u'siman'], u'full': line}
        if line_dict[u'siman'] == prv_siman:
            siman_cit_lines += 1
        else:
            siman_cit_lines = 1
        rambam = re.search(regs[u'rambam'], line)
        if rambam:
            rambam = sarsehu(rambam.group(1).strip())
            rambam = get_a_Ref_from_chopped_txt(rambam, VERBOSE=False)
            row_dict[u'rambam'] = rambam
        tur = re.search(regs[u'tur'], line)
        if tur:
            tur = tur.group(1).strip()
            tur = get_a_Ref_from_chopped_txt(u'טור, {}'.format(tur))
            row_dict[u'tur'] = tur
        rows.append(row_dict)
        prv_siman = line_dict[u'siman']
    links, smgs = link_smg(u'fixed_{}'.format(readColumnHeader))
    for i, (smk_siman, seg, smg) in enumerate(smgs):
        if smg:
            # SECURITY: eval() on CSV-derived text executes arbitrary code if
            # the file is untrusted; prefer ast.literal_eval. Left as-is here
            # to avoid changing the set of accepted inputs.
            rows[i][u'smg'] = eval(smg)
            rows[i][u'smk_segment'] = seg
    toCSV(newcsv, rows, toWriteHeaders)
def clean(JA, replace_dict):
    """
    Clean a 2-level JaggedArray by applying regex replacements to every segment.

    :param JA: JaggedArray object holding the text to be cleaned.
    :param replace_dict: Mapping of regex patterns to replacements.
    :return: A new JaggedArray of the cleaned text (also dumps clean_smk.xml).
    """
    cleaned = [
        [multiple_replace(segment, replace_dict, using_regex=True)
         for segment in letter]
        for letter in JA.array()
    ]
    ja_to_xml(cleaned, ['letter', 'segments'], 'clean_smk.xml')
    return JaggedArray(cleaned)
def threty_two_parse(lines, replace_dict, str):
    """
    Parse the 32-netivot section: each @13/@03 tag starts a new netiv; lines
    between tags are cleaned and joined with spaces.

    :param lines: Iterable of raw lines.
    :param replace_dict: Regex replacement mapping for multiple_replace.
    :param str: Output-file prefix (shadows the builtin; kept for callers).
    :return: List of netiv strings; also dumps <str>_32.xml.
    """
    arr = []
    netiv = []
    seen_tag = False
    for raw in lines:
        if re.search(u'@(13|03)', raw):
            # Flush the previous netiv on every tag except the very first.
            if seen_tag:
                arr.append(' '.join(netiv))
                netiv = []
            seen_tag = True
        netiv.append(multiple_replace(raw, replace_dict, using_regex=True).strip())
    arr.append(' '.join(netiv))
    ja_to_xml(arr, ['netiv'], '{}{}'.format(str, '_32.xml'))
    return arr
def threty_two_parse(lines, replace_dict, str):
    """
    Split the input into netivot at @13/@03 markers, cleaning each line.

    :param lines: Iterable of raw lines.
    :param replace_dict: Regex replacement mapping for multiple_replace.
    :param str: Output-file prefix (shadows the builtin; kept for callers).
    :return: List of netiv strings; also dumps <str>_32.xml.
    """
    collected = []
    current = []
    is_first_marker = True
    for line in lines:
        starts_netiv = re.search(u'@(13|03)', line) is not None
        if starts_netiv and not is_first_marker:
            # A marker after the first one closes the netiv in progress.
            collected.append(' '.join(current))
            current = []
        if starts_netiv:
            is_first_marker = False
        cleaned = multiple_replace(line, replace_dict, using_regex=True)
        current.append(cleaned.strip())
    collected.append(' '.join(current))
    ja_to_xml(collected, ['netiv'], '{}{}'.format(str, '_32.xml'))
    return collected
def ari_parse():
    # Parse yitzira_mishna.txt into a perek -> mishna jagged array:
    # @00 at line start opens a perek, @22 at line start opens a mishna,
    # everything else is cleaned body text.
    with codecs.open("yitzira_mishna.txt", "r", "utf-8") as fp:
        lines = fp.readlines()
    parsed = []
    perek = []
    mishna = []
    starting = None
    # dictionary for line ocr tag fixing
    replace_dict = {
        u"@(44)": u"<small>",
        u"@(45)": u"</small>",  # bava in parenthesis
        ur"(@(11|12|66|67)|\[\*.*?\])": u"",  # ocr tags that are not relevant (including erasing footnotes)
    }
    # check if we got to the end of the legend and change to started
    for line_num, line in enumerate(lines):
        if line == u"\n":
            starting = line_num + 1
            break
    for line in lines[starting:]:
        if line.find(u"@00") == 0:  # new perek: flush the previous one
            if perek:
                mishna = " ".join(mishna)
                perek.append(mishna)
                mishna = []
                parsed.append(perek)
                perek = []
        elif line.find(u"@22") == 0:  # new mishna: flush the previous one
            if mishna:
                # NOTE(review): joined with "" here but " " everywhere else —
                # confirm the space-less join is intentional.
                mishna = "".join(mishna)
                perek.append(mishna)
                mishna = []
        else:
            line = multiple_replace(line, replace_dict, using_regex=True)
            mishna.append(line.strip())
    # EOF: close the final mishna and perek.
    mishna = " ".join(mishna)
    perek.append(mishna)
    parsed.append(perek)
    # ja_to_xml(parsed,['perek', 'mishna'])
    return parsed
def ari_parse():
    # Parse yitzira_mishna.txt into a perek -> mishna jagged array:
    # @00 at line start opens a perek, @22 at line start opens a mishna,
    # everything else is cleaned body text.
    with codecs.open('yitzira_mishna.txt', 'r', 'utf-8') as fp:
        lines = fp.readlines()
    parsed = []
    perek = []
    mishna = []
    starting = None
    # dictionary for line ocr tag fixing
    replace_dict = {u'@(44)': u'<small>', u'@(45)': u'</small>',  # bava in parenthesis
                    ur'(@(11|12|66|67)|\[\*.*?\])': u''  # ocr tags that are not relevant (including erasing footnotes)
                    }
    # check if we got to the end of the legend and change to started
    for line_num, line in enumerate(lines):
        if line == u'\n':
            starting = line_num + 1
            break
    for line in lines[starting:]:
        if line.find(u'@00') == 0:  # new perek: flush the previous one
            if perek:
                mishna = ' '.join(mishna)
                perek.append(mishna)
                mishna = []
                parsed.append(perek)
                perek = []
        elif line.find(u'@22') == 0:  # new mishna: flush the previous one
            if mishna:
                # NOTE(review): joined with '' here but ' ' everywhere else —
                # confirm the space-less join is intentional.
                mishna = ''.join(mishna)
                perek.append(mishna)
                mishna = []
        else:
            line = multiple_replace(line, replace_dict, using_regex=True)
            mishna.append(line.strip())
    # EOF: close the final mishna and perek.
    mishna = ' '.join(mishna)
    perek.append(mishna)
    parsed.append(perek)
    # ja_to_xml(parsed,['perek', 'mishna'])
    return parsed
def structure_boaz(chapter):
    # Fold a chapter's lines into comment-level strings: a @22 line starts a
    # new comment, a @23 line becomes an inline <br> appended to the previous
    # comment, @99 lines and blanks are dropped, and anything else is appended
    # to the comment in progress.
    new_comment = re.compile(u'@22')
    break_tag = re.compile(u'@23')
    skip_tag = re.compile(u'@99')
    parsed = []
    for line in chapter:
        line = util.multiple_replace(line, {u'\n': u'', u'\r': u''})
        if new_comment.match(line):
            parsed.append(line)
        elif break_tag.match(line):
            # break_tag.pattern is the literal text '@23'.
            line = line.replace(break_tag.pattern, u'<br>')
            # NOTE(review): assumes a @22 line appeared first; raises
            # IndexError on an empty `parsed` otherwise — confirm input shape.
            parsed[-1] += line
        elif skip_tag.match(line) or line == u'':
            continue
        else:
            parsed[-1] += u' {}'.format(line)
    return parsed
def fifty_parse(lines, replace_dict):
    """
    Parse the "fifty" part into a perek -> piska jagged array.

    :param lines: Iterable of raw lines; composite lines are pre-split.
    :param replace_dict: Regex replacement mapping applied to kept lines.
    :return: Nested list [perek][piska][line].
    """
    arr = []
    perek = []
    peska = []
    # Pre-split composite lines before scanning for tags.
    new_lines = []
    for line in lines:
        new_lines.extend(split_lines(line))
    for line in new_lines:
        # Bug fix: `!= -1` replaces the original `is not -1` int-identity test.
        if line.find(u'@05') != -1:
            # @05 opens a new perek; close out the previous one first.
            if perek:
                perek.append(peska)
                peska = []
                arr.append(perek)
                perek = []
        else:
            # @13 opens a new piska inside the current perek.
            if line.find(u'@13') != -1 and peska:
                perek.append(peska)
                peska = []
            peska.append(multiple_replace(line, replace_dict, using_regex=True))
    # Bug fix: the original dropped the final piska/perek and returned None;
    # close them out and return the array (matching the sibling fifty_parse).
    perek.append(peska)
    arr.append(perek)
    return arr
def text_parse(): # open, read, close the original txt file with codecs.open('yitzira_gra.txt', 'r', 'utf-8') as fp: lines = fp.readlines() starting = None # check if we got to the end of the legend and change to started for line_num, line in enumerate(lines): if line == u'\n': starting = line_num + 1 break # init section lists and flags parsed = [] perek = [] mishna = [] dibur = [] first_p = True # first perek flag first_m = True # first mishna flag first_d = True # first dibur flag ofen = False # 'ofen' flag # dictionary for line ocr tag fixing replace_dict = {u'@03': u'<b>', u'@04': u'</b><br>', # title 'Ofen' in the gra's commentary u'@11': u'', # not necessary ocr tag u'@31': u'<b>', u'@32': u'</b>', # bold dibur hamatchil u'@44': u'<b>', u'@45': u'</b>', # was bold in text u'@98': u'<small>', u'@99': u'</small>', # the slik at the end ur'\*\[(.*?)\]': ur'<small>[\1]</small>' # footnotes } # loop on lines and creat the jagged array for line in lines[starting:]: if line.find(u'@00') is not -1: # perek if first_p: first_p = False else: dibur = ' '.join(dibur) mishna.append(dibur) dibur = [] perek.append(mishna) mishna = [] parsed.append(perek) perek = [] first_m = True # since this is opening a new perek elif line.find(u'@22') == 0: # notice that this parsing is given that there is no text on same line with @22 and @00 # mishna if first_m: first_m = False else: dibur = ' '.join(dibur) mishna.append(dibur) dibur = [] perek.append(mishna) mishna = [] first_d = True # since this is opening a new mishna else: # this line is going to be part of the dibur # Dibur Hamatchil if re.search(u'@(03|31|98)', line): # probably start a new dibur if (not ofen) and (not first_d): # prob close prev dibur dibur = ' '.join(dibur) mishna.append(dibur) dibur = [] else: if ofen: ofen = False if first_d: first_d = False if re.search(u'@03', line): ofen = True # segment ocr tag fixing line = multiple_replace(line, replace_dict, using_regex = True) dibur.append(line) # once reached the 
end close all what was opened dibur = ' '.join(dibur) mishna.append(dibur) perek.append(mishna) parsed.append(perek) # ja_to_xml(parsed,['perek','mishna','dibur'],filename = 'gra.xml') return parsed
def jaggedarray_from_file(input_file, perek_tag, mishna_tag, skip_tag): """ :param input_file: File to parse :param perek_tag: Used to identify the start of a new perek. :param mishna_tag: Identify next mishna. :return: A 2D jaggedArray to match Sefaria's format. Rough, will require more processing. """ chapters, mishnayot, current = [], [], [] found_first_chapter = False for line in input_file: # look for skip_tag if re.search(skip_tag, line): continue # look for tags new_chapter, new_mishna = re.search(perek_tag, line), re.search(mishna_tag, line) # make sure perek and mishna don't appear on the same line if new_chapter and new_mishna: print 'Mishna starts on same line as chapter\n' print '{}\n\n'.format(new_chapter.group()) input_file.close() sys.exit(1) # found chapter tag. if new_chapter: if found_first_chapter: if current != []: mishnayot.append(u' '.join(current).lstrip()) current = [] chapters.append(mishnayot) mishnayot = [] else: found_first_chapter = True continue if found_first_chapter: if new_mishna: if current != []: mishnayot.append(u' '.join(current).lstrip()) current = [ util.multiple_replace(line, { u'\n': u'', u'\r': u'', new_mishna.group(): u'' }) ] else: current.append(util.multiple_replace(line, { u'\n': u'', })) # add next line else: mishnayot.append(u''.join(current).lstrip()) chapters.append(mishnayot) return chapters
def text_parse(): # open, read, close the original txt file with codecs.open('yitzira_gra.txt', 'r', 'utf-8') as fp: lines = fp.readlines() starting = None # check if we got to the end of the legend and change to started for line_num, line in enumerate(lines): if line == u'\n': starting = line_num + 1 break # init section lists and flags parsed = [] perek = [] mishna = [] dibur = [] first_p = True # first perek flag first_m = True # first mishna flag first_d = True # first dibur flag ofen = False # 'ofen' flag # dictionary for line ocr tag fixing replace_dict = { u'@03': u'<b>', u'@04': u'</b><br>', # title 'Ofen' in the gra's commentary u'@11': u'', # not necessary ocr tag u'@31': u'<b>', u'@32': u'</b>', # bold dibur hamatchil u'@44': u'<b>', u'@45': u'</b>', # was bold in text u'@98': u'<small>', u'@99': u'</small>', # the slik at the end ur'\*\[(.*?)\]': ur'<small>[\1]</small>' # footnotes } # loop on lines and creat the jagged array for line in lines[starting:]: if line.find(u'@00') is not -1: # perek if first_p: first_p = False else: dibur = ' '.join(dibur) mishna.append(dibur) dibur = [] perek.append(mishna) mishna = [] parsed.append(perek) perek = [] first_m = True # since this is opening a new perek elif line.find( u'@22' ) == 0: # notice that this parsing is given that there is no text on same line with @22 and @00 # mishna if first_m: first_m = False else: dibur = ' '.join(dibur) mishna.append(dibur) dibur = [] perek.append(mishna) mishna = [] first_d = True # since this is opening a new mishna else: # this line is going to be part of the dibur # Dibur Hamatchil if re.search(u'@(03|31|98)', line): # probably start a new dibur if (not ofen) and (not first_d): # prob close prev dibur dibur = ' '.join(dibur) mishna.append(dibur) dibur = [] else: if ofen: ofen = False if first_d: first_d = False if re.search(u'@03', line): ofen = True # segment ocr tag fixing line = multiple_replace(line, replace_dict, using_regex=True) dibur.append(line) # once reached the 
end close all what was opened dibur = ' '.join(dibur) mishna.append(dibur) perek.append(mishna) parsed.append(perek) # ja_to_xml(parsed,['perek','mishna','dibur'],filename = 'gra.xml') return parsed
def _extract_important_data(self):
    # Walk self.lines (HTML fragments) and split the text into a
    # books -> parashot -> sections -> segments hierarchy, keyed off
    # underlined (<u>) headers. Collection starts at the 'ספר בראשית' header.
    book_names, parsha_names = [], []
    books, parashot, sections, segments = [], None, None, None

    def start_condition(html_fragment):
        # True once the first book header ('ספר בראשית') is seen.
        soup = html_fragment
        if soup.u is not None:
            if soup.u.text == u'ספר בראשית':
                return True
        return False

    def text_quote(html_fragment):
        # A <div> inside the fragment marks a quoted text segment.
        soup = html_fragment
        if soup.div is None:
            return False
        else:
            return True

    def new_parsha(html_frament):
        # An underlined header containing 'פרשת ' starts a new parsha.
        soup = html_frament
        if soup.u is None:
            return False
        else:
            if re.search(u'פרשת ', soup.u.text):
                return True
            else:
                return False

    def new_book(html_frament):
        # An underlined header containing 'ספר' starts a new book.
        soup = html_frament
        if soup.u is None:
            return False
        else:
            if re.search(u'ספר', soup.u.text):
                return True
            else:
                return False

    text_started = False
    for line in self.lines:
        line = multiple_replace(line, {u'\n': u'', u'\r': u''})
        # Only <B...> lines carry content of interest.
        if re.match(u'<B', line) is None:
            continue
        soup = BeautifulSoup(line, 'html5lib')
        if text_started:
            if new_book(soup):
                # add book name; close the previous book's open containers
                book_names.append(soup.u.text)
                if parashot is not None:
                    sections.append(segments)
                    parashot.append(sections)
                    books.append(parashot)
                parashot, sections, segments = [], None, None
            elif new_parsha(soup):
                parsha_names.append(soup.u.text)
                if sections is not None:
                    sections.append(segments)
                    parashot.append(sections)
                sections, segments = [], None
                # A quote on the same header line opens the first segment.
                if text_quote(soup):
                    if segments is not None:
                        sections.append(segments)
                    segments = [soup.div.text]
            elif text_quote(soup):
                # A new quoted segment closes the one in progress.
                if segments is not None:
                    sections.append(segments)
                segments = [soup.div.text]
            else:
                if soup.text == u'':
                    continue
                else:
                    segments.append(soup.text)
        else:
            text_started = start_condition(soup)
            if text_started:
                book_names.append(u'ספר בראשית')
                parashot = []
    else:
        # Loop exhausted (no break): close the trailing section/parsha/book.
        sections.append(segments)
        parashot.append(sections)
        books.append(parashot)
    return {
        'book names': book_names,
        'parsha names': parsha_names,
        'full_text': books
    }