def parse():
    """Parse pardes_rimonim.html (windows-1255) into a gate/chapter JaggedArray.

    Scans line by line for gate/chapter headers, accumulating each chapter's
    text and storing it under 0-based [gate, chapter] in ``root``.
    NOTE(review): the visible block ends at the ``<img`` branch's ``continue``;
    the handling of ordinary (non-header, non-image) text lines appears to be
    truncated here — confirm against the full file.
    """
    with codecs.open('pardes_rimonim.html', 'r', 'windows-1255') as infile:
        lines = infile.readlines()
    gate, chapter, whole_text = -1, -1, []
    root = JaggedArray([[]])
    found_beginning = False
    # Header format: "<b>שער <gate-numeral> פרק <chapter-numeral>"
    beginning = re.compile(
        ur'^<b>\u05e9\u05e2\u05e8 ([\u05d0-\u05ea]{1,2}) \u05e4\u05e8\u05e7 ([\u05d0-\u05ea]{1,2})'
    )
    for line in lines:
        match = beginning.search(line)
        if match:
            if found_beginning:
                # Flush the chapter collected so far before starting a new one.
                if re.search(ur'^\u05e4\u05e8\u05e7', whole_text[0]):
                    # strip out some unnecessary text
                    root.set_element([gate, chapter], whole_text[1:], pad=[])
                else:
                    root.set_element([gate, chapter], whole_text, pad=[])
                whole_text = []
            else:
                found_beginning = True
            # Headers are 1-based Hebrew numerals; internal indices are 0-based.
            new_gate, new_chapter = getGematria(
                match.group(1)) - 1, getGematria(match.group(2)) - 1
            if new_gate - gate > 1 or new_chapter - chapter > 1:
                # A jump of more than one signals a skipped section in the source.
                print 'skip found at Gate {} Chapter {}'.format(
                    new_gate + 1, new_chapter + 1)
            gate, chapter = new_gate, new_chapter
        elif found_beginning:
            if re.search(ur'<img', line):
                # Fold the image markup into the previous text segment.
                whole_text[-1] = add_image(line, whole_text[-1])
                continue
def link_semak_raph(smk_ja, raph_ja):
    """Report how Semak @55 letter markers align with Raph segments.

    Collects every @55 letter (with its indices) from ``smk_ja``, collects the
    first segment of each Raph letter from ``raph_ja``, then zips the two lists
    and prints matches / mismatches plus a final problem count.
    """
    # if segment in smak_ja has a @55[\u05d0-\u05ea]{0,3} extract the letter
    # and match it to the segment in the ja_raph
    # by running on the ja_raph segments
    smk_raph = []
    raph_letter = []
    for seg in traverse_ja(smk_ja):
        if re.search(u'@55[\u05d0-\u05ea]{0,3}', seg['data']):
            for letter in re.findall(u'@55([\u05d0-\u05ea]{0,3})', seg['data']):
                # smk_raph.append([seg['indices'][:], letter])
                smk_raph.append([letter, seg['indices']])
    # keep only the first segment of each (siman, letter) pair in the Raph
    last = [-1, -1]
    for seg in traverse_ja(raph_ja):
        if seg['indices'][0:2] == last[0:2]:
            continue
        else:
            raph_letter.append(seg)
            last = seg['indices']
    problem_count = 0
    for smk, raph in zip(smk_raph, raph_letter):
        # the Semak letter value should equal the 1-based Raph letter position
        if getGematria(smk[0]) == (raph['indices'][1]+1):
            print getGematria(smk[0]), raph['indices'][1]+1, \
                [item+1 for item in smk[1]], [item +1 for item in raph['indices']]
        else:
            problem_count +=1
            print 'problem:', getGematria(smk[0]), raph['indices'][1]+1,\
                [item+1 for item in smk[1]], [item +1 for item in raph['indices']]
    print problem_count
def parse():
    """Parse pardes_rimonim.html into a JaggedArray indexed [gate][chapter].

    Header lines start a new chapter; accumulated text is flushed into ``root``
    when the next header appears. Prints a warning when the header numbering
    jumps. NOTE(review): block ends at the image branch's ``continue`` — the
    plain-text accumulation path seems to be cut off in this view; verify.
    """
    with codecs.open('pardes_rimonim.html', 'r', 'windows-1255') as infile:
        lines = infile.readlines()
    gate, chapter, whole_text = -1, -1, []
    root = JaggedArray([[]])
    found_beginning = False
    # "<b>שער X פרק Y" — groups capture the two Hebrew numerals
    beginning = re.compile(ur'^<b>\u05e9\u05e2\u05e8 ([\u05d0-\u05ea]{1,2}) \u05e4\u05e8\u05e7 ([\u05d0-\u05ea]{1,2})')
    for line in lines:
        match = beginning.search(line)
        if match:
            if found_beginning:
                # flush the finished chapter
                if re.search(ur'^\u05e4\u05e8\u05e7', whole_text[0]):
                    # strip out some unnecessary text
                    root.set_element([gate, chapter], whole_text[1:], pad=[])
                else:
                    root.set_element([gate, chapter], whole_text, pad=[])
                whole_text = []
            else:
                found_beginning = True
            # convert 1-based numerals to 0-based indices
            new_gate, new_chapter = getGematria(match.group(1))-1, getGematria(match.group(2))-1
            if new_gate - gate > 1 or new_chapter - chapter > 1:
                print 'skip found at Gate {} Chapter {}'.format(new_gate+1, new_chapter+1)
            gate, chapter = new_gate, new_chapter
        elif found_beginning:
            if re.search(ur'<img', line):
                # attach image markup to the previous segment
                whole_text[-1] = add_image(line, whole_text[-1])
                continue
def scrape_wiki(): url = u"https://he.wikipedia.org/wiki/%D7%9E%D7%A0%D7%99%D7%99%D7%9F_%D7%94%D7%9E%D7%A6%D7%95%D7%95%D7%AA_%D7%A2%D7%9C_%D7%A4%D7%99_%D7%A1%D7%A4%D7%A8_%D7%94%D7%97%D7%99%D7%A0%D7%95%D7%9A" page = requests.get(url) soup_body = BeautifulSoup(page.text, "lxml") tables = soup_body.select(".mw-parser-output > table") pairs = [] links = [] for table in tables: table_tr = table.select("tr") for col in table_tr: pairs.append((col.contents[1].text.strip(), re.sub(u'</?td>', u'', col.contents[-1].text).strip())) for pair in pairs: if re.search(u'ספר|מספר', pair[0]): continue neg_pos = u"Negative Mitzvot" if re.search(u"לאו", pair[1]) else u'Positive Mitzvot' rambam = getGematria(re.sub(u'עשה|לאו', u'', pair[1]).strip()) chinukh = getGematria(pair[0]) print chinukh, rambam chinukh_simanlen = len(Ref(u'Sefer HaChinukh.{}'.format(chinukh)).all_segment_refs()) print neg_pos link = ({"refs": [ u'Sefer HaChinukh.{}.{}-{}'.format(chinukh, 1, chinukh_simanlen), u'Mishneh Torah, {}.{}'.format(neg_pos, rambam) ], "type": "Sifrei Mitzvot", "auto": True, "generated_by": "chinukh_rambam_sfm_linker" # _sfm_linker what is this parametor intended to be? }) print link['refs'] links.append(link) return links
def parse_Raph_simanim(alinged_list):
    """Group aligned Raph rows into a siman-level jagged array.

    note: although there is (not often) a differentiation in the original txt
    file, raph letters can be divided into smaller segments. In this code we
    combined those segments, returning every raph letter as a line. Skipped
    simanim are padded with empty lists; the result is also dumped to
    raph_simanim.xml.
    """
    result = []
    current = []
    prev_siman = u'א'
    for row in alinged_list:
        if row['siman'] == prev_siman:
            # same siman: keep accumulating its letters
            current.append(row['raph'])
            continue
        result.append(current)
        # pad with empties until the gap to the new siman is closed
        gap = 1
        while getGematria(row['siman']) != (getGematria(prev_siman) + gap):
            result.append([])
            gap += 1
        current = [row['raph']]
        prev_siman = row['siman']
    result.append(current)
    ja_to_xml(result, ['siman', 'letter'], 'raph_simanim.xml')
    return result
def xmlify(filename): """ create an xml representation of the text files :param filename: str name of file """ with codecs.open(filename, 'r', 'utf-8') as infile: raw_rambam = infile.read() chap_index = [getGematria(i.group(1)) for i in re.finditer(ur'@00\u05e4\u05e8\u05e7 ([\u05d0-\u05ea]{1,2})', raw_rambam)] chapters = re.split(ur'@00\u05e4\u05e8\u05e7 [\u05d0-\u05ea]{1,2}', raw_rambam)[1:] assert len(chap_index) == len(chapters) soup = BeautifulSoup(u'<root></root>', 'xml') for index, chapter in zip(chap_index, chapters): x_chapter = soup.new_tag('chapter', num=unicode(index)) soup.root.append(x_chapter) v_indices = [getGematria(i.group(1)) for i in re.finditer(ur'@22([\u05d0-\u05ea]{1,2})', chapter)] verses = re.split(ur'@22[\u05d0-\u05ea]{1,2}', chapter)[1:] assert len(v_indices) == len(verses) for v_index, verse in zip(v_indices, verses): x_verse = soup.new_tag('verse', num=unicode(v_index)) comments = verse.splitlines() for i, comment in enumerate(comments[1:]): x_comment = soup.new_tag('comment', num=unicode(i+1)) x_comment.append(comment) x_verse.append(x_comment) x_chapter.append(x_verse) with codecs.open('./xml/{}'.format(filename.replace('.txt', '.xml')), 'w', 'utf-8') as outfile: outfile.write(unicode(soup.prettify()))
def siman_smk_exctractor(smk_text):
    """Extract Semak siman numbers from a citation string.

    Handles plain gematria words, vav-prefixed numerals (via ``check_vav``)
    and dashed ranges such as "א-ג" (expanded to every siman in the range).
    Returns a list of ints; returns early at the first unparseable word.
    """
    split = re.split(u'\s', smk_text)
    simanim = []
    for word in split:
        # skip empty tokens and the "siman"/"seif" label words
        if not word or word == u'סימן' or word == u'סעיף':
            continue
        word = re.sub(u"[;.,']", u"", word)
        if re.search(u'-', word):
            # dashed range: expand to every siman from start to end inclusive
            borders = re.search(u"(.*?)-(.*)", word)
            start = getGematria(borders.group(1))
            end = getGematria(borders.group(2))
            for siman in range(start, end+1):
                simanim.append(siman)
        # NOTE(review): a range word also falls through to the checks below;
        # since it is not itself a Hebrew numeral, this usually triggers the
        # early return and drops any words after the range — confirm intended.
        if not is_hebrew_number(word):
            if not check_vav(word):
                # print smk_text, simanim
                return simanim
            else:
                simanim.append(check_vav(word))
        else:
            smk_siman = getGematria(word)
            simanim.append(smk_siman)
    # print smk_text, simanim
    return simanim
def link_semak_raph(smk_ja, raph_ja):
    """Check alignment between Semak @55 markers and Raph letter segments.

    Zips the extracted @55 letters against the first segment of each Raph
    letter and prints every pair, flagging mismatches; ends with the total
    number of problems.
    """
    # if segment in smak_ja has a @55[\u05d0-\u05ea]{0,3} extract the letter
    # and match it to the segment in the ja_raph
    # by running on the ja_raph segments
    smk_raph = []
    raph_letter = []
    for seg in traverse_ja(smk_ja):
        if re.search(u'@55[\u05d0-\u05ea]{0,3}', seg['data']):
            for letter in re.findall(u'@55([\u05d0-\u05ea]{0,3})', seg['data']):
                # smk_raph.append([seg['indices'][:], letter])
                smk_raph.append([letter, seg['indices']])
    # deduplicate: keep only the first segment per (siman, letter)
    last = [-1, -1]
    for seg in traverse_ja(raph_ja):
        if seg['indices'][0:2] == last[0:2]:
            continue
        else:
            raph_letter.append(seg)
            last = seg['indices']
    problem_count = 0
    for smk, raph in zip(smk_raph, raph_letter):
        # letter value should equal the 1-based position within the Raph siman
        if getGematria(smk[0]) == (raph['indices'][1] + 1):
            print getGematria(smk[0]), raph['indices'][1]+1, \
                [item+1 for item in smk[1]], [item +1 for item in raph['indices']]
        else:
            problem_count += 1
            print 'problem:', getGematria(smk[0]), raph['indices'][1]+1,\
                [item+1 for item in smk[1]], [item +1 for item in raph['indices']]
    print problem_count
def chapter_verse(text_fragment):
    """Pull chapter and verse numbers out of a tagged text fragment.

    Expects a fragment of the form ``...B...-<chapter>-{<verse>}`` with Hebrew
    numerals; raises AttributeError when no marker is present.
    """
    marker = re.search(
        u'.*B.*-([\u05d0-\u05ea]{1,2})-\{([\u05d0-\u05ea]{1,2})\}',
        text_fragment)
    return {
        'chapter': util.getGematria(marker.group(1)),
        'verse': util.getGematria(marker.group(2)),
    }
def getMishnah(line):
    """Return the mishnah number encoded at the start of ``line``, or None.

    A line opening with "@22" carries the numeral in its first word;
    otherwise a single-letter first word (after dropping "@11") is read as a
    Hebrew numeral. Anything else yields None.
    """
    if line.startswith("@22"):
        token = line.split(" ")[0].replace("@22", "")
        return getGematria(token)
    first_word = line.replace("@11", "").split(" ")[0]
    if len(first_word) == 1:
        return getGematria(first_word)
    return None
def getGematriaVav(str): str = str.strip() str = re.sub(u'''"|''', u'', str) case_set = {270,272,274,275,298,304,344,670,672,698,744} # from trello card 'Letter transpositions' if str[0] == u'ו' and (is_hebrew_number(str[1:]) or (getGematria(str[1:]) in case_set)): return getGematria(str[1:]) elif is_hebrew_number(str) or getGematria(str) in case_set: # and not re.search(u'''מד"ס'''): or re.search(u'''('|")''', str) return getGematria(str) else: # mass.ErrorFile.write('error in pointer, not Gimatria...') print 'error in pointer, not Gimatria...', str
def walk_through_file(self, filename):
    """
    Derive and store references from a single file.

    Tracks the current siman (from @22 tags) and a running ``seif`` counter
    (one per @00 comment line). For each comment, parses the commentator and
    remote seif out of the stripped line; missing fields are inherited from
    ``previous_reference`` and successful parses update it. Results are
    appended to ``self.record_list``.

    :param filename: path of the utf-8 text file to scan
    :return: None (side effect: extends self.record_list)
    """
    tester = Tester()
    previous_reference, seif = None, 0
    with codecs.open(filename, 'r', 'utf-8') as fp:
        lines = fp.readlines()
    for line in lines:
        if tester(re.search(u'@22([\u05d0-\u05ea]{1,3})', line)):
            # new siman: reset the local seif counter and the fallback reference
            siman = getGematria(tester.match.group(1))
            seif = 0
            previous_reference = self.get_default_reference(siman)
        if re.match(u'^@00\(', line):
            seif += 1
            reference = {
                u'siman': siman,
                u'local-seif': seif,
                u'remote-seif': None,
                u'comments-on': None,
                u'raw-text': line
            }
            # keep only Hebrew letters and spaces, then drop a leading
            # "סי(מן) <numeral>" prefix before matching the reference pattern
            stripped = re.sub(u'[^\u05d0-\u05ea ]', u'', line)
            stripped = re.sub(
                u'^\u05e1\u05d9(?:\u05de\u05df)?\s([\u05d0-\u05ea]{1,3})\s?',
                u'', stripped)
            ref_match = self.reference_regex.match(stripped)
            if not ref_match:
                print u"No match found for:"
                print line
                continue
            reference[u'comments-on'] = self.get_commentator(ref_match)
            reference[u'remote-seif'] = \
                None if ref_match.group(u'seif') is None else getGematria(ref_match.group(u'seif'))
            # inherit missing fields from the previous reference; otherwise
            # push the newly-found value into it for later lines
            if reference[u'comments-on'] is None:
                reference[u'comments-on'] = previous_reference[
                    u'comments-on']
            else:
                previous_reference[u'comments-on'] = reference[
                    u'comments-on']
            if reference[u'remote-seif'] is None:
                reference[u'remote-seif'] = previous_reference[
                    u'remote-seif']
            else:
                previous_reference[u'remote-seif'] = reference[
                    u'remote-seif']
            if reference[u'remote-seif'] is None:
                print u'No remote seif for {} {}'.format(
                    reference[u'siman'], reference[u'local-seif'])
            self.record_list.append(reference)
def getGematriaVav(str, mass):
    """Convert a Hebrew numeral to its value, tolerating a leading vav.

    Variant that reports failures via ``mass.write_shgia`` instead of
    printing. Words matching "בהגה(ה)" are silently ignored (expected in the
    data, not numerals). NOTE: the parameter shadows the builtin ``str``;
    kept for interface compatibility.
    """
    str = str.strip()
    str = re.sub(u'''"|''', u'', str)
    case_set = {270,272,274,275,298,304,344,670,672,698,744}  # from trello card 'Letter transpositions'
    if not str:
        # BUGFIX: empty / whitespace-only input previously raised IndexError
        # at str[0]; log it like any other non-numeral instead.
        mass.write_shgia('error in pointer, not Gimatria...' + str)
        return
    if str[0] == u'ו' and (is_hebrew_number(str[1:]) or (getGematria(str[1:]) in case_set)):
        return getGematria(str[1:])
    elif is_hebrew_number(str) or getGematria(str) in case_set:  # and not re.search(u'''מד"ס'''): or re.search(u'''('|")''', str)
        return getGematria(str)
    elif re.search(u'בהגהה?', str):
        # this is not gimatria but there is no need to send an error about it each time...
        return
    else:
        mass.write_shgia('error in pointer, not Gimatria...'+ str)
def create_alt_struct_dict(rabbeinu_bahya_text_file, the_regex):
    """Build perek.pasuk.comment range strings for an alternate structure.

    Walks the tagged Rabbeinu Bahya file tracking the first and current
    perek (@01), pasuk (@22) and per-pasuk comment counts; @00/@77 markers
    close out a range, @99 resets the trackers. Returns the list of
    'p.v.c-p.v.c' range strings.
    NOTE(review): reconstructed from a single collapsed line — the exact
    nesting around ``new_comment`` is ambiguous; ``right_after_99`` is
    assigned but never read, and ``new_comment`` is never set False, so the
    ``if new_comment`` branch always fires. Confirm against the original.
    """
    first_perek, first_pasuk, current_perek, current_pasuk = 0, 0, 0, 0
    second_to_last_pasuk, second_to_last_comment_number = 0, 0
    first_comment_number, current_comment_number = 0, 0
    new_first_perek, new_first_pasuk, new_comment = True, True, True
    list_of_ranges = []
    with codecs.open(rabbeinu_bahya_text_file, 'r', 'utf-8') as the_file:
        for each_line in the_file:
            if "@99" in each_line:
                #list_of_ranges.append('{}.{}.{}-{}.{}.{}'.format(first_perek, first_pasuk, first_comment_number, current_perek, current_pasuk, current_comment_number))
                new_first_perek, new_first_pasuk, right_after_99 = True, True, True
                first_perek = 0
            elif "@00" in each_line:
                # close the running range at the previous comment count
                list_of_ranges.append('{}.{}.{}-{}.{}.{}'.format(first_perek, first_pasuk, first_comment_number, current_perek, current_pasuk, second_to_last_comment_number))
                new_first_perek, new_first_pasuk = True, True
            elif "@77" in each_line:
                list_of_ranges.append('{}.{}.{}-{}.{}.{}'.format(first_perek, first_pasuk, first_comment_number, current_perek, current_pasuk, second_to_last_comment_number))
                #new_first_perek, new_first_pasuk = True, True
            elif "@01" in each_line:
                matchObject = the_regex.search(each_line)
                if new_first_perek:
                    matchObject = the_regex.search(each_line)
                    first_perek = util.getGematria(matchObject.group(1))
                    new_first_perek = False
                current_perek = util.getGematria(matchObject.group(1))
            elif "@22" in each_line:
                matchObject = the_regex.search(each_line)
                if new_first_pasuk:
                    matchObject = the_regex.search(each_line)
                    first_pasuk = util.getGematria(matchObject.group(1))
                    new_first_pasuk = False
                    new_comment = True
                if new_comment:
                    first_comment_number = current_comment_number
                second_to_last_pasuk = current_pasuk
                current_pasuk = util.getGematria(matchObject.group(1))
                second_to_last_comment_number = current_comment_number
                current_comment_number = 0
            else:
                # plain text line: another comment in the current pasuk
                current_comment_number += 1
    return list_of_ranges
def getGematriaVav(str): str = str.strip() str = re.sub(u'''"|''', u'', str) case_set = {270, 272, 274, 275, 298, 304, 344, 670, 672, 698, 744} # from trello card 'Letter transpositions' if str[0] == u'ו' and (is_hebrew_number(str[1:]) or (getGematria(str[1:]) in case_set)): return getGematria(str[1:]) elif is_hebrew_number(str) or getGematria( str ) in case_set: # and not re.search(u'''מד"ס'''): or re.search(u'''('|")''', str) return getGematria(str) else: # mass.ErrorFile.write('error in pointer, not Gimatria...') print 'error in pointer, not Gimatria...', str
def grab_rashis(self):
    """Collect Siftei Hakhamim markers per Rashi comment for each verse span.

    Scans span elements with id 'katom'; for each, extracts the verse number
    (a parenthesized Hebrew numeral) and, per non-empty Rashi line, the
    bracketed marker letters. Returns a list of dicts with 'verse_number',
    'comments' and 'total_rashis'.
    """
    rashis = []
    for span in self.parsed_html.find_all('span', id='katom'):
        if span.text == u'\n':
            continue
        verse = {'comments': []}
        # grab the verse number
        match = re.search(u'\(([\u05d0-\u05ea]{1,2})\)', span.text)
        if match is None:
            verse['verse_number'] = '<unknown>'
        else:
            verse['verse_number'] = util.getGematria(match.group(1))
        structured_rashi = self.structure_rashi(span.text)
        for line in structured_rashi:
            # BUGFIX: was `line is not u''` — identity comparison on strings
            # only works by interning accident; compare by equality.
            if line != u'':
                # add all Siftei Hakhamim in an array according to each Rashi comment.
                verse['comments'].append(re.findall(u'\[([\u05d0-\u05ea])\]', line))
        verse['total_rashis'] = len(structured_rashi)
        rashis.append(verse)
    return rashis
def grab_rashis(self):
    """Collect Siftei Hakhamim markers per Rashi comment for each verse span.

    For every 'katom' span: read the verse number from a parenthesized Hebrew
    numeral, then gather the bracketed marker letters of each non-empty Rashi
    line. Returns a list of per-verse dicts ('verse_number', 'comments',
    'total_rashis').
    """
    rashis = []
    for span in self.parsed_html.find_all('span', id='katom'):
        if span.text == u'\n':
            continue
        verse = {'comments': []}
        # grab the verse number
        match = re.search(u'\(([\u05d0-\u05ea]{1,2})\)', span.text)
        if match is None:
            verse['verse_number'] = '<unknown>'
        else:
            verse['verse_number'] = util.getGematria(match.group(1))
        structured_rashi = self.structure_rashi(span.text)
        for line in structured_rashi:
            # BUGFIX: was `line is not u''` — string identity comparison is
            # unreliable (depends on interning); use equality.
            if line != u'':
                # add all Siftei Hakhamim in an array according to each Rashi comment.
                verse['comments'].append(
                    re.findall(u'\[([\u05d0-\u05ea])\]', line))
        verse['total_rashis'] = len(structured_rashi)
        rashis.append(verse)
    return rashis
def find_skips(filename): """ Looks for skipped comments. :param filename: File to scan """ parser = TextParser(filename) offset = 0 total_errors = 0 for chapter in parser.chapter_strings: chap_number = util.getGematria( parser.chap_reg.search(chapter).group(1)) if chap_number == 1: offset = 0 comments = parser.comment_reg.findall(chapter) comment_values = [letters[comment[1]] for comment in comments] sequence = modulo_sequence(comment_values, 22, offset) offset = comment_values[-1] + 1 if sequence['in_order']: continue else: print 'error in chapter {}'.format(chap_number) for error in sequence['errors']: print 'previous: {} expected: {} found: {}'.format( error['previous'], error['expected'], error['found']) total_errors += len(sequence['errors']) print 'total errors: {}'.format(total_errors)
def align_comments(text_array):
    """Group @11-tagged words into word lists keyed by section number.

    :param text_array: list of text lines
    :return: dict mapping section gematria -> list of tag-stripped words,
        with a u'\\n' separator appended whenever a section repeats.
    """
    # strip out unnecessary lines (@99 markers).
    # BUGFIX: the original deleted from text_array while enumerating it,
    # which skips the element following every removed line; filter instead.
    remove = re.compile(u'@99')
    text_array = [line for line in text_array if not remove.search(line)]
    section_name, result = '', {}
    t = u''.join(text_array)
    t = t.replace(u'\n', u'')
    t = t.replace(u'\r', u'')
    t = t.split(u' ')
    for word in t:
        # BUGFIX: quantifier moved inside the group — previously group(1)
        # captured only the last letter of a multi-letter section number.
        search = re.search(u'@11([\u05d0-\u05ea"]{1,4})\*?\)', word)
        if search:
            section_name = getGematria(search.group(1).replace(u'"', u''))
            if section_name in result.keys():
                # repeated section: separate the new run from the earlier one
                result[section_name].append(u'\n')
        if section_name not in result.keys():
            result[section_name] = []
        result[section_name].append(re.sub(u'@[0-9]{2}', u'', word))
    # BUGFIX: the return was commented out, so callers always got None.
    return result
def find_skips(filename): """ Looks for skipped comments. :param filename: File to scan """ parser = TextParser(filename) offset = 0 total_errors = 0 for chapter in parser.chapter_strings: chap_number = util.getGematria(parser.chap_reg.search(chapter).group(1)) if chap_number == 1: offset = 0 comments = parser.comment_reg.findall(chapter) comment_values = [letters[comment[1]] for comment in comments] sequence = modulo_sequence(comment_values, 22, offset) offset = comment_values[-1]+1 if sequence['in_order']: continue else: print 'error in chapter {}'.format(chap_number) for error in sequence['errors']: print 'previous: {} expected: {} found: {}'.format( error['previous'], error['expected'], error['found']) total_errors += len(sequence['errors']) print 'total errors: {}'.format(total_errors)
def get_civil_year(year_line, book):
    """
    JN are named by year. The he_title can be lifted directly from the text;
    this function converts the Hebrew year numerals to their full-era values.
    The conversion is not exact, as an exact mapping of Parsha - Date is not
    available at this time.

    :param year_line: A line of text from which year data is extracted. May
        contain multiple years (i.e. תרל"ז-תרל"ח)
    :param book: What book this is taken from (i.e. Genesis, Exodus etc.).
        Currently unused — kept for interface compatibility; it was meant to
        drive a per-book "typical date" correction that is not implemented.
    :return: civil year(s), joined with '; '
    """
    # gematria years omit the millennium, so add 5000
    he_years = [
        util.getGematria(match) + 5000
        for match in re.findall(u'[\u05d0-\u05ea"]{4,5}', year_line)
    ]
    return '; '.join(str(year) for year in he_years)
def check_segments(): segments = [] infile = codecs.open(filename, 'r', 'utf-8') headers = TagTester(u'@30', infile, u'@30מצוה ([\u05d0-\u05ea"]{1,5})').grab_each_header() tester = TagTester(u'@44', infile, u'@44\(([\u05d0-\u05ea]{1,2})\)') while not tester.eof: segments.append( tester.grab_each_header(u'@30מצוה ([\u05d0-\u05ea"]{1,5})', 1)) infile.close() for sec_number, section in enumerate(segments): index = 1 for title in section: title = title.replace(u'"', u'') count = util.getGematria(title) if count != index: print headers[sec_number - 1] print util.numToHeb(index) index = count index += 1
def check_segments(): segments = [] infile = codecs.open(filename, 'r', 'utf-8') headers = TagTester(u'@30', infile, u'@30מצוה ([\u05d0-\u05ea"]{1,5})').grab_each_header() tester = TagTester(u'@44', infile, u'@44\(([\u05d0-\u05ea]{1,2})\)') while not tester.eof: segments.append(tester.grab_each_header(u'@30מצוה ([\u05d0-\u05ea"]{1,5})', 1)) infile.close() for sec_number, section in enumerate(segments): index = 1 for title in section: title = title.replace(u'"', u'') count = util.getGematria(title) if count != index: print headers[sec_number-1] print util.numToHeb(index) index = count index += 1
def file_to_ja_g(depth, infile, expressions, cleaner, grab_all=False):
    """
    Designed to be the first stage of a reusable parsing tool. Adds lines of
    text to the Jagged Array in the desired structure (Chapter, verse, etc.)
    This function is a modulation of the original file_to_ja because it deals
    with gimatria letters, so as to place the correct chapters and segments in
    the correct places according to the hebrew letter numbering. Of course it
    also puts in the padding where needed (_g stands for Gimatria).

    :param depth: depth of the JaggedArray.
    :param infile: Text file to read from
    :param expressions: A list of regular expressions with which to identify
        section (chapter) level. Do not include an expression with which to
        break up the segment levels. Each must define a named group 'gim'.
    :param cleaner: A function that takes a list of strings and returns an
        array with the text parsed correctly. Should also break up and remove
        unnecessary tagging data.
    :param grab_all: If set to true, will grab the lines indicating new
        sections.
    :return: A jagged_array with the text properly structured.

    NOTE(review): reconstructed from collapsed source — the error-message
    literal was split mid-string in the original view; confirm wording.
    """
    # instantiate ja
    # structure = reduce(lambda x,y: [x], range(depth-1), [])
    # ja = JaggedArray(structure)
    ja = JaggedArray([])
    # ensure there is a regex for every level except the lowest
    if depth - len(expressions) != 1:
        raise AttributeError('Not enough data to parse. '
                             'Need {} expressions, '
                             'received {}'.format(depth-1, len(expressions)))
    # compile regexes, instantiate index list
    regexes, indices = [re.compile(ex) for ex in expressions], [-1]*len(expressions)
    temp = []
    # loop through file
    for line in infile:
        # check for matches to the regexes
        for i, reg in enumerate(regexes):
            found = reg.search(line)
            if found:
                # check that we've hit the first chapter and verse
                if indices.count(-1) == 0:
                    # flush the accumulated segment before moving on
                    ja.set_element(indices, cleaner(temp), [])
                    temp = []
                if grab_all:
                    temp.append(line)
                gimt = getGematria(found.group('gim'))
                if gimt != 0:
                    # explicit numeral: jump straight to that (0-based) index
                    indices[i] = gimt - 1
                else:
                    indices[i] += 1
                # reset all deeper levels that have already been entered
                indices[i+1:] = [-1 if x >= 0 else x for x in indices[i+1:]]
                break
        else:
            # ordinary text line: accumulate once all levels are established
            if indices.count(-1) == 0:
                temp.append(line)
    # flush the final segment
    ja.set_element(indices, cleaner(temp), [])
    return ja
def identify_star_locations(filename):
    """Locate @11(*) star markers in a tagged file.

    Tracks the current siman (@12) and seif (@11 letter) while scanning; each
    run of consecutive stars is reported with the seif before and after it.
    Returns a list of dicts with siman_num, preceding/following index and
    letter, and star_count.
    """
    def get_regex():
        # single alternation of named groups; match.lastgroup identifies the tag
        partial_regexes = [
            u'@12([\u05d0-\u05ea]{1,3})', u'@11([\u05d0-\u05ea])', u'@11(\*)'
        ]
        names = [u'siman', u'seif', u'star']
        my_full_regexes = [
            u'(?P<{}>{})'.format(*i) for i in zip(names, partial_regexes)
        ]
        return re.compile(u'|'.join(my_full_regexes))
    with codecs.open(filename, 'r', 'utf-8') as infile:
        lines = infile.readlines()
    siman, seif_index, seif_letter, num_stars = -1, -1, None, 0
    star_locations, current_star = [], {}
    line_regex = get_regex()
    for line in lines:
        line_data = line_regex.search(line)
        if line_data is None:
            continue
        elif line_data.lastgroup == u'star':
            num_stars += 1
            current_star = {
                u'siman_num': siman,
                u'preceding_index': seif_index,
                u'preceding_letter': seif_letter
            }
        else:
            if line_data.lastgroup == u'seif':
                seif_index += 1
                # the inner capture (the letter) sits right after the named group
                seif_letter = line_data.group(line_data.lastindex + 1)
            elif line_data.lastgroup == u'siman':
                siman = getGematria(line_data.group(line_data.lastindex + 1))
                seif_index = -1
                seif_letter = None
            else:
                raise LookupError(u"Expecting seif or siman, got {}".format(
                    line_data.lastgroup))
            # first non-star tag closes out any pending star run
            if num_stars >= 1:
                current_star[u'star_count'] = num_stars
                current_star[u'following_index'] = seif_index
                current_star[u'following_letter'] = seif_letter
                star_locations.append(current_star)
                num_stars = 0
    else:
        # file ended while inside a star run
        if num_stars >= 1:
            current_star[u'star_count'] = num_stars
            current_star[u'following_index'] = 0
            current_star[u'following_letter'] = None
            star_locations.append(current_star)
    return star_locations
def fill_in_missing_sections_and_update_last(each_line, base_list, this_regex, filler, last_index):
    """Pad ``base_list`` for sections skipped between ``last_index`` and the
    section number found on ``each_line``.

    Appends ``filler`` once per missing section (mutates ``base_list`` in
    place) and returns the newly-found index.
    """
    found = this_regex.search(each_line)
    current_index = util.getGematria(found.group(1))
    for _ in range(current_index - last_index - 1):
        base_list.append(filler)
    return current_index
def fill_in_missing_sections_and_updated_last(each_line, base_list, this_regex, filler, last_index):
    """Append ``filler`` to ``base_list`` once for every section number
    skipped between ``last_index`` and the one found on ``each_line``.

    Mutates ``base_list`` in place; returns the new current index.
    """
    current_index = util.getGematria(this_regex.search(each_line).group(1))
    gap = current_index - last_index
    while gap > 1:
        base_list.append(filler)
        gap -= 1
    return current_index
def seferHamitzvot_from_rasag_comm(rasagCsvName, with_orig = False):
    """Mine Rasag-commentary segments for Sefer HaMitzvot / Semag / Semak
    citations and write them to a CSV.

    :param rasagCsvName: output CSV path passed to ``toCsv``.
    :param with_orig: when True, also writes a companion row holding the raw
        matched citation strings next to each parsed row.
    Side effects: writes the CSV and prints per-source match counts.
    """
    # ind_rasag_comm = library.get_index("Commentary on Sefer Hamitzvot of Rasag")
    segments = Ref('Commentary_on_Sefer_Hamitzvot_of_Rasag,_Positive_Commandments').all_segment_refs()
    segments.extend(Ref('Commentary_on_Sefer_Hamitzvot_of_Rasag,_Negative_Commandments').all_segment_refs())
    segments.extend(Ref('Commentary_on_Sefer_Hamitzvot_of_Rasag,_Laws_of_the_Courts').all_segment_refs())
    segments.extend(Ref('Commentary_on_Sefer_Hamitzvot_of_Rasag,_Communal_Laws').all_segment_refs())
    cnt = {"Rasag":0, "Sefer HaMitzvot":0, "Semag":0, "Semak":0}
    dict_list = []
    for seg in segments:
        # look for the three citation forms in the segment's Hebrew text
        # sfHmtzvot = re.search(u'(?:ספר המצו?ות|סה"מ).{1,4}(עשין|לאוין|עשה|לא תעשה).{0,20}', seg.text('he').text)
        sfHmtzvot = re.search(u'(?:ספר המצוות|סה"מ)\s{1,4}\((.*?)\)', seg.text('he').text)
        smg = re.search(u'סמ"ג \((.*?)\)', seg.text('he').text)
        smk = re.search(u'סמ"ק (\(.*?\))', seg.text('he').text)
        row_dict = {}
        row_orig = {}
        if sfHmtzvot:
            # row_dict["Rasag"] = re.search("(Sefer.*?\d*?):", seg.normal()).group(1)
            # row_orig["Rasag"] = re.search("(Sefer.*?\d*?):", seg.normal()).group(1)
            kind, simanim = rasag_exctractor(sfHmtzvot.group(1))
            # row_dict["Sefer HaMitzvot"] = ['Sefer HaMitzvot, {}.{}'.format(kind, siman) for siman in simanim]
            if kind:
                # only the first siman of the citation is linked
                row_dict["Sefer HaMitzvot"] = 'Sefer HaMitzvot, {}.{}'.format(kind, simanim[0])
            else:
                print "no kind", sfHmtzvot.group(1)
            row_orig["Sefer HaMitzvot"] = sfHmtzvot.group()
            cnt["Sefer HaMitzvot"] += 1
        if smg:
            # row_dict["Rasag"] = re.search("(Sefer.*?\d*?):", seg.normal()).group(1)
            kind, simanim = rasag_exctractor(smg.group(1))
            # row_dict["Semag"] = ['Sefer Mitzvot Gadol, {}.{}'.format(kind, siman) for siman in simanim]
            if kind:
                row_dict["Semag"] = 'Sefer Mitzvot Gadol, {}.{}'.format(kind, simanim[0])
            else:
                print "no kind", smg.group(1)
            row_orig["Semag"] = smg.group()
            cnt["Semag"] += 1
        if smk:
            # row_dict["Rasag"] = re.search("(Sefer.*?\d*?):", seg.normal()).group(1)
            # simanim = siman_smk_exctractor(smk.group(1))
            smki = re.search(u"ב?סי'\s+(.*?)(?:\s*\))", smk.group(1))
            if smki:
                siman = getGematria(smki.group(1))
                row_dict["Semak"] = "Sefer Mitzvot Katan.{}".format(siman)
                row_orig["Semak"] = smk.group()
                cnt["Semak"] += 1
            else:
                print u'***siman***' + smk.group()
        if row_dict:
            # at least one citation parsed: record the Rasag source segment
            cnt["Rasag"] += 1
            row_dict["Rasag"] = re.search("(Sefer.*?\d*?):", seg.normal()).group(1)
            row_orig["Rasag"] = seg.normal()
            if with_orig:
                dict_list.append(row_orig)
            dict_list.append(row_dict)
    toCsv(rasagCsvName, ["Rasag", "Sefer HaMitzvot", "Semag", "Semak"], dict_list)
    print cnt
def identify_errors(siman, pattern, sequence_code):
    """Find numbering problems in a siman's tag sequence.

    Compares each matched numeral against its predecessor and successor to
    classify: duplicates (ignored), a missing tag (gap of exactly one value
    with the following tag back in step), or an out-of-place tag (the value
    around it is consistent but the tag itself is not). Returns a list of
    error dicts tagged with ``sequence_code``.
    """
    errors = []
    matches = list(re.finditer(pattern, siman))
    previous = 0
    jump_ahead = False
    for i, match in enumerate(matches):
        if jump_ahead:
            # the previous iteration consumed this match as its "following"
            jump_ahead = False
            continue
        try:
            current, following = getGematria(match.group(1)), getGematria(
                matches[i + 1].group(1))
        except IndexError:
            # no following match: cannot classify the last tag
            break
        if current - previous == 0:
            # double tag
            previous = current
            continue
        elif current - previous == 2 and following - current == 1:
            # missing tag
            error = {
                u'type': u'missing',
                u'from_sequence': sequence_code,
                u'value': current - 1,
            }
            if i == 0:
                error[u'range'] = (0, match.start())
            else:
                error[u'range'] = (matches[i - 1].end(), match.start())
            errors.append(error)
            previous = current
            continue
        elif following - previous == 1 and current - previous != 1:
            # out of place
            errors.append({
                u'type': u'out_of_place',
                u'from_sequence': sequence_code,
                u'value': current,
                u'tag': match.group(),
                u'loc': match.start()
            })
            # the following tag is in sequence; adopt it and skip it next turn
            previous = following
            jump_ahead = True
        else:
            previous = current
    return errors
def check_vav(st):
    """Return the gematria of ``st`` minus a leading vav, or False.

    A value is produced only when the word starts with 'ו' and the remainder
    is a valid Hebrew numeral; empty strings and everything else yield False.
    """
    if st and st[0] == u'ו' and is_hebrew_number(st[1:]):
        return getGematria(st[1:])
    return False
def scrape_wiki():
    """Scrape the Hebrew Wikipedia mitzvot-count tables into link records.

    Each table row pairs a Sefer HaChinukh siman with a Rambam citation;
    qualifying rows become Sifrei-Mitzvot link dicts. Returns the list.
    """
    url = u"https://he.wikipedia.org/wiki/%D7%9E%D7%A0%D7%99%D7%99%D7%9F_%D7%94%D7%9E%D7%A6%D7%95%D7%95%D7%AA_%D7%A2%D7%9C_%D7%A4%D7%99_%D7%A1%D7%A4%D7%A8_%D7%94%D7%97%D7%99%D7%A0%D7%95%D7%9A"
    page = requests.get(url)
    soup_body = BeautifulSoup(page.text, "lxml")
    tables = soup_body.select(".mw-parser-output > table")
    pairs = []
    links = []
    for table in tables:
        table_tr = table.select("tr")
        for col in table_tr:
            # (chinukh numeral cell, rambam citation cell) per row
            pairs.append((col.contents[1].text.strip(), re.sub(u'</?td>', u'', col.contents[-1].text).strip()))
    for pair in pairs:
        # skip header / book-title rows that carry no mitzvah number
        if re.search(u'ספר|מספר', pair[0]):
            continue
        neg_pos = u"Negative Mitzvot" if re.search(
            u"לאו", pair[1]) else u'Positive Mitzvot'
        rambam = getGematria(re.sub(u'עשה|לאו', u'', pair[1]).strip())
        chinukh = getGematria(pair[0])
        print chinukh, rambam
        # link spans every segment of the Chinukh siman
        chinukh_simanlen = len(
            Ref(u'Sefer HaChinukh.{}'.format(chinukh)).all_segment_refs())
        print neg_pos
        link = ({
            "refs": [
                u'Sefer HaChinukh.{}.{}-{}'.format(chinukh, 1,
                                                   chinukh_simanlen),
                u'Mishneh Torah, {}.{}'.format(neg_pos, rambam)
            ],
            "type": "Sifrei Mitzvot",
            "auto": True,
            "generated_by": "chinukh_rambam_sfm_linker"  # _sfm_linker what is this parametor intended to be?
        })
        print link['refs']
        links.append(link)
    return links
def parse(file_name):
    """Parse a @00-chaptered file, special-casing chapters 7 and 9.

    Ordinary chapters collect cleaned lines into flat comment lists; chapter
    7 is split by @01 into an intro plus shorashim sub-lists, and chapter 9
    is split by @01 into sub-lists. Returns the assembled ``section`` list.
    """
    chapter_number = regex.compile('@00([\u05d0-\u05ea]{1,2})')
    chapter_index = 1
    section, comment = [], []
    seven, shorashim, nine = [], [], []
    chapter_seven_intro = True
    with codecs.open(file_name, 'r', 'utf-8') as the_file:
        for each_line in the_file:
            if "@00" in each_line:
                # chapter boundary: flush whatever the previous chapter built
                if chapter_index != 7 and chapter_index != 9:
                    section.append(comment)
                    comment = []
                elif chapter_index == 7:
                    shorashim.append(comment)
                    seven.append(shorashim)
                    section.append(seven)
                    comment = []
                elif chapter_index == 9:
                    nine.append(comment)
                    section.append(nine)
                    comment = []
                match_object = chapter_number.search(each_line)
                chapter_index = util.getGematria(match_object.group(1))
            elif chapter_index != 7 and chapter_index != 9:
                each_line = clean_up(each_line)
                comment.append(each_line)
            elif chapter_index == 7:
                if "@01" in each_line:
                    # first @01 ends the chapter-7 intro; later ones split shorashim
                    if chapter_seven_intro:
                        seven.append(comment)
                        comment = []
                        chapter_seven_intro = False
                    else:
                        shorashim.append(comment)
                        comment = []
                else:
                    comment.append(each_line)
            elif chapter_index == 9:
                if "@01" in each_line:
                    nine.append(comment)
                    comment = []
                else:
                    comment.append(each_line)
        # flush the final chapter's trailing comment list
        section.append(comment)
    return section
def link_hg(hg_ja, hagahot_dict_lst, ja_raph):
    """Link each Haggahot Chadashot note to its Semak or Raph segment.

    Walks the haggahot of every siman with two running pointers (``pts`` into
    the Semak letter list, ``ptr`` into the Raph letter list) and emits an
    inline-commentary link for whichever source's letter matches the @11 tag
    of the note. Returns the list of link dicts.
    NOTE(review): ``ja_raph`` is accepted but never read in this block —
    confirm against the caller. Indentation reconstructed from collapsed
    source; the pointer bookkeeping is order-sensitive.
    """
    def link_hg_smk_or_raph(siman, smk_seg, hg, place_smk_hg, base_text):
        # build one inline-reference link record
        link = (
            {
                "refs": [
                    u"{} {}:{}".format(base_text, siman, smk_seg),
                    "Haggahot Chadashot on Sefer Mitzvot Katan {}:{}".format(siman, hg),  # really should be a ref link to the whole raph
                ],
                "type": "commentary",
                'inline_reference': {
                    'data-commentator': 'Haggahot Chadashot on Sefer Mitzvot Katan',
                    'data-order': place_smk_hg
                },
                "auto": True,
                "generated_by": "semak_parser"
            })
        return link
    # linking
    links = []
    smks = []
    raphs = []
    for dict in hagahot_dict_lst:
        smks += dict["smk"]
        raphs += dict["raph"]
    pts = 0
    ptr = 0
    link = None
    for dict in hagahot_dict_lst:
        # link all the haghot in a siman to the correct Semak segment
        pts_0 = 0
        ptr_0 = 0
        sim = getGematria(dict["siman"])
        # print sim
        for j, hgha in enumerate(hg_ja[sim-1]):
            # when the same letter heads both queues, prefer the Raph side
            # if this siman actually has that Raph letter
            smk_first = True
            if ptr < len(raphs) and smks[pts][0] == raphs[ptr][0]:
                if dict["raph"] and any([re.search(raphs[ptr][0], letter[0]) for letter in dict["raph"]]):
                    smk_first = False
            if smk_first and re.search(u"@11\({}\)".format(smks[pts][0]), hgha):  # pts < len(smks)
                link = link_hg_smk_or_raph(sim, smks[pts][1], j+1, pts_0+1, "Sefer Mitzvot Katan")
                pts += 1
                pts_0 += 1
            elif ptr < len(raphs) and re.search(u"@11\({}\)".format(raphs[ptr][0]), hgha):
                link = link_hg_smk_or_raph(sim, raphs[ptr][1], j+1, ptr_0+1, 'Haggahot Rabbeinu Peretz on Sefer Mitzvot Katan')
                ptr += 1
                ptr_0 += 1
            else:
                print u"error {}: something with the numbering is wrong...".format(dict["siman"])
            if link:
                links.append(link)
    return links
def fill_in_missing_sections_and_updated_last(each_line, base_list, this_regex, filler, last_index):
    """Variant filler for lines whose regex group holds a whitespace-separated
    list of mitzvot; the first entry determines the current index.

    Appends ``filler`` to ``base_list`` once per skipped section (in-place)
    and returns the new current index.
    """
    found = this_regex.search(each_line)
    first_mitzvah = found.group(1).strip().split()[0]
    current_index = util.getGematria(first_mitzvah)
    for _ in range(current_index - last_index - 1):
        base_list.append(filler)
    return current_index
def identify_star_locations(filename):
    """Find runs of @11(*) star markers and their surrounding seifim.

    Maintains the current siman (@12) and seif (@11 letter) while reading the
    file; when a star run ends (at the next non-star tag, or at EOF) a record
    of its location and length is appended. Returns the list of records.
    """
    def get_regex():
        # one alternation of named groups; lastgroup says which tag matched
        partial_regexes = [u'@12([\u05d0-\u05ea]{1,3})', u'@11([\u05d0-\u05ea])', u'@11(\*)']
        names = [u'siman', u'seif', u'star']
        my_full_regexes = [u'(?P<{}>{})'.format(*i) for i in zip(names, partial_regexes)]
        return re.compile(u'|'.join(my_full_regexes))
    with codecs.open(filename, 'r', 'utf-8') as infile:
        lines = infile.readlines()
    siman, seif_index, seif_letter, num_stars = -1, -1, None, 0
    star_locations, current_star = [], {}
    line_regex = get_regex()
    for line in lines:
        line_data = line_regex.search(line)
        if line_data is None:
            continue
        elif line_data.lastgroup == u'star':
            num_stars += 1
            current_star = {
                u'siman_num': siman,
                u'preceding_index': seif_index,
                u'preceding_letter': seif_letter
            }
        else:
            if line_data.lastgroup == u'seif':
                seif_index += 1
                # the letter capture immediately follows the named group
                seif_letter = line_data.group(line_data.lastindex+1)
            elif line_data.lastgroup == u'siman':
                siman = getGematria(line_data.group(line_data.lastindex+1))
                seif_index = -1
                seif_letter = None
            else:
                raise LookupError(u"Expecting seif or siman, got {}".format(line_data.lastgroup))
            # any non-star tag terminates a pending star run
            if num_stars >= 1:
                current_star[u'star_count'] = num_stars
                current_star[u'following_index'] = seif_index
                current_star[u'following_letter'] = seif_letter
                star_locations.append(current_star)
                num_stars = 0
    else:
        # EOF reached while a star run was still open
        if num_stars >= 1:
            current_star[u'star_count'] = num_stars
            current_star[u'following_index'] = 0
            current_star[u'following_letter'] = None
            star_locations.append(current_star)
    return star_locations
def rasag_exctractor(text):
    """Extract the commandment kind and siman numbers from a Rasag citation.

    The first whitespace-separated token names the list (negative/positive
    commandments); every following token is a Hebrew numeral.

    :param text: citation string, e.g. u'לאוין א ב ג'.
    :return: (kind, simanim) where kind is u'Negative Commandments',
        u'Positive Commandments' or None, and simanim is a list of ints.
    """
    split = re.split(u"\s", text)
    kind = None
    # BUG FIX: the original patterns used (:?...) — a typo for the
    # non-capturing group (?:...). With re.search the matches happened to
    # coincide, but the pattern was wrong as written.
    if re.search(u"(?:לאוין|לא תעשה)", split[0]):
        kind = u'Negative Commandments'
    elif re.search(u"(?:עשין|עשה)", split[0]):
        kind = u'Positive Commandments'
    simanim = [getGematria(word) for word in split[1:]]
    return kind, simanim
def xmlify(filename): """ create an xml representation of the text files :param filename: str name of file """ with codecs.open(filename, 'r', 'utf-8') as infile: raw_rambam = infile.read() chap_index = [ getGematria(i.group(1)) for i in re.finditer( ur'@00\u05e4\u05e8\u05e7 ([\u05d0-\u05ea]{1,2})', raw_rambam) ] chapters = re.split(ur'@00\u05e4\u05e8\u05e7 [\u05d0-\u05ea]{1,2}', raw_rambam)[1:] assert len(chap_index) == len(chapters) soup = BeautifulSoup(u'<root></root>', 'xml') for index, chapter in zip(chap_index, chapters): x_chapter = soup.new_tag('chapter', num=unicode(index)) soup.root.append(x_chapter) v_indices = [ getGematria(i.group(1)) for i in re.finditer(ur'@22([\u05d0-\u05ea]{1,2})', chapter) ] verses = re.split(ur'@22[\u05d0-\u05ea]{1,2}', chapter)[1:] assert len(v_indices) == len(verses) for v_index, verse in zip(v_indices, verses): x_verse = soup.new_tag('verse', num=unicode(v_index)) comments = verse.splitlines() for i, comment in enumerate(comments[1:]): x_comment = soup.new_tag('comment', num=unicode(i + 1)) x_comment.append(comment) x_verse.append(x_comment) x_chapter.append(x_verse) with codecs.open('./xml/{}'.format(filename.replace('.txt', '.xml')), 'w', 'utf-8') as outfile: outfile.write(unicode(soup.prettify()))
def produce_parsed_data(filename):
    """Parse *filename* into a section-aligned jagged array.

    Reads the file twice: once to build the raw 3-level jagged array and once
    to grab the section headers, then pads missing sections so the array
    indices line up with the section numbers.
    """
    with codecs.open(filename, 'r', 'utf-8') as datafile:
        raw_ja = util.file_to_ja(3, datafile, (m_pattern, comment_pattern), nothing)
        datafile.seek(0)  # rewind for the second pass over the same handle
        section_names = util.grab_section_names(m_pattern, datafile, 1)
    section_numbers = [int(util.getGematria(name)) for name in section_names]
    aligned = util.simple_to_complex(section_numbers, raw_ja.array())
    return util.convert_dict_to_array(aligned)
def _collect_hebrew_segments(self, soup):
    """Collect the Hebrew <p> segments belonging to this chapter.

    Walks every LMH-classed paragraph, turns collection on when the title
    paragraph for ``self.number`` is seen and off at the next chapter title,
    then merges paragraphs that were split mid-sentence and strips all markup.
    Sets ``self.hebrew_segments`` (list of plain-text strings).

    :param soup: parsed document (BeautifulSoup).
    """
    assert isinstance(soup, BeautifulSoup)
    he_reg = re.compile(u'^LMH')
    all_he_ps = soup.find_all('p', attrs={'class': he_reg})
    segments = []
    started = False
    # Matches a chapter-title line; the 'chapter' group is the Hebrew numeral.
    chapter_reg = re.compile(ur'''\u05dc\u05d9\u05e7\u05d5\u05d8\u05d9 \u05de\u05d5\u05d4\u05e8[\u05f4"]\u05df\s(\u05ea\u05e0\u05d9\u05e0\u05d0\s)?\u05e1\u05d9\u05de\u05df\s(?P<chapter>[\u05d0-\u05ea"]{1,4})''')
    for he_p in all_he_ps:
        if he_p['class'] == u'LMH-styles_LMH-title':
            if not he_p.string:
                raise AssertionError
            chapter_match = chapter_reg.match(he_p.string)
            if chapter_match:
                if getGematria(chapter_match.group('chapter')) == self.number:
                    started = True  # entering our chapter
                elif started:
                    break  # next chapter's title -> done collecting
        elif started:
            # NOTE(review): 'Rashbam'-classed paragraphs are deliberately
            # skipped — presumably a different commentary interleaved here.
            if re.search(u'Rashbam', he_p['class']):
                continue
            segments.append(he_p)
        else:
            continue
    # if current segment ends on a Hebrew char, combine with the next segment
    bad_indices = []
    for i, (cur_segment, next_segment) in enumerate(zip(segments, segments[1:])):
        segment_text = cur_segment.text
        stripped_text = re.sub(u"[\u05b0-\u05C7]", u'', segment_text)  # strip nikkud
        if re.search(u'[\u05d0-\u05ea]\s*$', stripped_text):
            # merge this segment into this one
            bad_indices.append(i)
            # Flatten cur_segment's children, fold it into the start of
            # next_segment, then dissolve the now-empty wrapper tag.
            for child in cur_segment.find_all(True):
                child.unwrap()
            cur_segment.string = u'{} '.format(u' '.join(cur_segment.contents))
            next_segment.insert(0, cur_segment)
            cur_segment.unwrap()
        elif not segment_text:
            bad_indices.append(i)  # drop empty paragraphs
    # Pop from the end so earlier indices stay valid.
    for i in reversed(bad_indices):
        segments.pop(i)
    assert len(segments) > 0
    self.hebrew_segments = [bleach.clean(s, tags=[], attributes={}, strip=True) for s in segments]
def test_expression(pattern): """ test a regular expression object to see if how well it grabs all "springs" and "rivers" :param pattern: regular expression string :return: List of missed "rivers", expressed as a tuple: (spring, river) """ regex = re.compile(pattern) split = get_text().splitlines() matches = filter(None, [regex.search(match) for match in split]) issues = [] print u'last_match: {}'.format(matches[-1].group()) expected_spring, expected_river = 1, 1 for match in matches: spring, river = getGematria(match.group(1)), getGematria(match.group(3)) if spring > expected_spring: expected_river = 1 expected_spring = spring if river > expected_river: while river > expected_river: issues.append((expected_spring, expected_river)) expected_river += 1 expected_river += 1 return issues
def check_chapters(): with codecs.open('Minchat_Chinuch.txt', 'r', 'utf-8') as chinuch: test = TagTester(u'@30', chinuch, u'@30מצוה ([\u05d0-\u05ea"]{1,5})') index = 1 for header in test.grab_each_header(capture_group=1): header = header.replace(u'"', u'') count = util.getGematria(header) if count != index: print util.numToHeb(index) index = count index += 1
def produce_parsed_data(filename):
    """Parse *filename* into a section-aligned jagged array.

    Same two-pass scheme as elsewhere in the project: build the raw jagged
    array, rewind, grab the section headers, then pad the array so its
    indices match the section numbers.
    """
    with codecs.open(filename, 'r', 'utf-8') as datafile:
        raw_ja = util.file_to_ja([[[]]], datafile, (m_pattern, comment_pattern), nothing)
        datafile.seek(0)  # rewind for the header pass
        section_names = util.grab_section_names(m_pattern, datafile, 1)
    section_numbers = [int(util.getGematria(name)) for name in section_names]
    aligned = util.simple_to_complex(section_numbers, raw_ja.array())
    return util.convert_dict_to_array(aligned)
def fix_file(filepath, start_siman, test_mode=False): output_list = [] with codecs.open(filepath, 'r', 'utf-8') as fp: lines = fp.readlines() counter = 0 for line in lines: match = re.match(u'^@11([\u05d0-\u05ea]{1,3})$', line) if match and getGematria(match.group(1)) == 1: output_list.append(u'@00{}\n'.format( numToHeb(counter + start_siman))) counter += 1 output_list.append(line) if test_mode: filepath = re.sub(ur'\.txt$', u'_test.txt', filepath) with codecs.open(filepath, 'w', 'utf-8') as fp: fp.writelines(output_list)
def align_boaz_chapters(source_file, simple_array):
    """
    Boaz does not guarantee text for every chapter. Using the util library,
    this method will pad the parsed text with empty sections as necessary to
    accurately represent the data.
    :param source_file: File from which to derive chapter numbers
    :param simple_array: A "naive" parse of the data structured as a nested
    list.
    :return: Nested array, with proper padding to account for empty chapters.
    """
    # grab each chapter number from the source file
    chapter_names = util.grab_section_names(u'@00פרק ([\u05d0-\u05ea]{1,2})', source_file, 1)
    chapters = [util.getGematria(name) for name in chapter_names]
    return util.convert_dict_to_array(util.simple_to_complex(chapters, simple_array))
def parser(name):
    """Parse ``<name>.txt`` (tagged with @00 chapter and @22 mishnah markers)
    into a jagged array.

    NOTE(review): the body visible here ends after the @22 branch —
    ``parsed_text``/``links``/``comment`` are never used or returned, so this
    function appears truncated in this view; the remainder presumably handles
    comment lines and returns the parsed structure.
    """
    with codecs.open('{}.txt'.format(name), 'r', 'utf-8') as infile:
        lines = infile.readlines()
    parsed_text = JaggedArray([[[]]])
    links = []
    # 0-based running positions: chapter, mishnah and comment counters.
    chapter, mishnah, comment = -1, -1, -1
    for line in lines:
        if re.match(ur'@00\u05e4\u05e8\u05e7', line) is not None:
            # New chapter: advance chapter, restart comment count.
            chapter += 1
            comment = -1
            continue
        elif re.match(ur'@22', line) is not None:
            # Mishnah number is explicit in the tag (converted to 0-based).
            mishnah = getGematria(re.match(ur'@22([\u05d0-\u05ea]{1,2})', line).group(1)) - 1
            comment = -1
            continue
def parse_and_post(file_name):
    """Parse a perek/mishnah tagged file into a 3-level nested list and post it.

    @00 lines open a new perek, @22 lines open a new mishnah (gaps in the
    mishnah numbering are padded with empty lists), and every other line is
    split on '~' into cleaned comment segments.

    :param file_name: path to the utf-8 source file.
    :return: nested list [perek][mishnah][comment].
    """
    mishna_number_regex = regex.compile(u'([\u05d0-\u05ea]{1,3})')
    book, perek_level_list, mishna_level_list = [], [], []
    new_perek, first_perek = True, True
    last_mishna = 0
    with codecs.open(file_name, 'r', 'utf-8') as the_file:
        for each_line in the_file:
            if "@00" in each_line:
                if first_perek:
                    first_perek = False
                else:
                    # Close out the previous perek before starting a new one.
                    perek_level_list.append(mishna_level_list)
                    book.append(perek_level_list)
                    perek_level_list, mishna_level_list = [], []
                    new_perek = True
            elif "@22" in each_line:
                if new_perek:
                    # First mishnah of the perek: nothing to flush yet.
                    new_perek = False
                    last_mishna = 1
                else:
                    perek_level_list.append(mishna_level_list)
                    mishna_level_list = []
                    number_match = mishna_number_regex.search(each_line)
                    mishna_number = util.getGematria(number_match.group(1))
                    # Pad one empty mishnah per skipped number.
                    for _ in range(mishna_number - last_mishna - 1):
                        perek_level_list.append([])
                    last_mishna = mishna_number
            else:
                for segment in each_line.split(u'~'):
                    segment = segment.strip()
                    if segment:
                        mishna_level_list.append(clean_up_string(segment))
    book.append(perek_level_list)
    post_the_text(book)
    return book
def parse():
    """Parse gra_on_avot.txt into a 3-level nested list [perek][mishnah][comment].

    @00 lines open a new perek, @22 lines open a new mishnah (gaps in the
    mishnah numbering are padded with empty lists), and every other line is a
    cleaned comment appended to the current mishnah.
    """
    mishna_number_regex = regex.compile(u'([\u05d0-\u05ea]{1,3})')
    book, perek_level_list, mishna_level_list = [], [], []
    new_perek, first_perek = True, True
    last_mishna = 0
    with codecs.open('gra_on_avot.txt', 'r', 'utf-8') as the_file:
        for each_line in the_file:
            if "@00" in each_line:
                if first_perek:
                    first_perek = False
                else:
                    # Close out the previous perek before starting a new one.
                    perek_level_list.append(mishna_level_list)
                    book.append(perek_level_list)
                    perek_level_list, mishna_level_list = [], []
                    new_perek = True
            elif "@22" in each_line:
                if new_perek:
                    # First mishnah of the perek: nothing to flush yet.
                    new_perek = False
                    last_mishna = 1
                else:
                    perek_level_list.append(mishna_level_list)
                    mishna_level_list = []
                    number_match = mishna_number_regex.search(each_line)
                    mishna_number = util.getGematria(number_match.group(1))
                    # Pad one empty mishnah per skipped number.
                    for _ in range(mishna_number - last_mishna - 1):
                        perek_level_list.append([])
                    last_mishna = mishna_number
            else:
                mishna_level_list.append(clean_up_string(each_line))
    book.append(perek_level_list)
    return book
def regs_devide(lines, regs, eof=None):
    """Divide *lines* into a jagged array of simanim using the first regex in
    *regs* as the section delimiter.

    Lines are accumulated into ``letter``; when a delimiter (or the optional
    *eof* pattern) is hit, the accumulated text joins ``siman``, and when the
    delimiter's gematria restarts at 1 (or eof fires) the siman joins ``ja``.

    :param lines: iterable of text lines.
    :param regs: list of delimiter regexes; only regs[0] is used here.
    :param eof: optional end-of-section regex.
    :return: nested list of simanim (each a list of joined letter strings).
    """
    reg = regs[0]
    ja = []
    letter = []   # lines of the current letter, joined on flush
    siman = []    # letters of the current siman
    for line in lines:
        comb_letter = ' '.join(letter)
        if re.search(reg, line) or (eof and re.search(eof, line)):
            # Delimiter reached: flush the accumulated letter into the siman.
            siman.append(comb_letter)
            letter = []
            if re.search(reg, line):
                gim = getGematria(re.search(reg, line).group(1))
            # NOTE(review): `gim` may be unbound here if the very first
            # delimiter line matches only `eof` — presumably input always
            # starts with a `reg` match; confirm against callers.
            if gim == 1 or (eof and re.search(eof, line)):
                # Numbering restarted (or eof): close out the siman.
                ja.append(siman)
                if siman == ['']:
                    ja.pop()  # drop the empty siman produced by the first flush
                siman = []
        letter.append(line)
    # NOTE(review): without an `eof` match, the trailing letter/siman are
    # never flushed into `ja` — apparently relied upon by callers.
    return ja
def _set_he_section_transitions(self): transition_list = [] current_segment = 1 for seg_num, segment in enumerate(self._hebrew_segments): match = re.match(u'^([\u05d0-\u05d8]|[\u05d9-\u05dc][\u05d0-\u05d8]?|\u05d8[\u05d5\u05d6])\.\s', segment) if not match: continue next_segment = getGematria(match.group(1)) if next_segment == 1: pass elif next_segment - current_segment != 1: print "Bad hebrew section transition found in chapter {}".format(self.number) raise AssertionError else: transition_list.append(seg_num) current_segment = next_segment self._he_section_transitions = tuple(transition_list)