def wrapper(items):
    """Partition ``items`` into runs keyed by the value ``func`` assigns.

    Relies on closure variables from the enclosing decorator factory:
    ``func`` (marker extractor), ``start_method`` (optional hook),
    ``include_matches`` (keep the marker item in its run) and
    ``one_indexed`` (drop the padding element at index 0).

    :param items: sequence to partition.
    :return: list of runs (padded via ``convert_dict_to_array``), or ``[]``
        when no item produced a marker value.
    :raises ClashError: if two items produce the same marker value.
    """
    if callable(start_method):
        start_method()
    index_mapping = {}
    for item_num, item in enumerate(items):
        value = func(item)
        # FIX: original used `value is 0`, an identity test that only works
        # by CPython's small-int caching accident; equality is what is meant.
        if value or value == 0:
            if value in index_mapping:
                raise ClashError
            index_mapping[value] = item_num
    values = sorted(index_mapping.keys())
    list_mapping = {}
    current, next_ = values[:-1], values[1:]
    for start, end in zip(current, next_):
        if include_matches:
            list_mapping[start] = items[index_mapping[start]:index_mapping[end]]
        else:
            # skip the marker item itself
            list_mapping[start] = items[index_mapping[start] + 1:index_mapping[end]]
    try:
        last_value, last_index = values[-1], index_mapping[values[-1]]
    except IndexError:
        # no markers at all -> nothing to partition
        return []
    if not include_matches:
        last_index += 1
    list_mapping[last_value] = items[last_index:]
    if one_indexed:
        return convert_dict_to_array(list_mapping, list)[1:]
    else:
        return convert_dict_to_array(list_mapping, list)
def build_structure(self):
    """Build nested chapter/verse arrays from the flat important_text lines."""
    chapters = {}
    for line in self.important_text:
        verses = chapters.setdefault(line['chapter'], {})
        verses[line['verse']] = self.structure_comments(line['text'])
    # flatten the sparse chapter dict, then each verse dict, into padded lists
    as_array = util.convert_dict_to_array(chapters)
    return [util.convert_dict_to_array(section) for section in as_array]
def build_structure(self):
    """Assemble the comment structure as nested arrays indexed by chapter and verse."""
    by_chapter = {}
    for entry in self.important_text:
        ch = entry['chapter']
        if ch not in by_chapter:
            by_chapter[ch] = {}
        by_chapter[ch][entry['verse']] = self.structure_comments(entry['text'])
    # pad the chapter dict into a list, then pad each chapter's verse dict
    flattened = util.convert_dict_to_array(by_chapter)
    for pos, chapter_dict in enumerate(flattened):
        flattened[pos] = util.convert_dict_to_array(chapter_dict)
    return flattened
def test_convert_dict_to_array():
    """Missing integer keys are padded with the element type's default ('' for str)."""
    sparse = {1: 'foo', 3: 'bar', 5: 'baz'}
    expected = ['', 'foo', '', 'bar', '', 'baz']
    assert util.convert_dict_to_array(sparse, str) == expected
def parse(cls, lines):
    """Split ``lines`` into seifim and return one ``cls`` instance per seif.

    A seif starts at a line whose English text opens with a bold number
    (``<b>NN``); lines before the first marker are folded into seif 1.

    :param lines: sequence of dicts carrying an 'English' text field.
    :return: list of ``cls`` instances, one per (padded) seif.
    :raises ClashError: when a seif number re-appears after a different
        seif has already started.
    """
    seif_mapping = {}
    current_seif = -1
    for line_num, line in enumerate(lines):
        # FIX: raw string -- the original u'^<b>(\\d+)\\.?' relies on '\\d'
        # not being a recognized escape, which warns on modern Pythons;
        # the compiled pattern is identical.
        seif_mark = re.search(r'^<b>(\d+)\.?', line['English'])
        if seif_mark:
            seif_value = int(seif_mark.group(1))
            if seif_value in seif_mapping:
                if seif_value == current_seif:
                    # repeated marker inside the same seif is harmless
                    continue
                else:
                    raise ClashError
            seif_mapping[seif_value] = line_num
            current_seif = seif_value
    if 1 not in seif_mapping:
        seif_mapping[1] = 0
    seifim = sorted(seif_mapping.keys())
    list_mapping = {}
    for seif, next_seif in zip(seifim[:-1], seifim[1:]):
        list_mapping[seif] = lines[seif_mapping[seif]:seif_mapping[next_seif]]
    last_seif = seifim[-1]
    list_mapping[last_seif] = lines[seif_mapping[last_seif]:]
    # drop the padding element at index 0 (seifim are 1-indexed)
    return [cls(l) for l in convert_dict_to_array(list_mapping, list)[1:]]
def parse(cls, lines):
    """Group ``lines`` into seifim (by leading ``<b>NN`` marker) and wrap each in ``cls``.

    Lines preceding the first marker are assigned to seif 1.

    :param lines: sequence of dicts with an 'English' text field.
    :return: list of ``cls`` instances in seif order, with gaps padded.
    :raises ClashError: duplicate seif number outside the current seif.
    """
    positions = {}
    active_seif = -1
    for idx, row in enumerate(lines):
        # FIX: use a raw string; u'^<b>(\\d+)\\.?' produced an
        # invalid-escape warning on Python 3 while matching the same text.
        marker = re.search(r'^<b>(\d+)\.?', row['English'])
        if marker:
            number = int(marker.group(1))
            if number in positions:
                if number == active_seif:
                    # same seif restated -- ignore
                    continue
                else:
                    raise ClashError
            positions[number] = idx
            active_seif = number
    if 1 not in positions:
        positions[1] = 0
    ordered = sorted(positions.keys())
    runs = {}
    for this_seif, following in zip(ordered[:-1], ordered[1:]):
        runs[this_seif] = lines[positions[this_seif]:positions[following]]
    final = ordered[-1]
    runs[final] = lines[positions[final]:]
    # index 0 is padding; seif numbering starts at 1
    return [cls(l) for l in convert_dict_to_array(runs, list)[1:]]
def parse(file):
    """Parse an @-tagged source file into nested perek/mishnah text.

    :param file: path to the input file; lines are decoded as UTF-8
        (Python 2 byte-file convention, matching the rest of this module).
    :return: nested list -- result[perek][mishnah] -> list of segment strings.
    """
    text = {1: {}}
    perek = 1
    mishnah = 0
    # FIX: the original `open()` handle was never closed; `with` guarantees it.
    with open(file) as infile:
        for line in infile:
            line = line.decode('utf-8')
            # @00 marks a header line -- skip it entirely
            if line.find("@00") == 0:
                continue
            poss_mishnah = getMishnah(line)
            if poss_mishnah:
                poss_mishnah = ChetAndHey(poss_mishnah, mishnah)
                if poss_mishnah not in text[perek]:
                    text[perek][poss_mishnah] = []
                    mishnah = poss_mishnah
                else:
                    # a repeated mishnah number signals the start of a new perek
                    assert poss_mishnah == 1
                    mishnah = 1
                    perek += 1
                    text[perek] = {}
                    text[perek][1] = []
            line = getLine(line)
            if line:
                if line.find("@22") == 0:
                    # drop the @22 token at the head of the line
                    line = " ".join(line.split(" ")[1:])
                if line.find("@58") >= 0 or line.find("@78") >= 0:
                    # FIX: raw string for the regex (avoids invalid '\S' escape)
                    for match in re.findall(r"@58\S+|@78\S+", line):
                        line = line.replace(match, "")
                line = line.replace("@11", "<b>").replace("@33", "</b>")
                line = line.replace("@66", "<small>(").replace("@77", ")</small>")
                line = removeAllTags(line)
                # each <b>-opened fragment becomes its own segment
                for each_line in line.split("<b>")[1:]:
                    text[perek][mishnah].append("<b>" + each_line)
                # (the original's unused `prev_line` bookkeeping was dropped)
    for perek in text:
        text[perek] = convert_dict_to_array(text[perek])
    return convert_dict_to_array(text)
def produce_parsed_data(filename):
    """Parse `filename` into a jagged array padded by section gematria numbers."""
    with codecs.open(filename, 'r', 'utf-8') as datafile:
        parsed = util.file_to_ja(3, datafile, (m_pattern, comment_pattern), nothing)
        # rewind so the section headers can be scanned a second time
        datafile.seek(0)
        headers = util.grab_section_names(m_pattern, datafile, 1)
        numbers = [int(util.getGematria(h)) for h in headers]
        padded = util.simple_to_complex(numbers, parsed.array())
        return util.convert_dict_to_array(padded)
def break_into_simanim(text):
    """Split the full text into a padded list of simanim keyed by gematria value."""
    # gather the letter labels of every siman header, then their numeric values
    siman_letters = re.findall(r'@01\s*\u05e1\u05d9\u05de\u05df\s*([\u05d0-\u05ea]+)', text)
    siman_numbers = make_gematria_list(siman_letters)
    # split the whole text on the siman headers themselves
    pieces = re.split(r'@01\s*\u05e1\u05d9\u05de\u05df\s*[\u05d0-\u05ea]+\s*@02', text)
    pieces.pop(0)  # discard whatever precedes the first header
    # number -> siman body; flattening pads the gaps between numbers
    return convert_dict_to_array(dict(zip(siman_numbers, pieces)))
def wrapper(items):
    """Split ``items`` into runs delimited by the marker values ``func`` yields.

    Uses enclosing-scope variables: ``func``, ``start_method``,
    ``include_matches`` and ``one_indexed`` (decorator-factory closure).

    :param items: sequence to split.
    :return: padded list of runs, or ``[]`` when no marker was found.
    :raises ClashError: duplicate marker value.
    """
    if callable(start_method):
        start_method()
    positions = {}  # marker value -> index of the item that produced it
    for pos, element in enumerate(items):
        marker = func(element)
        # FIX: `marker is 0` compared identity, not equality; `==` is correct.
        if marker or marker == 0:
            if marker in positions:
                raise ClashError
            positions[marker] = pos
    ordered = sorted(positions.keys())
    sections = {}
    for lo, hi in zip(ordered[:-1], ordered[1:]):
        # when markers are excluded, the run starts just past the marker item
        begin = positions[lo] if include_matches else positions[lo] + 1
        sections[lo] = items[begin:positions[hi]]
    try:
        tail_key, tail_start = ordered[-1], positions[ordered[-1]]
    except IndexError:
        # no markers at all
        return []
    if not include_matches:
        tail_start += 1
    sections[tail_key] = items[tail_start:]
    result = convert_dict_to_array(sections, list)
    return result[1:] if one_indexed else result
def produce_parsed_data(filename):
    """Parse `filename` into a padded jagged array of sections (3-deep template)."""
    with codecs.open(filename, 'r', 'utf-8') as datafile:
        jagged = util.file_to_ja([[[]]], datafile, (m_pattern, comment_pattern), nothing)
        # second pass over the same file collects the section header names
        datafile.seek(0)
        section_names = util.grab_section_names(m_pattern, datafile, 1)
        section_numbers = [int(util.getGematria(name)) for name in section_names]
        keyed = util.simple_to_complex(section_numbers, jagged.array())
        return util.convert_dict_to_array(keyed)
def align_boaz_chapters(source_file, simple_array):
    """
    Pad a naive Boaz parse with empty sections for chapters that have no text.

    :param source_file: File from which to derive chapter numbers
    :param simple_array: A "naive" parse of the data structured as a nested list.
    :return: Nested array, with proper padding to account for empty chapters.
    """
    headers = util.grab_section_names(u'@00פרק ([\u05d0-\u05ea]{1,2})', source_file, 1)
    chapter_numbers = [util.getGematria(h) for h in headers]
    keyed = util.simple_to_complex(chapter_numbers, simple_array)
    return util.convert_dict_to_array(keyed)
def break_into_seifim(simanim):
    """Replace each non-empty siman string in-place with a padded list of its seifim."""
    for idx in range(len(simanim)):
        raw = simanim[idx]
        if not raw:
            continue
        # seif letter labels -> numeric values, shifted up by one
        letters = re.findall(r'@07\s*\u05e1\u05e2\u05d9\u05e3\s*([\u05d0-\u05ea]+)', raw)
        numbers = [g + 1 for g in make_gematria_list(letters)]
        chunks = re.split(r'@07', raw)
        if chunks[0]:
            # text before the first @07 marker gets slot 0
            numbers.insert(0, 0)
        else:
            chunks.pop(0)
        if numbers == [0]:
            # a siman with a single unmarked seif: treat it as seif 1
            numbers[0] = 1
        simanim[idx] = convert_dict_to_array(dict(zip(numbers, chunks)))
    return simanim
def _parse(self):
    """Flatten self._important_lines ({chapter: {verse: ...}}) into nested arrays.

    NOTE: mutates self._important_lines in place while flattening each chapter.
    """
    raw = self._important_lines
    for ch in raw.keys():
        raw[ch] = util.convert_dict_to_array(raw[ch])
    return util.convert_dict_to_array(raw)
def parse(html_page, csv_page):
    """Parse the "A New Israeli Commentary on Pirkei Avot" HTML export.

    Walks every <p> of the page, classifies it by CSS class (intro text,
    location header, green headline, mishna text, running commentary),
    accumulates segments per mishna and per chapter, and records
    commentary links along the way.

    NOTE(review): this block was reconstructed from whitespace-mangled
    source; the nesting of the statements flagged below is inferred and
    should be confirmed against the original file.

    :param html_page: path to the HTML export.
    :param csv_page: path to a CSV whose rows map shinan mishna refs to
        vilna refs (column 0 -> column 1).
    :return: dict with keys 'intro' (list of paragraph strings),
        'content' (nested chapter/mishna/segment arrays) and 'links'.
    """

    # ---- paragraph classifiers, keyed on the export's CSS class names ----

    def end_of_intro(html_fragment):
        # the first page-number paragraph ends the introduction
        soup = html_fragment
        if soup["class"][0] == u"_-מספר-עמוד":
            return True
        return False

    def contains_loc(html_fragment):
        # page-number paragraph whose text opens a new chapter location
        soup = html_fragment
        if soup["class"][0] == u"_-מספר-עמוד" and re.match(u'פרק', soup.text):
            return True
        return False

    def contains_range(html_fragment):
        # header covers a range of mishnayot rather than a single one
        soup = html_fragment
        if re.search(u'משניות', soup.text):
            return True
        return False

    def contains_headline(html_fragment):
        soup = html_fragment
        if soup["class"][0] == u"_-כותרת-ירוקה":
            return True
        return False

    def contains_mishna(html_fragment):
        # mishna paragraphs carry at least one green-commentary span
        soup = html_fragment
        if soup.find('span', {"class": u"_-פירוש-בירוק"}):
            return True
        return False

    def contains_commentary(html_fragment):
        soup = html_fragment
        if soup["class"][0] == u"_טקסט-רץ":
            return True
        return False

    def get_chapter(html_fragment):
        # chapter letter -> numeric string via gematria
        soup = html_fragment
        return unicode(gematria(re.search(u'פרק (.)', soup.text).group(1)))

    def get_mishna(html_fragment):
        # for a range header, the first mishna of the range is used
        soup = html_fragment
        if contains_range(soup):
            return unicode(gematria(re.search(u'משניות (.*)-.*', soup.text).group(1)))
        return unicode(gematria(re.search(u'משנה (.*)', soup.text).group(1)))

    def get_loc(html_fragment):
        # location dict; 'map' is "chapter:mishna"
        soup = html_fragment
        location = {
            'map': get_chapter(soup) + u':' + get_mishna(soup),
            'ch': get_chapter(soup),
            'mishna': get_mishna(soup)
        }
        return location

    def convert_to_vilna(vilna_string):
        # build the same location dict from a "ch:mishna" vilna string
        location = {
            'map': vilna_string,
            'ch': vilna_string[:1],
            'mishna': re.search(u':(.*)', vilna_string).group(1)
        }
        return location

    intro, first_title, only_commentary = True, True, True
    links, intro_text, chapters, mishnayot, segments = [], [], {}, {}, []
    cur_loc = {'map': u'1:1', 'ch': u'1', 'mishna': u'1'}
    # shinan-ref -> vilna-ref mapping from the CSV
    infile = io.open(csv_page, 'r')
    reader = csv.reader(infile)
    mishna_map = dict((row[0], row[1]) for row in reader)
    infile.close()
    infile = io.open(html_page, 'r')
    soup = BeautifulSoup(infile, 'html5lib')
    infile.close()
    for p in soup.find_all('p'):
        if intro:
            if end_of_intro(p):
                intro = False
            else:
                intro_text.append(p.text.strip())
                continue
        if contains_loc(p):
            new_loc = get_loc(p)
            # reconcile clashes between shinan / vilna structures
            if mishna_map[new_loc['map']] != new_loc['map']:
                new_loc = convert_to_vilna(mishna_map[new_loc['map']])
            # store previous mishna
            if cur_loc['mishna'] != new_loc['mishna'] and segments:
                if only_commentary:
                    # no headline was seen: link the whole 1..N segment span
                    pirkei_ref = u"Pirkei Avot " + cur_loc['map']
                    shinan_ref = u"A New Israeli Commentary on Pirkei Avot {}:1-{}".format(cur_loc['map'], unicode(len(segments)))
                    links.append({
                        'refs': [pirkei_ref, shinan_ref],
                        'type': 'commentary',
                        'auto': True,
                        'generated_by': 'Shinan on Avot parser'
                    })
                mishnayot[int(cur_loc['mishna'])] = segments
                segments = []
            # store previous chapter
            if cur_loc['ch'] != new_loc['ch'] and mishnayot:
                chapters[int(cur_loc['ch'])] = mishnayot
                mishnayot = {}
            cur_loc = new_loc
            first_title = True
            only_commentary = True
        # 1-based index the next appended segment would get
        if segments:
            start = unicode(len(segments)+1)
        else:
            start = u'1'
        if contains_headline(p):
            if first_title:
                pirkei_ref = u"Pirkei Avot " + cur_loc['map']
                shinan_ref = u"A New Israeli Commentary on Pirkei Avot {}:{}-{}".format(cur_loc['map'], start, unicode(len(segments)))
                links.append({
                    'refs': [pirkei_ref, shinan_ref],
                    'type': 'commentary',
                    'auto': True,
                    'generated_by': 'Shinan on Avot parser'
                })
                first_title = False
                only_commentary = False
            # NOTE(review): inferred nesting -- the headline text is appended
            # for every headline paragraph, not only the first one
            chunk = p.text.replace(p.text, u"<b>" + p.text.strip() + "</b>")
            segments.append(chunk)
        elif contains_mishna(p):
            # first pass: fold stray " " / "." text nodes into the preceding node
            for child in p.children:
                if isinstance(child, NavigableString):
                    if child == u' ' or child == u'.':
                        if not isinstance(child.previous_sibling, NavigableString):
                            child.previous_sibling.string += child
                        else:
                            child.previous_sibling += child
            chunk = u''
            # second pass: stitch the children into bolded segments
            for child in p.children:
                if isinstance(child, NavigableString):
                    chunk += unicode(child)
                elif child["class"][0] == u"CharOverride-25":
                    chunk += child.text
                elif child["class"][0] == u"_-פירוש-בירוק":
                    if child.text == u' ' or child.text == u'.':
                        continue
                    if child.previous_sibling is None:
                        # green span opens the paragraph
                        if isinstance(child.next_sibling, NavigableString):
                            chunk = child.text.replace(child.text, u'<b>' + child.text.strip() + u'.</b> ')
                        elif child.next_sibling["class"][0] == u'_-':
                            chunk = child.text.replace(child.text, u'<b>' + child.text.strip() + u'.</b> ')
                        elif child.next_sibling["class"][0] == u'_-פירוש-בירוק':
                            chunk = child.text.replace(child.text, u'<b>' + child.text.strip() + child.next_sibling.text.strip() + u'.</b> ')
                    elif isinstance(child.previous_sibling, NavigableString):
                        if not isinstance(child.previous_sibling.previous_sibling, NavigableString):
                            if child.previous_sibling.previous_sibling is not None:
                                if child.previous_sibling.previous_sibling["class"][0] == u'_-פירוש-בירוק':
                                    # two green spans separated only by text: already merged
                                    continue
                        # flush the segment accumulated so far, start a new one
                        segments.append(chunk)
                        if isinstance(child.next_sibling, NavigableString):
                            if child.next_sibling != u" ":
                                print(child)
                            elif child.next_sibling.next_sibling["class"][0] == u'_-פירוש-בירוק':
                                chunk = child.text.replace(child.text, u'<b>' + child.text.strip() + child.next_sibling + child.next_sibling.next_sibling.text.strip() + u'.</b> ')
                            elif child.next_sibling.next_sibling["class"][0] == u'_-':
                                chunk = child.text.replace(child.text, u'<b>' + child.text.strip() + u'.</b> ')
                        elif child.next_sibling["class"][0] == u'_-':
                            chunk = child.text.replace(child.text, u'<b>' + child.text.strip() + u'.</b> ')
                        elif child.next_sibling["class"][0] == u'_-פירוש-בירוק':
                            chunk = child.text.replace(child.text, u'<b>' + child.text.strip() + child.next_sibling.text.strip() + u'.</b> ')
                    elif child.previous_sibling["class"][0] == u'_-פירוש-בירוק':
                        continue
                    elif child.previous_sibling["class"][0] == u"CharOverride-25":
                        segments.append(chunk)
                        if child.next_sibling["class"][0] == u'_-':
                            chunk = child.text.replace(child.text, u'<b>' + child.text.strip() + u'.</b> ')
                        elif child.next_sibling["class"][0] == u'_-פירוש-בירוק':
                            chunk = child.text.replace(child.text, u'<b>' + child.text.strip() + child.next_sibling.text.strip() + u'.</b> ')
            else:
                # NOTE(review): inferred as the for/else of the child loop --
                # flushes the last accumulated chunk of the paragraph
                segments.append(chunk)
        elif contains_commentary(p):
            chunk = p.text.strip()
            segments.append(chunk)
    else:
        # NOTE(review): inferred as the for/else of the paragraph loop --
        # flush the final mishna and chapter after the last <p>
        pirkei_ref = u"Pirkei Avot " + cur_loc['map']
        shinan_ref = u"A New Israeli Commentary on Pirkei Avot {}:1-{}".format(cur_loc['map'], unicode(len(segments)))
        links.append({
            'refs': [pirkei_ref, shinan_ref],
            'type': 'commentary',
            'auto': True,
            'generated_by': 'Shinan on Avot parser'
        })
        mishnayot[int(cur_loc['mishna'])] = segments
        chapters[int(cur_loc['ch'])] = mishnayot
    # flatten chapter and mishna dicts into padded nested arrays
    for chapter in chapters.keys():
        chapters[chapter] = util.convert_dict_to_array(chapters[chapter])
    chapters = util.convert_dict_to_array(chapters)
    output = {
        "intro": intro_text,
        "content": chapters,
        "links": links
    }
    return output