예제 #1
0
    def wrapper(items):
        """Split ``items`` into sections keyed by the markers ``func`` finds.

        ``func`` is applied to every item.  A truthy result, or the integer
        ``0``, marks that item as the start of the section keyed by that
        result; any other falsy result means "no marker here".  Sections are
        cut between the item index of each key and the item index of the next
        larger key; the section of the largest key runs to the end of
        ``items``.  When ``include_matches`` is falsy the marker item itself
        is left out of its section.

        ``start_method``, ``func``, ``include_matches`` and ``one_indexed``
        come from the enclosing scope.

        :param items: sliceable sequence of items to partition.
        :return: ``convert_dict_to_array(list_mapping, list)``, without its
            first element when ``one_indexed`` is truthy; ``[]`` when no item
            produced a marker.
        :raises ClashError: if two items produce the same marker value.
        """
        if callable(start_method):
            start_method()

        index_mapping = {}
        for item_num, item in enumerate(items):
            value = func(item)
            # Compare by type and value instead of ``value is 0``: identity
            # against an int literal only works through CPython's small-int
            # cache and is a SyntaxWarning on Python 3.8+.  bool is excluded
            # so that False is still treated as "no marker".
            is_int_zero = (isinstance(value, int)
                           and not isinstance(value, bool)
                           and value == 0)
            if not value and not is_int_zero:
                continue
            if value in index_mapping:
                raise ClashError
            index_mapping[value] = item_num

        if not index_mapping:
            return []

        values = sorted(index_mapping)
        # Skip the marker item itself unless matches are to be included.
        offset = 0 if include_matches else 1

        list_mapping = {}
        for start, end in zip(values, values[1:]):
            list_mapping[start] = items[index_mapping[start] + offset:index_mapping[end]]

        last_value = values[-1]
        list_mapping[last_value] = items[index_mapping[last_value] + offset:]

        result = convert_dict_to_array(list_mapping, list)
        return result[1:] if one_indexed else result
예제 #2
0
    def build_structure(self):
        """Arrange ``self.important_text`` into a nested chapter/verse array.

        Every entry of ``self.important_text`` is read through its
        ``'chapter'``, ``'verse'`` and ``'text'`` keys; the text is passed
        through ``self.structure_comments`` and stored under
        ``book[chapter][verse]``.  Both dictionary levels are then converted
        with ``util.convert_dict_to_array``.

        :return: the converted outer array whose elements are the converted
            per-chapter arrays.
        """
        book = {}
        for line in self.important_text:
            chapter, verse = line['chapter'], line['verse']
            # setdefault creates the chapter dict on first sight; no need to
            # materialise book.keys() for a membership test on every line.
            book.setdefault(chapter, {})[verse] = self.structure_comments(line['text'])

        book = util.convert_dict_to_array(book)
        for index, section in enumerate(book):
            book[index] = util.convert_dict_to_array(section)
        return book
    def build_structure(self):
        """Build a nested chapter -> verse array from ``self.important_text``.

        Each line dict supplies ``'chapter'``, ``'verse'`` and ``'text'``.
        The text is processed by ``self.structure_comments`` and filed under
        its chapter and verse; the two dictionary levels are afterwards
        turned into arrays by ``util.convert_dict_to_array``.

        :return: outer array (from ``util.convert_dict_to_array``) holding one
            converted array per chapter slot.
        """
        book = {}
        for line in self.important_text:
            chapter, verse = line['chapter'], line['verse']
            # Direct dict access via setdefault replaces the
            # ``chapter not in book.keys()`` test, which builds a key list on
            # Python 2 for every single line.
            book.setdefault(chapter, {})[verse] = self.structure_comments(line['text'])

        book = util.convert_dict_to_array(book)
        for index, section in enumerate(book):
            book[index] = util.convert_dict_to_array(section)
        return book
예제 #4
0
def test_convert_dict_to_array():
    """Keys 1, 3 and 5 must land at those indices, with '' in slots 0, 2, 4."""
    sparse = {1: 'foo', 3: 'bar', 5: 'baz'}
    expected = ['', 'foo', '', 'bar', '', 'baz']

    dense = util.convert_dict_to_array(sparse, str)

    assert dense == expected
예제 #5
0
    def parse(cls, lines):
        """Group ``lines`` by seif number and build one ``cls`` per seif slot.

        A line starts a new seif when its ``'English'`` field begins with
        ``<b>`` followed by digits (optionally followed by a dot).  A line
        repeating the number of the seif currently being read is treated as a
        continuation; a repeat of any earlier number is an error.  If no line
        announces seif 1, seif 1 is assumed to start at line 0.

        :param lines: sequence of dict-like lines exposing an ``'English'`` key.
        :return: list of ``cls`` instances, one per element of
            ``convert_dict_to_array(list_mapping, list)`` after dropping
            index 0.
        :raises ClashError: if a seif number reappears after a different seif
            has started.
        """
        # Backslashes are escaped explicitly: '\d' and '\.' in a non-raw
        # literal are invalid escape sequences on Python 3.  The pattern text
        # itself is unchanged.  Compiled once instead of per line.
        seif_pattern = re.compile(u'^<b>(\\d+)\\.?')

        seif_mapping = {}
        current_seif = -1
        for line_num, line in enumerate(lines):
            seif_mark = seif_pattern.search(line['English'])
            if not seif_mark:
                continue
            seif_value = int(seif_mark.group(1))
            if seif_value in seif_mapping:
                if seif_value == current_seif:
                    # same seif announced again: keep the first start index
                    continue
                raise ClashError

            seif_mapping[seif_value] = line_num
            current_seif = seif_value

        if 1 not in seif_mapping:
            seif_mapping[1] = 0

        seifim = sorted(seif_mapping)
        list_mapping = {}
        for seif, next_seif in zip(seifim, seifim[1:]):
            list_mapping[seif] = lines[seif_mapping[seif]:seif_mapping[next_seif]]
        last_seif = seifim[-1]
        list_mapping[last_seif] = lines[seif_mapping[last_seif]:]

        sections = convert_dict_to_array(list_mapping, list)[1:]
        return [cls(section) for section in sections]
예제 #6
0
    def parse(cls, lines):
        """Partition ``lines`` into seifim and wrap each one in ``cls``.

        The ``'English'`` field of every line is checked for a leading
        ``<b>`` + digits (+ optional dot) marker.  The first line carrying a
        given number fixes where that seif starts; further lines with the
        number of the seif currently open are skipped, while a return to an
        older number raises.  Seif 1 falls back to line 0 when it is never
        announced.

        :param lines: sequence of dict-like lines exposing an ``'English'`` key.
        :return: list with one ``cls`` instance per element of
            ``convert_dict_to_array(list_mapping, list)[1:]``.
        :raises ClashError: if a seif number shows up again after another
            seif has begun.
        """
        # Escaped backslashes keep the exact same pattern while avoiding the
        # invalid '\d' / '\.' escapes of a non-raw literal; compiling here
        # hoists the work out of the loop.
        seif_pattern = re.compile(u'^<b>(\\d+)\\.?')

        seif_mapping = {}
        current_seif = -1
        for line_num, line in enumerate(lines):
            seif_mark = seif_pattern.search(line['English'])
            if not seif_mark:
                continue
            seif_value = int(seif_mark.group(1))
            if seif_value in seif_mapping:
                if seif_value == current_seif:
                    # repeated marker inside the current seif: ignore it
                    continue
                raise ClashError

            seif_mapping[seif_value] = line_num
            current_seif = seif_value

        if 1 not in seif_mapping:
            seif_mapping[1] = 0

        seifim = sorted(seif_mapping)
        list_mapping = {}
        for seif, next_seif in zip(seifim, seifim[1:]):
            list_mapping[seif] = lines[seif_mapping[seif]:seif_mapping[next_seif]]
        last_seif = seifim[-1]
        list_mapping[last_seif] = lines[seif_mapping[last_seif]:]

        sections = convert_dict_to_array(list_mapping, list)[1:]
        return [cls(section) for section in sections]
예제 #7
0
def parse(file):
    """Parse a tagged source file into a nested perek -> mishnah -> comments array.

    Lines starting with ``@00`` are skipped.  ``getMishnah`` is asked for a
    mishnah marker on each remaining line; a marker not yet present in the
    current perek opens that mishnah, while a marker that is already present
    starts a new perek at mishnah 1.  The line body (via ``getLine``) is
    stripped of its ``@22`` lead word and ``@58...`` / ``@78...`` tokens, has
    ``@11``/``@33`` and ``@66``/``@77`` turned into HTML tags, and is split
    on ``<b>`` so that every bold-led piece becomes one comment.

    Written for Python 2: each line is decoded with ``str.decode``.

    NOTE(review): ``mishnah`` starts at 0 and ``text[1]`` starts empty, so
    comment text appearing before the first mishnah marker would raise
    ``KeyError`` -- confirm the input never does that.

    :param file: path of the file to read.
    :return: result of ``convert_dict_to_array`` applied to the per-perek
        arrays.
    """
    text = {1: {}}
    perek = 1
    mishnah = 0

    # The context manager closes the handle even if a helper raises; the
    # original rebound ``file`` to the handle and never closed it.
    with open(file) as source:
        for line in source:
            line = line.decode('utf-8')
            if line.startswith("@00"):
                continue

            poss_mishnah = getMishnah(line)
            if poss_mishnah:
                poss_mishnah = ChetAndHey(poss_mishnah, mishnah)
                if poss_mishnah not in text[perek]:
                    text[perek][poss_mishnah] = []
                    mishnah = poss_mishnah
                else:
                    # A number seen before in this perek is only accepted as
                    # a restart at 1, i.e. the beginning of the next perek.
                    assert poss_mishnah == 1
                    mishnah = 1
                    perek += 1
                    text[perek] = {1: []}

            line = getLine(line)
            if not line:
                continue

            if line.startswith("@22"):
                # drop the first space-separated word (the @22 token)
                line = " ".join(line.split(" ")[1:])
            if "@58" in line or "@78" in line:
                for match in re.findall(r"@58\S+|@78\S+", line):
                    line = line.replace(match, "")
            line = line.replace("@11", "<b>").replace("@33", "</b>")
            line = line.replace("@66", "<small>(").replace("@77", ")</small>")
            line = removeAllTags(line)

            # Anything before the first <b> is discarded; each remaining
            # piece gets its opening tag back.
            for each_line in line.split("<b>")[1:]:
                text[perek][mishnah].append("<b>" + each_line)

    for perek in text:
        text[perek] = convert_dict_to_array(text[perek])
    return convert_dict_to_array(text)
예제 #8
0
def produce_parsed_data(filename):
    """Parse ``filename`` and return its content as a padded nested array.

    The file is read twice through the same handle: once by
    ``util.file_to_ja`` and, after rewinding, once by
    ``util.grab_section_names``.  The section names are converted to integers
    through ``util.getGematria`` and handed, together with the parsed array,
    to ``util.simple_to_complex``; the result is converted with
    ``util.convert_dict_to_array``.

    :param filename: path of the UTF-8 encoded source file.
    :return: whatever ``util.convert_dict_to_array`` produces for the
        combined structure.
    """
    with codecs.open(filename, 'r', 'utf-8') as source:
        jagged = util.file_to_ja(3, source, (m_pattern, comment_pattern), nothing)

        # back to the top of the file for the second pass
        source.seek(0)

        section_names = util.grab_section_names(m_pattern, source, 1)
        section_numbers = [int(util.getGematria(raw)) for raw in section_names]

    keyed_sections = util.simple_to_complex(section_numbers, jagged.array())
    return util.convert_dict_to_array(keyed_sections)
예제 #9
0
def break_into_simanim(text):
    """Split ``text`` into simanim and return them as a padded array.

    Siman headers are matched by the ``@01 ... @02`` pattern below.  The
    letters captured from each header are turned into numbers by
    ``make_gematria_list``; the text following each header is paired with its
    number, and ``convert_dict_to_array`` turns that mapping into a list.

    :param text: the full text as one string.
    :return: result of ``convert_dict_to_array`` for the number -> siman map.
    """
    # letters identifying each siman, in order of appearance
    siman_letters = re.findall(r'@01\s*\u05e1\u05d9\u05de\u05df\s*([\u05d0-\u05ea]+)', text)
    siman_numbers = make_gematria_list(siman_letters)

    # Body text between headers; the piece before the first header is dropped.
    siman_bodies = re.split(r'@01\s*\u05e1\u05d9\u05de\u05df\s*[\u05d0-\u05ea]+\s*@02', text)[1:]

    numbered = dict(zip(siman_numbers, siman_bodies))
    return convert_dict_to_array(numbered)
예제 #10
0
    def wrapper(items):
        """Partition ``items`` into sections keyed by what ``func`` returns.

        Every item is passed to ``func``.  A truthy return value, or the
        integer ``0``, registers the item as the start of the section with
        that key; other falsy values are ignored.  A section spans from its
        own start index to the start index of the next larger key, and the
        largest key's section runs to the end of ``items``.  With a falsy
        ``include_matches`` the marker item is excluded from its section.

        ``start_method``, ``func``, ``include_matches`` and ``one_indexed``
        are taken from the enclosing scope.

        :param items: sliceable sequence of items to partition.
        :return: ``convert_dict_to_array(list_mapping, list)``, minus its
            first element when ``one_indexed`` is truthy; ``[]`` if ``func``
            never produced a marker.
        :raises ClashError: if the same marker value is produced twice.
        """
        if callable(start_method):
            start_method()

        index_mapping = {}
        for item_num, item in enumerate(items):
            value = func(item)
            # ``value is 0`` relied on CPython caching small ints and warns on
            # Python 3.8+; test type and value instead.  bool is excluded so
            # that False keeps meaning "no marker".
            is_int_zero = (isinstance(value, int)
                           and not isinstance(value, bool)
                           and value == 0)
            if not value and not is_int_zero:
                continue
            if value in index_mapping:
                raise ClashError
            index_mapping[value] = item_num

        if not index_mapping:
            return []

        values = sorted(index_mapping)
        # One past the marker when the marker itself is not wanted.
        offset = 0 if include_matches else 1

        list_mapping = {}
        for start, end in zip(values, values[1:]):
            list_mapping[start] = items[index_mapping[start] + offset:index_mapping[end]]

        last_value = values[-1]
        list_mapping[last_value] = items[index_mapping[last_value] + offset:]

        result = convert_dict_to_array(list_mapping, list)
        return result[1:] if one_indexed else result
예제 #11
0
def produce_parsed_data(filename):
    """Parse ``filename`` into a padded nested array.

    ``util.file_to_ja`` performs the first pass over the file (seeded with
    the ``[[[]]]`` template); the handle is then rewound so that
    ``util.grab_section_names`` can collect the section names, which are
    converted to integers via ``util.getGematria``.  Names and parsed array
    are merged by ``util.simple_to_complex`` and the result is converted with
    ``util.convert_dict_to_array``.

    :param filename: path of the UTF-8 encoded source file.
    :return: whatever ``util.convert_dict_to_array`` produces for the
        combined structure.
    """
    with codecs.open(filename, 'r', 'utf-8') as source:
        jagged = util.file_to_ja([[[]]], source, (m_pattern, comment_pattern), nothing)

        # rewind for the second pass over the same handle
        source.seek(0)

        section_names = util.grab_section_names(m_pattern, source, 1)
        section_numbers = [int(util.getGematria(raw)) for raw in section_names]

    keyed_sections = util.simple_to_complex(section_numbers, jagged.array())
    return util.convert_dict_to_array(keyed_sections)
예제 #12
0
def align_boaz_chapters(source_file, simple_array):
    """
    Pad a naive parse of Boaz so that chapters without text appear as empty sections.

    Boaz does not guarantee text for every chapter, so the chapter numbers are read from the source file and
     used to place each parsed section at its proper position.
    :param source_file: File from which to derive chapter numbers
    :param simple_array: A "naive" parse of the data structured as a nested list.
    :return: Nested array, with proper padding to account for empty chapters.
    """

    # chapter headings as found in the source file, converted to numbers
    chapter_names = util.grab_section_names(u'@00פרק ([\u05d0-\u05ea]{1,2})', source_file, 1)
    chapter_numbers = [util.getGematria(name) for name in chapter_names]

    keyed_chapters = util.simple_to_complex(chapter_numbers, simple_array)
    return util.convert_dict_to_array(keyed_chapters)
예제 #13
0
def break_into_seifim(simanim):
    """Replace every non-empty siman string in ``simanim`` by its array of seifim.

    Each siman is split on ``@07``.  The seif letters found in the siman are
    converted by ``make_gematria_list`` and shifted up by one; text preceding
    the first ``@07`` is keyed 0 (or 1 when it is the only piece).  The
    number -> text mapping is then converted with ``convert_dict_to_array``.

    :param simanim: list of siman strings; modified in place.
    :return: the same ``simanim`` list.
    """
    for index, siman in enumerate(simanim):
        if not siman:
            continue

        # letters of every seif header, then their numeric values shifted by one
        seif_letters = re.findall(r'@07\s*\u05e1\u05e2\u05d9\u05e3\s*([\u05d0-\u05ea]+)', siman)
        seif_numbers = make_gematria_list(seif_letters)
        seif_numbers[:] = [number + 1 for number in seif_numbers]

        # the split result is stored straight away, as the list is updated in place
        pieces = simanim[index] = re.split(r'@07', siman)
        if pieces[0]:
            # text before the first marker gets key 0
            seif_numbers.insert(0, 0)
        else:
            pieces.pop(0)

        # a siman with a single, unmarked seif: its lone key 0 becomes 1
        if len(seif_numbers) == 1 and seif_numbers[0] == 0:
            seif_numbers[0] += 1

        # pair numbers with texts and let the helper pad the gaps
        simanim[index] = convert_dict_to_array(dict(zip(seif_numbers, pieces)))
    return simanim
예제 #14
0
 def _parse(self):
     """Convert ``self._important_lines`` into a nested array.

     Every chapter value is replaced, in place, by the result of
     ``util.convert_dict_to_array``; the outer mapping is then converted
     the same way and returned.
     """
     chapters = self._important_lines
     for number, verses in list(chapters.items()):
         chapters[number] = util.convert_dict_to_array(verses)
     return util.convert_dict_to_array(chapters)
예제 #15
0
 def _parse(self):
     """Turn the chapter mapping in ``self._important_lines`` into arrays.

     Each chapter entry is overwritten in place with its
     ``util.convert_dict_to_array`` form, after which the whole mapping
     is converted and returned.
     """
     chapters = self._important_lines
     for number, verses in list(chapters.items()):
         chapters[number] = util.convert_dict_to_array(verses)
     return util.convert_dict_to_array(chapters)
예제 #16
0
def parse(html_page, csv_page):
    """Parse the commentary HTML into intro text, nested content and links.

    :param html_page: path of the HTML file; parsed with BeautifulSoup using
        the ``html5lib`` parser.
    :param csv_page: path of a CSV file; the first two columns of every row
        are loaded as a key -> value map used to translate location strings.
    :return: dict with the keys ``"intro"`` (list of stripped paragraph
        texts), ``"content"`` (chapters after ``util.convert_dict_to_array``)
        and ``"links"`` (list of link dicts built along the way).
    """


    def end_of_intro(html_fragment):
        """True when the fragment's first CSS class is the page-number class."""
        soup = html_fragment
        if soup["class"][0] == u"_-מספר-עמוד":
            return True
        return False


    def contains_loc(html_fragment):
        """True for a page-number-class fragment whose text starts with 'פרק'."""
        soup = html_fragment
        if soup["class"][0] == u"_-מספר-עמוד" and re.match(u'פרק', soup.text):
            return True
        return False


    def contains_range(html_fragment):
        """True when the fragment's text contains 'משניות' anywhere."""
        soup = html_fragment
        if re.search(u'משניות', soup.text):
            return True
        return False


    def contains_headline(html_fragment):
        """True when the fragment's first CSS class is the green-headline class."""
        soup = html_fragment
        if soup["class"][0] == u"_-כותרת-ירוקה":
            return True
        return False


    def contains_mishna(html_fragment):
        """True when the fragment holds a span with the green-commentary class."""
        soup = html_fragment
        if soup.find('span', {"class": u"_-פירוש-בירוק"}):
            return True
        return False


    def contains_commentary(html_fragment):
        """True when the fragment's first CSS class is the running-text class."""
        soup = html_fragment
        if soup["class"][0] == u"_טקסט-רץ":
            return True
        return False


    def get_chapter(html_fragment):
        """Return the chapter as a unicode string.

        Only the single character following 'פרק ' is captured and passed to
        ``gematria`` (presumably a Hebrew-letter-to-number helper -- verify).
        """
        soup = html_fragment
        return unicode(gematria(re.search(u'פרק (.)', soup.text).group(1)))

    def get_mishna(html_fragment):
        """Return the mishna as a unicode string.

        For a range ('משניות X-Y') the part before the dash is used.
        """
        soup = html_fragment
        if contains_range(soup):
            return unicode(gematria(re.search(u'משניות (.*)-.*', soup.text).group(1)))
        return unicode(gematria(re.search(u'משנה (.*)', soup.text).group(1)))


    def get_loc(html_fragment):
        """Build a location dict ('map' = 'chapter:mishna', 'ch', 'mishna')."""
        soup = html_fragment
        location = {
            'map': get_chapter(soup) + u':' + get_mishna(soup),
            'ch': get_chapter(soup),
            'mishna': get_mishna(soup)
        }
        return location


    def convert_to_vilna(vilna_string):
        """Build a location dict from a 'chapter:mishna' string.

        NOTE(review): 'ch' is the first character only, so a chapter number
        with more than one digit would be truncated -- confirm inputs.
        """
        location = {
            'map': vilna_string,
            'ch': vilna_string[:1],
            'mishna': re.search(u':(.*)', vilna_string).group(1)
        }
        return location

    # Parser state: flags, accumulators and the location currently being filled.
    intro, first_title, only_commentary = True, True, True
    links, intro_text, chapters, mishnayot, segments = [], [], {}, {}, []
    cur_loc = {'map': u'1:1', 'ch': u'1', 'mishna': u'1'}

    # Location translation table: column 0 -> column 1 of every CSV row.
    infile = io.open(csv_page, 'r')
    reader = csv.reader(infile)
    mishna_map = dict((row[0], row[1]) for row in reader)
    infile.close()

    infile = io.open(html_page, 'r')
    soup = BeautifulSoup(infile, 'html5lib')
    infile.close()

    for p in soup.find_all('p'):
        # Everything up to the first page-number paragraph is intro text; that
        # paragraph itself only ends the intro and is not processed further.
        if intro:
            if end_of_intro(p):
                intro = False
            else:
                intro_text.append(p.text.strip())
            continue

        if contains_loc(p):
            new_loc = get_loc(p)
            # reconcile clashes between shinan / vilna structures
            if mishna_map[new_loc['map']] != new_loc['map']:
                new_loc = convert_to_vilna(mishna_map[new_loc['map']])

            # store previous mishna
            if cur_loc['mishna'] != new_loc['mishna'] and segments:
                # No headline was seen for this mishna, so no link exists yet:
                # link the whole segment range.
                if only_commentary:
                    pirkei_ref = u"Pirkei Avot " + cur_loc['map']
                    shinan_ref = u"A New Israeli Commentary on Pirkei Avot {}:1-{}".format(cur_loc['map'], unicode(len(segments)))
                    links.append({
                        'refs': [pirkei_ref, shinan_ref],
                        'type': 'commentary',
                        'auto': True,
                        'generated_by': 'Shinan on Avot parser'
                    })
                mishnayot[int(cur_loc['mishna'])] = segments
                segments = []
            # store previous chapter
            if cur_loc['ch'] != new_loc['ch'] and mishnayot:
                chapters[int(cur_loc['ch'])] = mishnayot
                mishnayot = {}
            cur_loc = new_loc

            first_title = True
            only_commentary = True
            # 1-based index of the next segment to be appended; used as the
            # start of the link range created at the first headline.
            if segments:
                start = unicode(len(segments)+1)
            else:
                start = u'1'

        if contains_headline(p):
            if first_title:
                # NOTE(review): `start` is only assigned in the location branch
                # above; a headline seen before any location paragraph would
                # raise a NameError here -- confirm the input rules that out.
                pirkei_ref = u"Pirkei Avot " + cur_loc['map']
                shinan_ref = u"A New Israeli Commentary on Pirkei Avot {}:{}-{}".format(cur_loc['map'], start, unicode(len(segments)))
                links.append({
                    'refs': [pirkei_ref, shinan_ref],
                    'type': 'commentary',
                    'auto': True,
                    'generated_by': 'Shinan on Avot parser'
                })
                first_title = False
                only_commentary = False
            # The whole text is replaced, so chunk is simply the stripped text
            # wrapped in <b>...</b>.
            chunk = p.text.replace(p.text, u"<b>" + p.text.strip() + "</b>")
            segments.append(chunk)
        elif contains_mishna(p):

            # First pass: fold stand-alone ' ' / '.' text nodes into the node
            # before them.
            # NOTE(review): for a text-node sibling the += rebinds
            # child.previous_sibling instead of editing a tag's .string --
            # confirm this has the intended effect on the tree.
            for child in p.children:
                if isinstance(child, NavigableString):
                    if child == u' ' or child == u'.':
                        if not isinstance(child.previous_sibling, NavigableString):
                            child.previous_sibling.string += child
                        else:
                            child.previous_sibling += child

            # Second pass: build segments. A green-commentary span starts a new
            # chunk with a bold lead-in; text nodes and CharOverride-25 spans
            # extend the current chunk.
            chunk = u''
            for child in p.children:
                if isinstance(child, NavigableString):
                    chunk += unicode(child)
                elif child["class"][0] == u"CharOverride-25":
                    chunk += child.text
                elif child["class"][0] == u"_-פירוש-בירוק":
                    if child.text == u' ' or child.text == u'.':
                        continue
                    # First child of the paragraph: start the first chunk
                    # without flushing anything.
                    if child.previous_sibling is None:
                        if isinstance(child.next_sibling, NavigableString):
                            chunk = child.text.replace(child.text, u'<b>' + child.text.strip() + u'.</b> ')
                        elif child.next_sibling["class"][0] == u'_-':
                            chunk = child.text.replace(child.text, u'<b>' + child.text.strip() + u'.</b> ')
                        elif child.next_sibling["class"][0] == u'_-פירוש-בירוק':
                            chunk = child.text.replace(child.text, u'<b>' + child.text.strip() + child.next_sibling.text.strip() + u'.</b> ')
                    elif isinstance(child.previous_sibling, NavigableString):
                        # Skip a span that continues a green span located two
                        # siblings back (separated only by a text node).
                        if not isinstance(child.previous_sibling.previous_sibling, NavigableString):
                            if child.previous_sibling.previous_sibling is not None:
                                if child.previous_sibling.previous_sibling["class"][0] == u'_-פירוש-בירוק':
                                    continue
                        # Flush the chunk built so far before starting the next one.
                        segments.append(chunk)
                        if isinstance(child.next_sibling, NavigableString):
                            if child.next_sibling != u" ":
                                print(child)
                            elif child.next_sibling.next_sibling["class"][0] == u'_-פירוש-בירוק':
                                chunk = child.text.replace(child.text, u'<b>' + child.text.strip() + child.next_sibling + child.next_sibling.next_sibling.text.strip() + u'.</b> ')
                            elif child.next_sibling.next_sibling["class"][0] == u'_-':
                                chunk = child.text.replace(child.text, u'<b>' + child.text.strip() + u'.</b> ')
                        elif child.next_sibling["class"][0] == u'_-':
                            chunk = child.text.replace(child.text, u'<b>' + child.text.strip() + u'.</b> ')
                        elif child.next_sibling["class"][0] == u'_-פירוש-בירוק':
                            chunk = child.text.replace(child.text, u'<b>' + child.text.strip() + child.next_sibling.text.strip() + u'.</b> ')
                    elif child.previous_sibling["class"][0] == u'_-פירוש-בירוק':
                        # Directly follows another green span: skipped here.
                        continue
                    elif child.previous_sibling["class"][0] == u"CharOverride-25":
                        segments.append(chunk)
                        if child.next_sibling["class"][0] == u'_-':
                            chunk = child.text.replace(child.text, u'<b>' + child.text.strip() + u'.</b> ')
                        elif child.next_sibling["class"][0] == u'_-פירוש-בירוק':
                            chunk = child.text.replace(child.text, u'<b>' + child.text.strip() + child.next_sibling.text.strip() + u'.</b> ')
            else:
                # The loop has no `break`, so this always runs once the children
                # are exhausted: flush the last chunk of the paragraph.
                segments.append(chunk)

        elif contains_commentary(p):
            chunk = p.text.strip()
            segments.append(chunk)
    else:
        # The paragraph loop has no `break`, so this always runs after it:
        # link and store whatever mishna and chapter were still open.
        pirkei_ref = u"Pirkei Avot " + cur_loc['map']
        shinan_ref = u"A New Israeli Commentary on Pirkei Avot {}:1-{}".format(cur_loc['map'], unicode(len(segments)))
        links.append({
            'refs': [pirkei_ref, shinan_ref],
            'type': 'commentary',
            'auto': True,
            'generated_by': 'Shinan on Avot parser'
        })
        mishnayot[int(cur_loc['mishna'])] = segments
        chapters[int(cur_loc['ch'])] = mishnayot

    # Convert both dictionary levels into arrays via the util helper.
    for chapter in chapters.keys():
        chapters[chapter] = util.convert_dict_to_array(chapters[chapter])
    chapters = util.convert_dict_to_array(chapters)

    output = {
        "intro": intro_text,
        "content": chapters,
        "links": links
    }

    return output