Python JaggedArray示例，sefaria.datatype.jagged_array.JaggedArray Python示例

示例#1

0

显示文件

    def parse_unlinked(self):
        """
        Collect every phrase of every chapter into a three-level JaggedArray.

        A phrase is stored at [chapter - 1, subchap - 1, n], where n is the number of
        phrases already placed for the same (chapter, subchap) pair. For each phrase a
        record describing its position is appended to the module-level
        unlinkedCommentStore.

        :return: The value of .array() on the populated JaggedArray.
        :raises AttributeError: If a phrase has no subchap value.
        """
        result = JaggedArray([[[]]])
        # (chapter, subchap) -> how many comments were already stored for that pair
        seen = Counter()

        for chapter in self.get_chapter():
            chap_num = chapter.num
            for phrase in chapter.get_phrase():
                phrase_num = phrase.subchap
                if phrase_num is None:
                    message = u'Unlabeled phrase in {} chapter {}'.format(
                        self.get_author(), chap_num)
                    raise AttributeError(message)

                key = (chap_num, phrase_num)
                position = seen[key]
                address = [int(chap_num) - 1, int(phrase_num) - 1, position]
                result.set_element(address, phrase.as_string())
                seen[key] += 1

                # 'order' is the 1-based position, kept as a string
                record = {
                    'commentator': commentatorNames[self.get_author()],
                    'chapter': chap_num,
                    'verse': phrase_num,
                    'order': str(position + 1),
                }
                unlinkedCommentStore.append(record)
        return result.array()

示例#2

0

显示文件

def parse_en(filename):
    with codecs.open(filename, 'r', 'utf-8') as fp:
        lines = fp.readlines()
    ja = JaggedArray([[[[]]]])
    placing = u'(\s*[0-9]{1,2}),([0-9]{1,2})-?[0-9]*\.'  # the regex to find the indexing on Monk
    # q1, q2 = ur'“', ur'”' # Rabbi Monk uses these to enclose translation of a pasuk
    # dh_reg = ur'([\u05d0 - \u05ea]*), *({}.*?{})'.format(q1, q2)
    replace_dict = {placing: u'', u'@': ''}
    temp = []
    indices = [0] * 3
    for line in lines:
        pasuk_dh = re.match(placing, line)
        reg_dh = re.search(
            ur'@([\u05d0-\u05ea|\\s]*)',
            line)  #  reg_dh = re.search(ur'([\u05d0-\u05ea]+, *“.*?”)',line)
        line = multiple_replace(line, replace_dict, using_regex=True)
        if pasuk_dh or reg_dh:
            temp = ' '.join(temp)
            ja.set_element(indices, temp, [])
            temp = []
            if pasuk_dh:
                indices = [
                    int(pasuk_dh.group(1)) - 1,
                    int(pasuk_dh.group(2)) - 1, indices[2]
                ]
                indices[2] = 0
            elif reg_dh:
                indices[2] += 1
        if not line.isspace() and not re.match(
                ur' *Parshat *(\S+) *(\S+)? *',
                line):  # don't put into array names of Parasha or empty lines
            temp.append(line)

示例#3

0

显示文件

def parse():
    with codecs.open('pardes_rimonim.html', 'r', 'windows-1255') as infile:
        lines = infile.readlines()
    gate, chapter, whole_text = -1, -1, []
    root = JaggedArray([[]])
    found_beginning = False
    beginning = re.compile(ur'^<b>\u05e9\u05e2\u05e8 ([\u05d0-\u05ea]{1,2}) \u05e4\u05e8\u05e7 ([\u05d0-\u05ea]{1,2})')

    for line in lines:
        match = beginning.search(line)
        if match:
            if found_beginning:
                if re.search(ur'^\u05e4\u05e8\u05e7', whole_text[0]):  # strip out some unnecessary text
                    root.set_element([gate, chapter], whole_text[1:], pad=[])
                else:
                    root.set_element([gate, chapter], whole_text, pad=[])
                whole_text = []
            else:
                found_beginning = True
            new_gate, new_chapter = getGematria(match.group(1))-1, getGematria(match.group(2))-1
            if new_gate - gate > 1 or new_chapter - chapter > 1:
                print 'skip found at Gate {} Chapter {}'.format(new_gate+1, new_chapter+1)
            gate, chapter = new_gate, new_chapter

        elif found_beginning:
            if re.search(ur'<img', line):
                whole_text[-1] = add_image(line, whole_text[-1])
                continue

示例#4

0

显示文件

 def parse_linked(self):
     """
     Place each phrase's comment text at the position recorded for it in commentStore.

     The 1-based 'chapter', 'verse' and 'order' values stored under the phrase id are
     turned into 0-based indices; newlines are removed from the comment text.

     :return: The value of .array() on the populated JaggedArray.
     """
     result = JaggedArray([[[]]])
     for phrase in self.get_phrase():
         entry = commentStore[phrase.id]
         chapter, verse, order = entry['chapter'], entry['verse'], entry['order']
         comment_text = phrase.get_comment().valueOf_.replace(u'\n', u'')
         result.set_element([chapter - 1, verse - 1, order - 1], comment_text)
     return result.array()

示例#5

0

显示文件

文件： bekhor_shor_parser.py 项目： JonMosenkis/Sefaria-Data

def file_to_ja_g(depth, infile, expressions, cleaner, grab_all=False):
    """
    Designed to be the first stage of a reusable parsing tool. Adds lines of text to the
    JaggedArray in the desired structure (Chapter, verse, etc.).
    This function is a variation of the original file_to_ja that deals with gematria
    letters, so that chapters and segments are placed according to their Hebrew letter
    numbering, with padding added where needed (_g stands for Gematria).

    Each expression must define a named group 'gim'. When getGematria returns a non-zero
    value for that group, the section index is set to that value minus one; otherwise the
    index is simply advanced by one.

    :param depth: depth of the JaggedArray.
    :param infile: Text file (or any iterable of lines) to read from.
    :param expressions: A list of regular expressions with which to identify section
    (chapter) levels. Do not include an expression with which to break up the segment
    level.
    :param cleaner: A function that takes a list of strings and returns an array with the
    text parsed correctly. Should also break up and remove unnecessary tagging data.
    :param grab_all: If set to true, will grab the lines indicating new sections. The line
    is only kept when a complete address was already established before it.
    :return: A JaggedArray with the text properly structured.
    :raises AttributeError: if len(expressions) is not depth - 1.
    """
    ja = JaggedArray([])

    # ensure there is a regex for every level except the lowest
    if depth - len(expressions) != 1:
        raise AttributeError('Not enough data to parse. Need {} expressions, '
                             'received {}'.format(depth-1, len(expressions)))

    # compile regexes, instantiate index list (-1 marks a level not reached yet)
    regexes = [re.compile(ex) for ex in expressions]
    indices = [-1] * len(expressions)
    temp = []

    for line in infile:

        # check for matches to the regexes, highest level first
        for i, reg in enumerate(regexes):
            found = reg.search(line)
            if found:
                # only flush once every level has been reached at least once
                if -1 not in indices:
                    ja.set_element(indices, cleaner(temp), [])
                    temp = []

                    if grab_all:
                        temp.append(line)

                gimt = getGematria(found.group('gim'))
                if gimt != 0:
                    indices[i] = gimt - 1
                else:
                    indices[i] += 1
                # every level below the one that matched starts over
                indices[i+1:] = [-1 if x >= 0 else x for x in indices[i+1:]]
                break

        else:
            # ordinary text line: keep it only when we know where it belongs
            if -1 not in indices:
                temp.append(line)

    # Flush the last section. Skip it when the address is incomplete (no section markers
    # were found, or the input ended right after a higher-level marker): a -1 index
    # would otherwise be handed to set_element as a negative list position.
    if -1 not in indices:
        ja.set_element(indices, cleaner(temp), [])

    return ja

示例#6

0

显示文件

 def parse_linked(self):
     parsed = JaggedArray([[[]]])
     for phrase in self.get_phrase():
         indices = (commentStore[phrase.id]['chapter'],
                    commentStore[phrase.id]['verse'],
                    commentStore[phrase.id]['order'])
         text = phrase.get_comment().valueOf_.replace(u'\n', u' ')
         text = re.sub(u' +', u' ', text)
         text = re.sub(ur' (:|\.)', ur'\1', text)
         parsed.set_element([i - 1 for i in indices], text)
     return parsed.array()

示例#7

0

显示文件

def parse(filename):
    """
    Parse the <ftnote> elements of an XML file into a chapter/verse JaggedArray.

    Footnotes whose id is not present in the comment store built by
    populate_comment_store are skipped. The children of each remaining footnote are
    concatenated as unicode, passed through structure_comments and stored at
    [chapter - 1, verse - 1].

    :param filename: Path of the XML file.
    :return: The value of .array() on the populated JaggedArray.
    """
    comment_store = populate_comment_store(filename)
    parsed = JaggedArray([[]])

    with open(filename) as infile:
        soup = BeautifulSoup(infile, 'xml')

    for footnote in soup.find_all('ftnote'):
        loc = comment_store.get(footnote.attrs['id'])
        if loc is not None:
            value = u''.join(unicode(child) for child in footnote.children)
            address = [loc['chapter']-1, loc['verse']-1]
            parsed.set_element(address, structure_comments(value), pad=[])
    return parsed.array()

示例#8

0

显示文件

文件： rambam_talmudic.py 项目： JonMosenkis/Sefaria-Data

def parse_file(filename):
    """
    Feed every line of a UTF-8 file to a RambamSegment and gather the result.

    Lines recognised by is_quote are added as raw quotes. Otherwise, lines recognised by
    is_text are added as text and the segment is then added to the JaggedArray. Any
    other line is ignored.

    :param filename: Path of the file to parse.
    :return: dict with 'parsed text' (the JaggedArray's array) and 'links'
    (the result of segment.extract_links()).
    """
    with codecs.open(filename, 'r', 'utf-8') as infile:
        lines = infile.readlines()

    jagged_array = JaggedArray([[[]]])
    segment = RambamSegment()

    for line in lines:
        if segment.is_quote(line):
            segment.add_raw_quote(line)
            continue
        if segment.is_text(line):
            segment.add_text(line)
            segment.add_segment(jagged_array)

    parsed_text = jagged_array.array()
    links = segment.extract_links()
    return {'parsed text': parsed_text, 'links': links}

示例#9

0

显示文件

def parse(filename):
    """
    Parse a file in which reference lines and text lines alternate.

    The 1st, 3rd, 5th... lines are converted to indices with ref_to_indices; each
    following line is stored at those indices after its first space-separated word
    has been dropped.

    :param filename: Path of the UTF-8 file to parse.
    :return: The value of .array() on the populated JaggedArray.
    """
    with codecs.open(filename, 'r', 'utf-8') as infile:
        lines = infile.readlines()

    targum_ja = JaggedArray([[[]]])
    indices = None

    for line_num, line in enumerate(lines):
        if line_num % 2 == 0:
            # reference line: remember where the next line belongs
            indices = ref_to_indices(line)
            continue
        words = line.split(u' ')
        targum_ja.set_element(indices, u' '.join(words[1:]))
    return targum_ja.array()

示例#10

0

显示文件

def parse():
    with codecs.open("hebrew_or_neerav.html", "r", "windows-1255") as infile:
        lines = infile.readlines()
    gate, chapter, whole_text = -1, -1, []
    root = JaggedArray([[]])
    found_beginning = False
    next_line_subject = False
    subjects = []
    main_pattern = re.compile(ur"^<b>חלק ([\u05d0-\u05ea]{1,2}) פרק ([\u05d0-\u05ea]{1,2})")

    for index, line in enumerate(lines):
        line = line.replace("(", "(<sub>")
        line = line.replace(")", "</sub>)")
        if next_line_subject == True:
            subjects.append(line)
            next_line_subject = False
            continue
        if line.find(u"חלק שביעי חלק הכינויים א") >= 0:
            return dealWithEnd(lines[index + 1], lines[index + 2 :], root, subjects)
        main_match = main_pattern.search(line)
        if main_match:
            if found_beginning:
                root.set_element([gate, chapter], whole_text, pad=[])
                whole_text = []
            else:
                found_beginning = True
            new_gate, new_chapter = getGematria(main_match.group(1)) - 1, getGematria(main_match.group(2)) - 1
            if new_gate - gate > 1 or new_chapter - chapter > 1:
                print "skip found at Gate {} Chapter {}".format(new_gate + 1, new_chapter + 1)
            gate, chapter = new_gate, new_chapter
        elif found_beginning:
            if len(line.split(" ")) == 2 and line.find(u"חלק") >= 0:
                next_line_subject = True
                continue
            if len(line.split(" ")) == 2 and line.find(u"פרק") >= 0:
                continue
            line = bleach.clean(line, tags=[], strip=True)
            if line.isspace():
                continue
            line = re.sub(u"(\n|\r)", u"", line)
            whole_text.append(line)
        else:
            continue
    else:
        root.set_element([gate, chapter], whole_text)

示例#11

0

显示文件

def hagahot_alignment(ja_smk, ja_raph, ja_hagahot):
    ja_smk = JaggedArray(ja_smk)
    ja_raph = JaggedArray(ja_raph)
    ja_hagahot = JaggedArray(ja_hagahot)
    # for i, seg_smk, j, seg_raph in zip(enumerate(ja_smk.array()), enumerate(ja_raph.array())):
    dict_lst = []
    dict = {u'siman': [], u'smk': [], u'raph': []}
    for i, seg in enumerate(zip(ja_smk.array(), ja_raph.array())):
        # print numToHeb(i+1)
        dict['siman'] = numToHeb(i + 1)
        for i, smk_line in enumerate(seg[0]):
            hag_lett = re.findall(ur'@88\((?P<gim>[\u05d0-\u05ea]{1,3})\)',
                                  smk_line)
            if hag_lett:
                dict['smk'].extend([(hag_l, i + 1) for hag_l in hag_lett])
                # print [getGematria(lett) for lett in hag_lett]
        # print 'RAPH'
        for i, raph_line in enumerate(seg[1]):
            hag_lett = re.findall(ur'@88\((?P<gim>[\u05d0-\u05ea]{1,3})\)',
                                  raph_line)
            if hag_lett:
                dict['raph'].extend([(hag_l, i + 1) for hag_l in hag_lett])
                # print [getGematria(lett) for lett in hag_lett]
        dict_lst.append(dict)
        dict = {u'siman': [], u'smk': [], u'raph': []}
    return dict_lst

示例#12

0

显示文件

def hagahot_parse(ja_hagahot, hagahot_dict_lst):
    """
    Split the flattened hagahot into one slice per siman and write them out as XML.

    Each siman takes as many consecutive entries as it has 'smk' plus 'raph' markers.
    Before a siman's slice is taken, one entry is skipped when it starts with '@'
    followed by any character other than '1'. Whatever is left after the last siman is
    appended as a final slice.

    :param ja_hagahot: Data passed to JaggedArray and then flattened.
    :param hagahot_dict_lst: Per-siman dicts with 'smk' and 'raph' lists.
    :return: List of slices; also written to 'haghot_by_smk_simanim.xml' via ja_to_xml.
    """
    flat = JaggedArray(ja_hagahot).flatten_to_array()

    grouped = []
    start = 0
    for siman in hagahot_dict_lst:
        if re.search(u"^@[^1]", flat[start]):
            start += 1
        stop = start + len(siman['smk']) + len(siman['raph'])
        grouped.append(flat[start:stop])
        start = stop
    grouped.append(flat[start:])

    ja_to_xml(grouped, ['siman', 'letter'], 'haghot_by_smk_simanim.xml')
    return grouped

示例#13

0

显示文件

文件： semak_parser.py 项目： JonMosenkis/Sefaria-Data

def hagahot_parse(ja_hagahot, hagahot_dict_lst):
    """
    Regroup the flattened hagahot by siman and dump the grouping to XML.

    For every siman dict the number of entries consumed equals
    len(dict['smk']) + len(dict['raph']). An entry beginning with '@' and a character
    other than '1' at the start of a siman is skipped first. The entries remaining after
    the last siman form one extra trailing group.

    :param ja_hagahot: Data passed to JaggedArray and then flattened.
    :param hagahot_dict_lst: Per-siman dicts with 'smk' and 'raph' lists.
    :return: List of groups; also written to 'haghot_by_smk_simanim.xml' via ja_to_xml.
    """
    entries = JaggedArray(ja_hagahot).flatten_to_array()

    by_siman = []
    cursor = 0
    for siman_info in hagahot_dict_lst:
        if re.search(u"^@[^1]", entries[cursor]):
            cursor += 1
        wanted = len(siman_info['smk']) + len(siman_info['raph'])
        by_siman.append(entries[cursor:cursor + wanted])
        cursor += wanted
    by_siman.append(entries[cursor:])

    ja_to_xml(by_siman, ['siman', 'letter'], 'haghot_by_smk_simanim.xml')
    return by_siman

示例#14

0

显示文件

文件： restructure_derech_chaim.py 项目： JonMosenkis/Sefaria-Data

def restructure_text():
    """
    Regroup the chapters of 'Derech Chaim text.json' into chapter/mishnah/comment depth.

    Inside a chapter, a line consisting only of "משנה <letters>" is a marker and is not
    stored. A marker moves the current mishnah forward (and restarts the comment count)
    only when its number is greater than the current one. Every other line is stored as
    the next comment of the current mishnah.

    :return: The value of .array() on the populated JaggedArray.
    """
    with open('Derech Chaim text.json') as infile:
        version = json.load(infile)
    chapters = version['text'][u'']

    mishnah_marker = re.compile(u'^\u05de\u05e9\u05e0\u05d4 ([\u05d0-\u05ea]{1,2})$')
    parsed = JaggedArray([[[]]])

    for chap_index, chapter in enumerate(chapters):
        mishnah = comment = 0
        for line in chapter:
            marker = mishnah_marker.search(line)
            if marker is not None:
                candidate = getGematria(marker.group(1)) - 1
                # only move forward; this lets intro text sit before the first marker
                if candidate > mishnah:
                    mishnah, comment = candidate, 0
                continue
            parsed.set_element([chap_index, mishnah, comment], line, pad=[])
            comment += 1
    return parsed.array()

示例#15

0

显示文件

def parse():
    with codecs.open('pardes_rimonim.html', 'r', 'windows-1255') as infile:
        lines = infile.readlines()
    gate, chapter, whole_text = -1, -1, []
    root = JaggedArray([[]])
    found_beginning = False
    beginning = re.compile(
        ur'^<b>\u05e9\u05e2\u05e8 ([\u05d0-\u05ea]{1,2}) \u05e4\u05e8\u05e7 ([\u05d0-\u05ea]{1,2})'
    )

    for line in lines:
        match = beginning.search(line)
        if match:
            if found_beginning:
                if re.search(ur'^\u05e4\u05e8\u05e7',
                             whole_text[0]):  # strip out some unnecessary text
                    root.set_element([gate, chapter], whole_text[1:], pad=[])
                else:
                    root.set_element([gate, chapter], whole_text, pad=[])
                whole_text = []
            else:
                found_beginning = True
            new_gate, new_chapter = getGematria(
                match.group(1)) - 1, getGematria(match.group(2)) - 1
            if new_gate - gate > 1 or new_chapter - chapter > 1:
                print 'skip found at Gate {} Chapter {}'.format(
                    new_gate + 1, new_chapter + 1)
            gate, chapter = new_gate, new_chapter

        elif found_beginning:
            if re.search(ur'<img', line):
                whole_text[-1] = add_image(line, whole_text[-1])
                continue

示例#16

0

显示文件

def file_to_ja_g(depth, infile, expressions, cleaner, grab_all=False):
    """
    Designed to be the first stage of a reusable parsing tool. Adds lines of text to the
    JaggedArray in the desired structure (Chapter, verse, etc.).
    This function is a variation of the original file_to_ja that deals with gematria
    letters, so that chapters and segments are placed according to their Hebrew letter
    numbering, with padding added where needed (_g stands for Gematria).

    Each expression must define a named group 'gim'. When getGematria returns a non-zero
    value for that group, the section index is set to that value minus one; otherwise the
    index is simply advanced by one.

    :param depth: depth of the JaggedArray.
    :param infile: Text file (or any iterable of lines) to read from.
    :param expressions: A list of regular expressions with which to identify section
    (chapter) levels. Do not include an expression with which to break up the segment
    level.
    :param cleaner: A function that takes a list of strings and returns an array with the
    text parsed correctly. Should also break up and remove unnecessary tagging data.
    :param grab_all: If set to true, will grab the lines indicating new sections. The line
    is only kept when a complete address was already established before it.
    :return: A JaggedArray with the text properly structured.
    :raises AttributeError: if len(expressions) is not depth - 1.
    """
    ja = JaggedArray([])

    # ensure there is a regex for every level except the lowest
    if depth - len(expressions) != 1:
        raise AttributeError('Not enough data to parse. Need {} expressions, '
                             'received {}'.format(depth-1, len(expressions)))

    # compile regexes, instantiate index list (-1 marks a level not reached yet)
    regexes = [re.compile(ex) for ex in expressions]
    indices = [-1] * len(expressions)
    temp = []

    for line in infile:

        # check for matches to the regexes, highest level first
        for i, reg in enumerate(regexes):
            found = reg.search(line)
            if found:
                # only flush once every level has been reached at least once
                if -1 not in indices:
                    ja.set_element(indices, cleaner(temp), [])
                    temp = []

                    if grab_all:
                        temp.append(line)

                gimt = getGematria(found.group('gim'))
                if gimt != 0:
                    indices[i] = gimt - 1
                else:
                    indices[i] += 1
                # every level below the one that matched starts over
                indices[i+1:] = [-1 if x >= 0 else x for x in indices[i+1:]]
                break

        else:
            # ordinary text line: keep it only when we know where it belongs
            if -1 not in indices:
                temp.append(line)

    # Flush the last section. Skip it when the address is incomplete (no section markers
    # were found, or the input ended right after a higher-level marker): a -1 index
    # would otherwise be handed to set_element as a negative list position.
    if -1 not in indices:
        ja.set_element(indices, cleaner(temp), [])

    return ja

示例#17

0

显示文件

文件： DCXMLsubs.py 项目： JonMosenkis/Sefaria-Data

    def parse_unlinked(self):
        """
        Gather all phrases of all chapters into a chapter/phrase/comment JaggedArray.

        The n-th phrase (0-based) seen for a given (chapter, subchap) pair is stored at
        [chapter - 1, subchap - 1, n]. Every phrase also adds one record to the
        module-level unlinkedCommentStore.

        :return: The value of .array() on the populated JaggedArray.
        :raises AttributeError: If a phrase has no subchap value.
        """
        collected = JaggedArray([[[]]])
        # number of comments already stored per (chapter, subchap)
        placed = Counter()

        for chapter in self.get_chapter():
            chap_num = chapter.num
            for phrase in chapter.get_phrase():
                phrase_num = phrase.subchap
                if phrase_num is None:
                    raise AttributeError(
                        u'Unlabeled phrase in {} chapter {}'.format(self.get_author(), chap_num)
                    )

                slot = (chap_num, phrase_num)
                comment_index = placed[slot]
                collected.set_element(
                    [int(chap_num)-1, int(phrase_num)-1, comment_index],
                    phrase.as_string(),
                )
                placed[slot] = comment_index + 1

                # 'order' is 1-based and stored as a string
                unlinkedCommentStore.append(dict([
                    ('commentator', commentatorNames[self.get_author()]),
                    ('chapter', chap_num),
                    ('verse', phrase_num),
                    ('order', str(comment_index+1)),
                ]))
        return collected.array()

示例#18

0

显示文件

文件： restructure_derech_chaim.py 项目： maxrabin/Sefaria-Data

def restructure_text():
    """
    Rebuild 'Derech Chaim text.json' as a chapter/mishnah/comment structure.

    Lines that consist solely of "משנה <letters>" act as markers and are dropped. A
    marker advances the current mishnah, resetting the comment counter, only if its
    number is larger than the current mishnah. All other lines become consecutive
    comments of the current mishnah.

    :return: The value of .array() on the populated JaggedArray.
    """
    with open('Derech Chaim text.json') as infile:
        version = json.load(infile)
    my_text = version['text'][u'']

    pattern = re.compile(u'^\u05de\u05e9\u05e0\u05d4 ([\u05d0-\u05ea]{1,2})$')
    parsed = JaggedArray([[[]]])

    for chap_index, chapter in enumerate(my_text):
        position = [chap_index, 0, 0]  # [chapter, mishnah, comment]
        for line in chapter:
            found = pattern.search(line)
            if found is None:
                # regular comment; pass a copy because position keeps changing
                parsed.set_element(list(position), line, pad=[])
                position[2] += 1
                continue
            m_value = getGematria(found.group(1)) - 1
            # forward-only, so intro text may precede the first mishnah mark
            if m_value > position[1]:
                position[1] = m_value
                position[2] = 0
    return parsed.array()

示例#19

0

显示文件

文件： semak_parser.py 项目： JonMosenkis/Sefaria-Data

def hagahot_alignment(ja_smk, ja_raph, ja_hagahot):
    ja_smk = JaggedArray(ja_smk)
    ja_raph = JaggedArray(ja_raph)
    ja_hagahot = JaggedArray(ja_hagahot)
    # for i, seg_smk, j, seg_raph in zip(enumerate(ja_smk.array()), enumerate(ja_raph.array())):
    dict_lst = []
    dict = {u'siman':[], u'smk':[], u'raph':[]}
    for i, seg in enumerate(zip(ja_smk.array(), ja_raph.array())):
        # print numToHeb(i+1)
        dict['siman'] = numToHeb(i+1)
        for i, smk_line in enumerate(seg[0]):
            hag_lett = re.findall(ur'@88\((?P<gim>[\u05d0-\u05ea]{1,3})\)', smk_line)
            if hag_lett:
                dict['smk'].extend([(hag_l, i+1) for hag_l in hag_lett])
                # print [getGematria(lett) for lett in hag_lett]
        # print 'RAPH'
        for i, raph_line in enumerate(seg[1]):
            hag_lett = re.findall(ur'@88\((?P<gim>[\u05d0-\u05ea]{1,3})\)', raph_line)
            if hag_lett:
                dict['raph'].extend([(hag_l, i+1) for hag_l in hag_lett])
                # print [getGematria(lett) for lett in hag_lett]
        dict_lst.append(dict)
        dict = {u'siman': [], u'smk': [], u'raph': []}
    return dict_lst

示例#20

0

显示文件

def parser(name):
    with codecs.open('{}.txt'.format(name), 'r', 'utf-8') as infile:
        lines = infile.readlines()
    parsed_text = JaggedArray([[[]]])
    links = []
    chapter, mishnah, comment = -1, -1, -1
    for line in lines:
        if re.match(ur'@00\u05e4\u05e8\u05e7', line) is not None:
            chapter += 1
            comment = -1
            continue

        elif re.match(ur'@22', line) is not None:
            mishnah = getGematria(
                re.match(ur'@22([\u05d0-\u05ea]{1,2})', line).group(1)) - 1
            comment = -1
            continue

示例#21

0

显示文件

def parse():
    with codecs.open('hebrew_or_neerav.html', 'r', 'windows-1255') as infile:
        lines = infile.readlines()
    gate, chapter, whole_text = -1, -1, []
    root = JaggedArray([[]])
    found_beginning = False
    next_line_subject = False
    subjects = []
    main_pattern = re.compile(ur'^<b>חלק ([\u05d0-\u05ea]{1,2}) פרק ([\u05d0-\u05ea]{1,2})')

    for index, line in enumerate(lines):
        line = line.replace("(", "(<sub>")
        line = line.replace(")", "</sub>)")
        if next_line_subject == True:
            subjects.append(line)
            next_line_subject = False
            continue
        if line.find(u"חלק שביעי חלק הכינויים א") >= 0:
            return dealWithEnd(lines[index+1], lines[index+2:], root, subjects)
        main_match = main_pattern.search(line)
        if main_match:
            if found_beginning:
                root.set_element([gate, chapter], whole_text, pad=[])
                whole_text = []
            else:
                found_beginning = True
            new_gate, new_chapter = getGematria(main_match.group(1))-1, getGematria(main_match.group(2))-1
            if new_gate - gate > 1 or new_chapter - chapter > 1:
                print 'skip found at Gate {} Chapter {}'.format(new_gate+1, new_chapter+1)
            gate, chapter = new_gate, new_chapter
        elif found_beginning:
            if len(line.split(" ")) == 2 and line.find(u"חלק") >= 0:
                next_line_subject = True
                continue
            if len(line.split(" ")) == 2 and line.find(u"פרק") >= 0:
                continue
            line = bleach.clean(line, tags=[], strip=True)
            if line.isspace():
                continue
            line = re.sub(u'(\n|\r)', u'', line)
            whole_text.append(line)
        else:
            continue
    else:
        root.set_element([gate, chapter], whole_text)

示例#22

0

显示文件

def parse_shokets():
    with open('chesed_le-avraham.htm') as infile:
        soup = BeautifulSoup(infile, 'html.parser')
    raw_shokets = soup.find('div', class_='shokets').text.splitlines()
    raw_shokets = filter(lambda x: x if len(x) > 0 else None, raw_shokets)

    pattern = ur'(\u05d4\u05e9\u05d5?\u05e7\u05ea [\u05d0-\u05ea]{1,2})( - (.*))?:'
    parsed = JaggedArray([[]])
    shoket, paragraph = -1, -1

    for line in raw_shokets:
        new_section = re.search(pattern, line)
        if new_section is None:
            if shoket >= 0:
                paragraph += 1
                parsed.set_element([shoket, paragraph], line)
        else:
            shoket += 1
            paragraph = -1
            if new_section.group(3) is not None:
                paragraph += 1
                parsed.set_element([shoket, paragraph], u'<b>{}</b>'.format(new_section.group(3)))

    return parsed.array()

示例#23

0

显示文件

文件： parse.py 项目： JonMosenkis/Sefaria-Data

def jaggedarray_from_files(input_file, footnote_file):
    """
    Parse the main text and its footnotes into a chapter/verse JaggedArray and build links.

    Lines of the main file are dispatched on their leading tag:
      '@22' - starts a new comment group; the line must contain "<chapter>, <verse>" in
              Hebrew letters, which is turned into a Ref on "בראשית".
      '@88' - appended to the last line of the current group as an inline footnote.
      '@11' / '@33' - added to the current group as a new line.
      '@00' - closes the current batch: every collected group is written to the
              JaggedArray and a (bereshit_ref, philo_ref) link is recorded for it.

    Side effects: rebinds the module-level names footnotes (an iterator over the cleaned
    footnote lines), footnotes_parasha and link_refs.

    NOTE(review): groups are only written when an '@00' line is met, so anything after
    the last '@00' line is not stored - confirm that the input always ends with one.

    :param input_file: Main text file to parse
    :param footnote_file: Footnote text file to parse
    :return: A tuple (array, links): the nested list held by the JaggedArray and a list
    of (bereshit_ref, philo_ref) pairs.
    """
    global footnotes
    global footnotes_parasha
    global link_refs

    ja = JaggedArray([[]])
    link_refs = []
    current = []
    list_of_currents = []
    footnotes = []
    footnotes_parasha = {}
    links = []

    # 'with' guarantees the handles are closed even if parsing raises
    with codecs.open(footnote_file, 'r', 'utf-8') as text:
        for line in text:
            footnotes.append(cleanup(line))
    footnotes = iter(footnotes)

    with codecs.open(input_file, 'r', 'utf-8') as main_text:
        for line in main_text:
            if line.startswith('@22'):
                if current:
                    list_of_currents.append(current)
                    current = []
                m = re.search(u'([\u05d0-\u05ea]{1,2}-?[\u05d0-\u05ea]{0,2}), ([\u05d0-\u05ea]{1,2}-?[\u05d0-\u05ea]{0,2})', line)
                # if with semicolon, choose first pasuk ignore second
                location = Ref(u"".join([u"בראשית ", m.group(1), u": ", m.group(2)]))
                link_refs.append(location)
                current.append(footnotify(u''.join([u"<strong>", cleanup(line), u"</strong>"])))

            elif line.startswith('@88'):
                current[-1] += u''.join([u"<sup>*</sup><i class='footnote'>", cleanup(line), u"</i>", "<br>___________<br>"])

            elif line.startswith('@11') or line.startswith('@33'):
                current.append(cleanup(footnotify(line)))

            elif line.startswith('@00'):
                if current:
                    list_of_currents.append(current)
                    current = []

                if list_of_currents:
                    # enumerate gives each group its own position; looking the group up
                    # by value would return the first equal group instead
                    for i, x in enumerate(list_of_currents):
                        ref = link_refs[i]
                        location = [ref.sections[0] - 1, ref.sections[1] - 1]
                        bereshit_ref = ref.normal()

                        # the first group of a batch has no predecessor; without the
                        # i > 0 test, index -1 would compare it with the last ref
                        same_verse_as_previous = (
                            i > 0
                            and ref.sections[0] == link_refs[i - 1].sections[0]
                            and ref.sections[1] == link_refs[i - 1].sections[1])

                        if same_verse_as_previous:
                            # append to what is already stored for this verse; the link
                            # range continues from the existing number of comments
                            existing = ja.get_element([location[0], location[1]])
                            first_new = len(existing) + 1
                            last_new = len(existing) + len(x)
                            philo_ref = "".join(["The Midrash of Philo ", str(location[0] + 1), ":", str(location[1] + 1), ":", str(first_new), "-", str(last_new)])
                            links.append((bereshit_ref, philo_ref))
                            existing.extend(repeat_footnotify(x))
                        else:
                            philo_ref = "".join(["The Midrash of Philo ", str(location[0] + 1), ":", str(location[1] + 1), ":1-", str(len(x))])
                            links.append((bereshit_ref, philo_ref))
                            ja.set_element([location[0], location[1]], repeat_footnotify(x), pad = [])

                    # start the next batch from a clean slate
                    footnotes_parasha.clear()
                    current = []
                    link_refs = []
                    list_of_currents = []

    return ja.array(), links