def parse():
    """Parse pardes_rimonim.html (windows-1255) into a gate/chapter JaggedArray.

    Scans line by line for gate/chapter headers, accumulating each chapter's
    text and storing it under 0-based [gate, chapter] in ``root``.
    NOTE(review): the visible block ends at the ``<img`` branch's ``continue``;
    the handling of ordinary (non-header, non-image) text lines appears to be
    truncated here — confirm against the full file.
    """
    with codecs.open('pardes_rimonim.html', 'r', 'windows-1255') as infile:
        lines = infile.readlines()
    gate, chapter, whole_text = -1, -1, []
    root = JaggedArray([[]])
    found_beginning = False
    # Header format: "<b>שער <gate-numeral> פרק <chapter-numeral>"
    beginning = re.compile(
        ur'^<b>\u05e9\u05e2\u05e8 ([\u05d0-\u05ea]{1,2}) \u05e4\u05e8\u05e7 ([\u05d0-\u05ea]{1,2})'
    )
    for line in lines:
        match = beginning.search(line)
        if match:
            if found_beginning:
                # Flush the chapter collected so far before starting a new one.
                if re.search(ur'^\u05e4\u05e8\u05e7', whole_text[0]):
                    # strip out some unnecessary text
                    root.set_element([gate, chapter], whole_text[1:], pad=[])
                else:
                    root.set_element([gate, chapter], whole_text, pad=[])
                whole_text = []
            else:
                found_beginning = True
            # Headers are 1-based Hebrew numerals; internal indices are 0-based.
            new_gate, new_chapter = getGematria(
                match.group(1)) - 1, getGematria(match.group(2)) - 1
            if new_gate - gate > 1 or new_chapter - chapter > 1:
                # A jump of more than one signals a skipped section in the source.
                print 'skip found at Gate {} Chapter {}'.format(
                    new_gate + 1, new_chapter + 1)
            gate, chapter = new_gate, new_chapter
        elif found_beginning:
            if re.search(ur'<img', line):
                # Fold the image markup into the previous text segment.
                whole_text[-1] = add_image(line, whole_text[-1])
                continue
def link_semak_raph(smk_ja, raph_ja):
    """Report how Semak @55 letter markers align with Raph segments.

    Collects every @55 letter (with its indices) from ``smk_ja``, collects the
    first segment of each Raph letter from ``raph_ja``, then zips the two lists
    and prints matches / mismatches plus a final problem count.
    """
    # if segment in smak_ja has a @55[\u05d0-\u05ea]{0,3} extract the letter
    # and match it to the segment in the ja_raph
    # by running on the ja_raph segments
    smk_raph = []
    raph_letter = []
    for seg in traverse_ja(smk_ja):
        if re.search(u'@55[\u05d0-\u05ea]{0,3}', seg['data']):
            for letter in re.findall(u'@55([\u05d0-\u05ea]{0,3})', seg['data']):
                # smk_raph.append([seg['indices'][:], letter])
                smk_raph.append([letter, seg['indices']])
    # keep only the first segment of each (siman, letter) pair in the Raph
    last = [-1, -1]
    for seg in traverse_ja(raph_ja):
        if seg['indices'][0:2] == last[0:2]:
            continue
        else:
            raph_letter.append(seg)
            last = seg['indices']
    problem_count = 0
    for smk, raph in zip(smk_raph, raph_letter):
        # the Semak letter value should equal the 1-based Raph letter position
        if getGematria(smk[0]) == (raph['indices'][1]+1):
            print getGematria(smk[0]), raph['indices'][1]+1, \
                [item+1 for item in smk[1]], [item +1 for item in raph['indices']]
        else:
            problem_count +=1
            print 'problem:', getGematria(smk[0]), raph['indices'][1]+1,\
                [item+1 for item in smk[1]], [item +1 for item in raph['indices']]
    print problem_count
def parse():
    """Parse pardes_rimonim.html into a JaggedArray indexed [gate][chapter].

    Header lines start a new chapter; accumulated text is flushed into ``root``
    when the next header appears. Prints a warning when the header numbering
    jumps. NOTE(review): block ends at the image branch's ``continue`` — the
    plain-text accumulation path seems to be cut off in this view; verify.
    """
    with codecs.open('pardes_rimonim.html', 'r', 'windows-1255') as infile:
        lines = infile.readlines()
    gate, chapter, whole_text = -1, -1, []
    root = JaggedArray([[]])
    found_beginning = False
    # "<b>שער X פרק Y" — groups capture the two Hebrew numerals
    beginning = re.compile(ur'^<b>\u05e9\u05e2\u05e8 ([\u05d0-\u05ea]{1,2}) \u05e4\u05e8\u05e7 ([\u05d0-\u05ea]{1,2})')
    for line in lines:
        match = beginning.search(line)
        if match:
            if found_beginning:
                # flush the finished chapter
                if re.search(ur'^\u05e4\u05e8\u05e7', whole_text[0]):
                    # strip out some unnecessary text
                    root.set_element([gate, chapter], whole_text[1:], pad=[])
                else:
                    root.set_element([gate, chapter], whole_text, pad=[])
                whole_text = []
            else:
                found_beginning = True
            # convert 1-based numerals to 0-based indices
            new_gate, new_chapter = getGematria(match.group(1))-1, getGematria(match.group(2))-1
            if new_gate - gate > 1 or new_chapter - chapter > 1:
                print 'skip found at Gate {} Chapter {}'.format(new_gate+1, new_chapter+1)
            gate, chapter = new_gate, new_chapter
        elif found_beginning:
            if re.search(ur'<img', line):
                # attach image markup to the previous segment
                whole_text[-1] = add_image(line, whole_text[-1])
                continue
def scrape_wiki(): url = u"https://he.wikipedia.org/wiki/%D7%9E%D7%A0%D7%99%D7%99%D7%9F_%D7%94%D7%9E%D7%A6%D7%95%D7%95%D7%AA_%D7%A2%D7%9C_%D7%A4%D7%99_%D7%A1%D7%A4%D7%A8_%D7%94%D7%97%D7%99%D7%A0%D7%95%D7%9A" page = requests.get(url) soup_body = BeautifulSoup(page.text, "lxml") tables = soup_body.select(".mw-parser-output > table") pairs = [] links = [] for table in tables: table_tr = table.select("tr") for col in table_tr: pairs.append((col.contents[1].text.strip(), re.sub(u'</?td>', u'', col.contents[-1].text).strip())) for pair in pairs: if re.search(u'ספר|מספר', pair[0]): continue neg_pos = u"Negative Mitzvot" if re.search(u"לאו", pair[1]) else u'Positive Mitzvot' rambam = getGematria(re.sub(u'עשה|לאו', u'', pair[1]).strip()) chinukh = getGematria(pair[0]) print chinukh, rambam chinukh_simanlen = len(Ref(u'Sefer HaChinukh.{}'.format(chinukh)).all_segment_refs()) print neg_pos link = ({"refs": [ u'Sefer HaChinukh.{}.{}-{}'.format(chinukh, 1, chinukh_simanlen), u'Mishneh Torah, {}.{}'.format(neg_pos, rambam) ], "type": "Sifrei Mitzvot", "auto": True, "generated_by": "chinukh_rambam_sfm_linker" # _sfm_linker what is this parametor intended to be? }) print link['refs'] links.append(link) return links
def parse_Raph_simanim(alinged_list):
    """Group aligned Raph rows into a siman-level jagged array.

    note: although there is (not often) a differentiation in the original txt
    file, raph letters can be divided into smaller segments. In this code we
    combined those segments, returning every raph letter as a line. Skipped
    simanim are padded with empty lists; the result is also dumped to
    raph_simanim.xml.
    """
    result = []
    current = []
    prev_siman = u'א'
    for row in alinged_list:
        if row['siman'] == prev_siman:
            # same siman: keep accumulating its letters
            current.append(row['raph'])
            continue
        result.append(current)
        # pad with empties until the gap to the new siman is closed
        gap = 1
        while getGematria(row['siman']) != (getGematria(prev_siman) + gap):
            result.append([])
            gap += 1
        current = [row['raph']]
        prev_siman = row['siman']
    result.append(current)
    ja_to_xml(result, ['siman', 'letter'], 'raph_simanim.xml')
    return result
def xmlify(filename): """ create an xml representation of the text files :param filename: str name of file """ with codecs.open(filename, 'r', 'utf-8') as infile: raw_rambam = infile.read() chap_index = [getGematria(i.group(1)) for i in re.finditer(ur'@00\u05e4\u05e8\u05e7 ([\u05d0-\u05ea]{1,2})', raw_rambam)] chapters = re.split(ur'@00\u05e4\u05e8\u05e7 [\u05d0-\u05ea]{1,2}', raw_rambam)[1:] assert len(chap_index) == len(chapters) soup = BeautifulSoup(u'<root></root>', 'xml') for index, chapter in zip(chap_index, chapters): x_chapter = soup.new_tag('chapter', num=unicode(index)) soup.root.append(x_chapter) v_indices = [getGematria(i.group(1)) for i in re.finditer(ur'@22([\u05d0-\u05ea]{1,2})', chapter)] verses = re.split(ur'@22[\u05d0-\u05ea]{1,2}', chapter)[1:] assert len(v_indices) == len(verses) for v_index, verse in zip(v_indices, verses): x_verse = soup.new_tag('verse', num=unicode(v_index)) comments = verse.splitlines() for i, comment in enumerate(comments[1:]): x_comment = soup.new_tag('comment', num=unicode(i+1)) x_comment.append(comment) x_verse.append(x_comment) x_chapter.append(x_verse) with codecs.open('./xml/{}'.format(filename.replace('.txt', '.xml')), 'w', 'utf-8') as outfile: outfile.write(unicode(soup.prettify()))
def siman_smk_exctractor(smk_text):
    """Extract Semak siman numbers from a citation string.

    Handles plain gematria words, vav-prefixed numerals (via ``check_vav``)
    and dashed ranges such as "א-ג" (expanded to every siman in the range).
    Returns a list of ints; returns early at the first unparseable word.
    """
    split = re.split(u'\s', smk_text)
    simanim = []
    for word in split:
        # skip empty tokens and the "siman"/"seif" label words
        if not word or word == u'סימן' or word == u'סעיף':
            continue
        word = re.sub(u"[;.,']", u"", word)
        if re.search(u'-', word):
            # dashed range: expand to every siman from start to end inclusive
            borders = re.search(u"(.*?)-(.*)", word)
            start = getGematria(borders.group(1))
            end = getGematria(borders.group(2))
            for siman in range(start, end+1):
                simanim.append(siman)
        # NOTE(review): a range word also falls through to the checks below;
        # since it is not itself a Hebrew numeral, this usually triggers the
        # early return and drops any words after the range — confirm intended.
        if not is_hebrew_number(word):
            if not check_vav(word):
                # print smk_text, simanim
                return simanim
            else:
                simanim.append(check_vav(word))
        else:
            smk_siman = getGematria(word)
            simanim.append(smk_siman)
    # print smk_text, simanim
    return simanim
def link_semak_raph(smk_ja, raph_ja):
    """Check alignment between Semak @55 markers and Raph letter segments.

    Zips the extracted @55 letters against the first segment of each Raph
    letter and prints every pair, flagging mismatches; ends with the total
    number of problems.
    """
    # if segment in smak_ja has a @55[\u05d0-\u05ea]{0,3} extract the letter
    # and match it to the segment in the ja_raph
    # by running on the ja_raph segments
    smk_raph = []
    raph_letter = []
    for seg in traverse_ja(smk_ja):
        if re.search(u'@55[\u05d0-\u05ea]{0,3}', seg['data']):
            for letter in re.findall(u'@55([\u05d0-\u05ea]{0,3})', seg['data']):
                # smk_raph.append([seg['indices'][:], letter])
                smk_raph.append([letter, seg['indices']])
    # deduplicate: keep only the first segment per (siman, letter)
    last = [-1, -1]
    for seg in traverse_ja(raph_ja):
        if seg['indices'][0:2] == last[0:2]:
            continue
        else:
            raph_letter.append(seg)
            last = seg['indices']
    problem_count = 0
    for smk, raph in zip(smk_raph, raph_letter):
        # letter value should equal the 1-based position within the Raph siman
        if getGematria(smk[0]) == (raph['indices'][1] + 1):
            print getGematria(smk[0]), raph['indices'][1]+1, \
                [item+1 for item in smk[1]], [item +1 for item in raph['indices']]
        else:
            problem_count += 1
            print 'problem:', getGematria(smk[0]), raph['indices'][1]+1,\
                [item+1 for item in smk[1]], [item +1 for item in raph['indices']]
    print problem_count
def chapter_verse(text_fragment):
    """Pull chapter and verse numbers out of a tagged text fragment.

    Expects a fragment of the form ``...B...-<chapter>-{<verse>}`` with Hebrew
    numerals; raises AttributeError when no marker is present.
    """
    marker = re.search(
        u'.*B.*-([\u05d0-\u05ea]{1,2})-\{([\u05d0-\u05ea]{1,2})\}',
        text_fragment)
    return {
        'chapter': util.getGematria(marker.group(1)),
        'verse': util.getGematria(marker.group(2)),
    }
def getMishnah(line):
    """Return the mishnah number encoded at the start of ``line``, or None.

    A line opening with "@22" carries the numeral in its first word;
    otherwise a single-letter first word (after dropping "@11") is read as a
    Hebrew numeral. Anything else yields None.
    """
    if line.startswith("@22"):
        token = line.split(" ")[0].replace("@22", "")
        return getGematria(token)
    first_word = line.replace("@11", "").split(" ")[0]
    if len(first_word) == 1:
        return getGematria(first_word)
    return None
def getGematriaVav(str): str = str.strip() str = re.sub(u'''"|''', u'', str) case_set = {270,272,274,275,298,304,344,670,672,698,744} # from trello card 'Letter transpositions' if str[0] == u'ו' and (is_hebrew_number(str[1:]) or (getGematria(str[1:]) in case_set)): return getGematria(str[1:]) elif is_hebrew_number(str) or getGematria(str) in case_set: # and not re.search(u'''מד"ס'''): or re.search(u'''('|")''', str) return getGematria(str) else: # mass.ErrorFile.write('error in pointer, not Gimatria...') print 'error in pointer, not Gimatria...', str
def walk_through_file(self, filename):
    """
    Derive and store references from a single file.

    Tracks the current siman (from @22 tags) and a running ``seif`` counter
    (one per @00 comment line). For each comment, parses the commentator and
    remote seif out of the stripped line; missing fields are inherited from
    ``previous_reference`` and successful parses update it. Results are
    appended to ``self.record_list``.

    :param filename: path of the utf-8 text file to scan
    :return: None (side effect: extends self.record_list)
    """
    tester = Tester()
    previous_reference, seif = None, 0
    with codecs.open(filename, 'r', 'utf-8') as fp:
        lines = fp.readlines()
    for line in lines:
        if tester(re.search(u'@22([\u05d0-\u05ea]{1,3})', line)):
            # new siman: reset the local seif counter and the fallback reference
            siman = getGematria(tester.match.group(1))
            seif = 0
            previous_reference = self.get_default_reference(siman)
        if re.match(u'^@00\(', line):
            seif += 1
            reference = {
                u'siman': siman,
                u'local-seif': seif,
                u'remote-seif': None,
                u'comments-on': None,
                u'raw-text': line
            }
            # keep only Hebrew letters and spaces, then drop a leading
            # "סי(מן) <numeral>" prefix before matching the reference pattern
            stripped = re.sub(u'[^\u05d0-\u05ea ]', u'', line)
            stripped = re.sub(
                u'^\u05e1\u05d9(?:\u05de\u05df)?\s([\u05d0-\u05ea]{1,3})\s?',
                u'', stripped)
            ref_match = self.reference_regex.match(stripped)
            if not ref_match:
                print u"No match found for:"
                print line
                continue
            reference[u'comments-on'] = self.get_commentator(ref_match)
            reference[u'remote-seif'] = \
                None if ref_match.group(u'seif') is None else getGematria(ref_match.group(u'seif'))
            # inherit missing fields from the previous reference; otherwise
            # push the newly-found value into it for later lines
            if reference[u'comments-on'] is None:
                reference[u'comments-on'] = previous_reference[
                    u'comments-on']
            else:
                previous_reference[u'comments-on'] = reference[
                    u'comments-on']
            if reference[u'remote-seif'] is None:
                reference[u'remote-seif'] = previous_reference[
                    u'remote-seif']
            else:
                previous_reference[u'remote-seif'] = reference[
                    u'remote-seif']
            if reference[u'remote-seif'] is None:
                print u'No remote seif for {} {}'.format(
                    reference[u'siman'], reference[u'local-seif'])
            self.record_list.append(reference)
def getGematriaVav(str, mass):
    """Convert a Hebrew numeral to its value, tolerating a leading vav.

    Variant that reports failures via ``mass.write_shgia`` instead of
    printing. Words matching "בהגה(ה)" are silently ignored (expected in the
    data, not numerals). NOTE: the parameter shadows the builtin ``str``;
    kept for interface compatibility.
    """
    str = str.strip()
    str = re.sub(u'''"|''', u'', str)
    case_set = {270,272,274,275,298,304,344,670,672,698,744}  # from trello card 'Letter transpositions'
    if not str:
        # BUGFIX: empty / whitespace-only input previously raised IndexError
        # at str[0]; log it like any other non-numeral instead.
        mass.write_shgia('error in pointer, not Gimatria...' + str)
        return
    if str[0] == u'ו' and (is_hebrew_number(str[1:]) or (getGematria(str[1:]) in case_set)):
        return getGematria(str[1:])
    elif is_hebrew_number(str) or getGematria(str) in case_set:  # and not re.search(u'''מד"ס'''): or re.search(u'''('|")''', str)
        return getGematria(str)
    elif re.search(u'בהגהה?', str):
        # this is not gimatria but there is no need to send an error about it each time...
        return
    else:
        mass.write_shgia('error in pointer, not Gimatria...'+ str)
def create_alt_struct_dict(rabbeinu_bahya_text_file, the_regex):
    """Build perek.pasuk.comment range strings for an alternate structure.

    Walks the tagged Rabbeinu Bahya file tracking the first and current
    perek (@01), pasuk (@22) and per-pasuk comment counts; @00/@77 markers
    close out a range, @99 resets the trackers. Returns the list of
    'p.v.c-p.v.c' range strings.
    NOTE(review): reconstructed from a single collapsed line — the exact
    nesting around ``new_comment`` is ambiguous; ``right_after_99`` is
    assigned but never read, and ``new_comment`` is never set False, so the
    ``if new_comment`` branch always fires. Confirm against the original.
    """
    first_perek, first_pasuk, current_perek, current_pasuk = 0, 0, 0, 0
    second_to_last_pasuk, second_to_last_comment_number = 0, 0
    first_comment_number, current_comment_number = 0, 0
    new_first_perek, new_first_pasuk, new_comment = True, True, True
    list_of_ranges = []
    with codecs.open(rabbeinu_bahya_text_file, 'r', 'utf-8') as the_file:
        for each_line in the_file:
            if "@99" in each_line:
                #list_of_ranges.append('{}.{}.{}-{}.{}.{}'.format(first_perek, first_pasuk, first_comment_number, current_perek, current_pasuk, current_comment_number))
                new_first_perek, new_first_pasuk, right_after_99 = True, True, True
                first_perek = 0
            elif "@00" in each_line:
                # close the running range at the previous comment count
                list_of_ranges.append('{}.{}.{}-{}.{}.{}'.format(first_perek, first_pasuk, first_comment_number, current_perek, current_pasuk, second_to_last_comment_number))
                new_first_perek, new_first_pasuk = True, True
            elif "@77" in each_line:
                list_of_ranges.append('{}.{}.{}-{}.{}.{}'.format(first_perek, first_pasuk, first_comment_number, current_perek, current_pasuk, second_to_last_comment_number))
                #new_first_perek, new_first_pasuk = True, True
            elif "@01" in each_line:
                matchObject = the_regex.search(each_line)
                if new_first_perek:
                    matchObject = the_regex.search(each_line)
                    first_perek = util.getGematria(matchObject.group(1))
                    new_first_perek = False
                current_perek = util.getGematria(matchObject.group(1))
            elif "@22" in each_line:
                matchObject = the_regex.search(each_line)
                if new_first_pasuk:
                    matchObject = the_regex.search(each_line)
                    first_pasuk = util.getGematria(matchObject.group(1))
                    new_first_pasuk = False
                    new_comment = True
                if new_comment:
                    first_comment_number = current_comment_number
                second_to_last_pasuk = current_pasuk
                current_pasuk = util.getGematria(matchObject.group(1))
                second_to_last_comment_number = current_comment_number
                current_comment_number = 0
            else:
                # plain text line: another comment in the current pasuk
                current_comment_number += 1
    return list_of_ranges
def getGematriaVav(str): str = str.strip() str = re.sub(u'''"|''', u'', str) case_set = {270, 272, 274, 275, 298, 304, 344, 670, 672, 698, 744} # from trello card 'Letter transpositions' if str[0] == u'ו' and (is_hebrew_number(str[1:]) or (getGematria(str[1:]) in case_set)): return getGematria(str[1:]) elif is_hebrew_number(str) or getGematria( str ) in case_set: # and not re.search(u'''מד"ס'''): or re.search(u'''('|")''', str) return getGematria(str) else: # mass.ErrorFile.write('error in pointer, not Gimatria...') print 'error in pointer, not Gimatria...', str
def grab_rashis(self):
    """Collect Siftei Hakhamim markers per Rashi comment for each verse span.

    Scans span elements with id 'katom'; for each, extracts the verse number
    (a parenthesized Hebrew numeral) and, per non-empty Rashi line, the
    bracketed marker letters. Returns a list of dicts with 'verse_number',
    'comments' and 'total_rashis'.
    """
    rashis = []
    for span in self.parsed_html.find_all('span', id='katom'):
        if span.text == u'\n':
            continue
        verse = {'comments': []}
        # grab the verse number
        match = re.search(u'\(([\u05d0-\u05ea]{1,2})\)', span.text)
        if match is None:
            verse['verse_number'] = '<unknown>'
        else:
            verse['verse_number'] = util.getGematria(match.group(1))
        structured_rashi = self.structure_rashi(span.text)
        for line in structured_rashi:
            # BUGFIX: was `line is not u''` — identity comparison on strings
            # only works by interning accident; compare by equality.
            if line != u'':
                # add all Siftei Hakhamim in an array according to each Rashi comment.
                verse['comments'].append(re.findall(u'\[([\u05d0-\u05ea])\]', line))
        verse['total_rashis'] = len(structured_rashi)
        rashis.append(verse)
    return rashis
def grab_rashis(self):
    """Collect Siftei Hakhamim markers per Rashi comment for each verse span.

    For every 'katom' span: read the verse number from a parenthesized Hebrew
    numeral, then gather the bracketed marker letters of each non-empty Rashi
    line. Returns a list of per-verse dicts ('verse_number', 'comments',
    'total_rashis').
    """
    rashis = []
    for span in self.parsed_html.find_all('span', id='katom'):
        if span.text == u'\n':
            continue
        verse = {'comments': []}
        # grab the verse number
        match = re.search(u'\(([\u05d0-\u05ea]{1,2})\)', span.text)
        if match is None:
            verse['verse_number'] = '<unknown>'
        else:
            verse['verse_number'] = util.getGematria(match.group(1))
        structured_rashi = self.structure_rashi(span.text)
        for line in structured_rashi:
            # BUGFIX: was `line is not u''` — string identity comparison is
            # unreliable (depends on interning); use equality.
            if line != u'':
                # add all Siftei Hakhamim in an array according to each Rashi comment.
                verse['comments'].append(
                    re.findall(u'\[([\u05d0-\u05ea])\]', line))
        verse['total_rashis'] = len(structured_rashi)
        rashis.append(verse)
    return rashis
def find_skips(filename): """ Looks for skipped comments. :param filename: File to scan """ parser = TextParser(filename) offset = 0 total_errors = 0 for chapter in parser.chapter_strings: chap_number = util.getGematria( parser.chap_reg.search(chapter).group(1)) if chap_number == 1: offset = 0 comments = parser.comment_reg.findall(chapter) comment_values = [letters[comment[1]] for comment in comments] sequence = modulo_sequence(comment_values, 22, offset) offset = comment_values[-1] + 1 if sequence['in_order']: continue else: print 'error in chapter {}'.format(chap_number) for error in sequence['errors']: print 'previous: {} expected: {} found: {}'.format( error['previous'], error['expected'], error['found']) total_errors += len(sequence['errors']) print 'total errors: {}'.format(total_errors)
def align_comments(text_array):
    """Group @11-tagged words into word lists keyed by section number.

    :param text_array: list of text lines
    :return: dict mapping section gematria -> list of tag-stripped words,
        with a u'\\n' separator appended whenever a section repeats.
    """
    # strip out unnecessary lines (@99 markers).
    # BUGFIX: the original deleted from text_array while enumerating it,
    # which skips the element following every removed line; filter instead.
    remove = re.compile(u'@99')
    text_array = [line for line in text_array if not remove.search(line)]
    section_name, result = '', {}
    t = u''.join(text_array)
    t = t.replace(u'\n', u'')
    t = t.replace(u'\r', u'')
    t = t.split(u' ')
    for word in t:
        # BUGFIX: quantifier moved inside the group — previously group(1)
        # captured only the last letter of a multi-letter section number.
        search = re.search(u'@11([\u05d0-\u05ea"]{1,4})\*?\)', word)
        if search:
            section_name = getGematria(search.group(1).replace(u'"', u''))
            if section_name in result.keys():
                # repeated section: separate the new run from the earlier one
                result[section_name].append(u'\n')
        if section_name not in result.keys():
            result[section_name] = []
        result[section_name].append(re.sub(u'@[0-9]{2}', u'', word))
    # BUGFIX: the return was commented out, so callers always got None.
    return result
def find_skips(filename): """ Looks for skipped comments. :param filename: File to scan """ parser = TextParser(filename) offset = 0 total_errors = 0 for chapter in parser.chapter_strings: chap_number = util.getGematria(parser.chap_reg.search(chapter).group(1)) if chap_number == 1: offset = 0 comments = parser.comment_reg.findall(chapter) comment_values = [letters[comment[1]] for comment in comments] sequence = modulo_sequence(comment_values, 22, offset) offset = comment_values[-1]+1 if sequence['in_order']: continue else: print 'error in chapter {}'.format(chap_number) for error in sequence['errors']: print 'previous: {} expected: {} found: {}'.format( error['previous'], error['expected'], error['found']) total_errors += len(sequence['errors']) print 'total errors: {}'.format(total_errors)
def get_civil_year(year_line, book):
    """
    JN are named by year. The he_title can be lifted directly from the text;
    this function converts the Hebrew year numerals to their full-era values.
    The conversion is not exact, as an exact mapping of Parsha - Date is not
    available at this time.

    :param year_line: A line of text from which year data is extracted. May
        contain multiple years (i.e. תרל"ז-תרל"ח)
    :param book: What book this is taken from (i.e. Genesis, Exodus etc.).
        Currently unused — kept for interface compatibility; it was meant to
        drive a per-book "typical date" correction that is not implemented.
    :return: civil year(s), joined with '; '
    """
    # gematria years omit the millennium, so add 5000
    he_years = [
        util.getGematria(match) + 5000
        for match in re.findall(u'[\u05d0-\u05ea"]{4,5}', year_line)
    ]
    return '; '.join(str(year) for year in he_years)
def check_segments(): segments = [] infile = codecs.open(filename, 'r', 'utf-8') headers = TagTester(u'@30', infile, u'@30מצוה ([\u05d0-\u05ea"]{1,5})').grab_each_header() tester = TagTester(u'@44', infile, u'@44\(([\u05d0-\u05ea]{1,2})\)') while not tester.eof: segments.append( tester.grab_each_header(u'@30מצוה ([\u05d0-\u05ea"]{1,5})', 1)) infile.close() for sec_number, section in enumerate(segments): index = 1 for title in section: title = title.replace(u'"', u'') count = util.getGematria(title) if count != index: print headers[sec_number - 1] print util.numToHeb(index) index = count index += 1
def check_segments(): segments = [] infile = codecs.open(filename, 'r', 'utf-8') headers = TagTester(u'@30', infile, u'@30מצוה ([\u05d0-\u05ea"]{1,5})').grab_each_header() tester = TagTester(u'@44', infile, u'@44\(([\u05d0-\u05ea]{1,2})\)') while not tester.eof: segments.append(tester.grab_each_header(u'@30מצוה ([\u05d0-\u05ea"]{1,5})', 1)) infile.close() for sec_number, section in enumerate(segments): index = 1 for title in section: title = title.replace(u'"', u'') count = util.getGematria(title) if count != index: print headers[sec_number-1] print util.numToHeb(index) index = count index += 1
def file_to_ja_g(depth, infile, expressions, cleaner, grab_all=False):
    """
    Designed to be the first stage of a reusable parsing tool. Adds lines of
    text to the Jagged Array in the desired structure (Chapter, verse, etc.)
    This function is a modulation of the original file_to_ja because it deals
    with gimatria letters, so as to place the correct chapters and segments in
    the correct places according to the hebrew letter numbering. Of course it
    also puts in the padding where needed (_g stands for Gimatria).

    :param depth: depth of the JaggedArray.
    :param infile: Text file to read from
    :param expressions: A list of regular expressions with which to identify
        section (chapter) level. Do not include an expression with which to
        break up the segment levels. Each must define a named group 'gim'.
    :param cleaner: A function that takes a list of strings and returns an
        array with the text parsed correctly. Should also break up and remove
        unnecessary tagging data.
    :param grab_all: If set to true, will grab the lines indicating new
        sections.
    :return: A jagged_array with the text properly structured.

    NOTE(review): reconstructed from collapsed source — the error-message
    literal was split mid-string in the original view; confirm wording.
    """
    # instantiate ja
    # structure = reduce(lambda x,y: [x], range(depth-1), [])
    # ja = JaggedArray(structure)
    ja = JaggedArray([])
    # ensure there is a regex for every level except the lowest
    if depth - len(expressions) != 1:
        raise AttributeError('Not enough data to parse. '
                             'Need {} expressions, '
                             'received {}'.format(depth-1, len(expressions)))
    # compile regexes, instantiate index list
    regexes, indices = [re.compile(ex) for ex in expressions], [-1]*len(expressions)
    temp = []
    # loop through file
    for line in infile:
        # check for matches to the regexes
        for i, reg in enumerate(regexes):
            found = reg.search(line)
            if found:
                # check that we've hit the first chapter and verse
                if indices.count(-1) == 0:
                    # flush the accumulated segment before moving on
                    ja.set_element(indices, cleaner(temp), [])
                    temp = []
                if grab_all:
                    temp.append(line)
                gimt = getGematria(found.group('gim'))
                if gimt != 0:
                    # explicit numeral: jump straight to that (0-based) index
                    indices[i] = gimt - 1
                else:
                    indices[i] += 1
                # reset all deeper levels that have already been entered
                indices[i+1:] = [-1 if x >= 0 else x for x in indices[i+1:]]
                break
        else:
            # ordinary text line: accumulate once all levels are established
            if indices.count(-1) == 0:
                temp.append(line)
    # flush the final segment
    ja.set_element(indices, cleaner(temp), [])
    return ja
def identify_star_locations(filename):
    """Locate @11(*) star markers in a tagged file.

    Tracks the current siman (@12) and seif (@11 letter) while scanning; each
    run of consecutive stars is reported with the seif before and after it.
    Returns a list of dicts with siman_num, preceding/following index and
    letter, and star_count.
    """
    def get_regex():
        # single alternation of named groups; match.lastgroup identifies the tag
        partial_regexes = [
            u'@12([\u05d0-\u05ea]{1,3})', u'@11([\u05d0-\u05ea])', u'@11(\*)'
        ]
        names = [u'siman', u'seif', u'star']
        my_full_regexes = [
            u'(?P<{}>{})'.format(*i) for i in zip(names, partial_regexes)
        ]
        return re.compile(u'|'.join(my_full_regexes))
    with codecs.open(filename, 'r', 'utf-8') as infile:
        lines = infile.readlines()
    siman, seif_index, seif_letter, num_stars = -1, -1, None, 0
    star_locations, current_star = [], {}
    line_regex = get_regex()
    for line in lines:
        line_data = line_regex.search(line)
        if line_data is None:
            continue
        elif line_data.lastgroup == u'star':
            num_stars += 1
            current_star = {
                u'siman_num': siman,
                u'preceding_index': seif_index,
                u'preceding_letter': seif_letter
            }
        else:
            if line_data.lastgroup == u'seif':
                seif_index += 1
                # the inner capture (the letter) sits right after the named group
                seif_letter = line_data.group(line_data.lastindex + 1)
            elif line_data.lastgroup == u'siman':
                siman = getGematria(line_data.group(line_data.lastindex + 1))
                seif_index = -1
                seif_letter = None
            else:
                raise LookupError(u"Expecting seif or siman, got {}".format(
                    line_data.lastgroup))
            # first non-star tag closes out any pending star run
            if num_stars >= 1:
                current_star[u'star_count'] = num_stars
                current_star[u'following_index'] = seif_index
                current_star[u'following_letter'] = seif_letter
                star_locations.append(current_star)
                num_stars = 0
    else:
        # file ended while inside a star run
        if num_stars >= 1:
            current_star[u'star_count'] = num_stars
            current_star[u'following_index'] = 0
            current_star[u'following_letter'] = None
            star_locations.append(current_star)
    return star_locations
def fill_in_missing_sections_and_update_last(each_line, base_list, this_regex, filler, last_index):
    """Pad ``base_list`` for sections skipped between ``last_index`` and the
    section number found on ``each_line``.

    Appends ``filler`` once per missing section (mutates ``base_list`` in
    place) and returns the newly-found index.
    """
    found = this_regex.search(each_line)
    current_index = util.getGematria(found.group(1))
    for _ in range(current_index - last_index - 1):
        base_list.append(filler)
    return current_index
def fill_in_missing_sections_and_updated_last(each_line, base_list, this_regex, filler, last_index):
    """Append ``filler`` to ``base_list`` once for every section number
    skipped between ``last_index`` and the one found on ``each_line``.

    Mutates ``base_list`` in place; returns the new current index.
    """
    current_index = util.getGematria(this_regex.search(each_line).group(1))
    gap = current_index - last_index
    while gap > 1:
        base_list.append(filler)
        gap -= 1
    return current_index
def seferHamitzvot_from_rasag_comm(rasagCsvName, with_orig = False):
    """Mine Rasag-commentary segments for Sefer HaMitzvot / Semag / Semak
    citations and write them to a CSV.

    :param rasagCsvName: output CSV path passed to ``toCsv``.
    :param with_orig: when True, also writes a companion row holding the raw
        matched citation strings next to each parsed row.
    Side effects: writes the CSV and prints per-source match counts.
    """
    # ind_rasag_comm = library.get_index("Commentary on Sefer Hamitzvot of Rasag")
    segments = Ref('Commentary_on_Sefer_Hamitzvot_of_Rasag,_Positive_Commandments').all_segment_refs()
    segments.extend(Ref('Commentary_on_Sefer_Hamitzvot_of_Rasag,_Negative_Commandments').all_segment_refs())
    segments.extend(Ref('Commentary_on_Sefer_Hamitzvot_of_Rasag,_Laws_of_the_Courts').all_segment_refs())
    segments.extend(Ref('Commentary_on_Sefer_Hamitzvot_of_Rasag,_Communal_Laws').all_segment_refs())
    cnt = {"Rasag":0, "Sefer HaMitzvot":0, "Semag":0, "Semak":0}
    dict_list = []
    for seg in segments:
        # look for the three citation forms in the segment's Hebrew text
        # sfHmtzvot = re.search(u'(?:ספר המצו?ות|סה"מ).{1,4}(עשין|לאוין|עשה|לא תעשה).{0,20}', seg.text('he').text)
        sfHmtzvot = re.search(u'(?:ספר המצוות|סה"מ)\s{1,4}\((.*?)\)', seg.text('he').text)
        smg = re.search(u'סמ"ג \((.*?)\)', seg.text('he').text)
        smk = re.search(u'סמ"ק (\(.*?\))', seg.text('he').text)
        row_dict = {}
        row_orig = {}
        if sfHmtzvot:
            # row_dict["Rasag"] = re.search("(Sefer.*?\d*?):", seg.normal()).group(1)
            # row_orig["Rasag"] = re.search("(Sefer.*?\d*?):", seg.normal()).group(1)
            kind, simanim = rasag_exctractor(sfHmtzvot.group(1))
            # row_dict["Sefer HaMitzvot"] = ['Sefer HaMitzvot, {}.{}'.format(kind, siman) for siman in simanim]
            if kind:
                # only the first siman of the citation is linked
                row_dict["Sefer HaMitzvot"] = 'Sefer HaMitzvot, {}.{}'.format(kind, simanim[0])
            else:
                print "no kind", sfHmtzvot.group(1)
            row_orig["Sefer HaMitzvot"] = sfHmtzvot.group()
            cnt["Sefer HaMitzvot"] += 1
        if smg:
            # row_dict["Rasag"] = re.search("(Sefer.*?\d*?):", seg.normal()).group(1)
            kind, simanim = rasag_exctractor(smg.group(1))
            # row_dict["Semag"] = ['Sefer Mitzvot Gadol, {}.{}'.format(kind, siman) for siman in simanim]
            if kind:
                row_dict["Semag"] = 'Sefer Mitzvot Gadol, {}.{}'.format(kind, simanim[0])
            else:
                print "no kind", smg.group(1)
            row_orig["Semag"] = smg.group()
            cnt["Semag"] += 1
        if smk:
            # row_dict["Rasag"] = re.search("(Sefer.*?\d*?):", seg.normal()).group(1)
            # simanim = siman_smk_exctractor(smk.group(1))
            smki = re.search(u"ב?סי'\s+(.*?)(?:\s*\))", smk.group(1))
            if smki:
                siman = getGematria(smki.group(1))
                row_dict["Semak"] = "Sefer Mitzvot Katan.{}".format(siman)
                row_orig["Semak"] = smk.group()
                cnt["Semak"] += 1
            else:
                print u'***siman***' + smk.group()
        if row_dict:
            # at least one citation parsed: record the Rasag source segment
            cnt["Rasag"] += 1
            row_dict["Rasag"] = re.search("(Sefer.*?\d*?):", seg.normal()).group(1)
            row_orig["Rasag"] = seg.normal()
            if with_orig:
                dict_list.append(row_orig)
            dict_list.append(row_dict)
    toCsv(rasagCsvName, ["Rasag", "Sefer HaMitzvot", "Semag", "Semak"], dict_list)
    print cnt
def identify_errors(siman, pattern, sequence_code):
    """Find numbering problems in a siman's tag sequence.

    Compares each matched numeral against its predecessor and successor to
    classify: duplicates (ignored), a missing tag (gap of exactly one value
    with the following tag back in step), or an out-of-place tag (the value
    around it is consistent but the tag itself is not). Returns a list of
    error dicts tagged with ``sequence_code``.
    """
    errors = []
    matches = list(re.finditer(pattern, siman))
    previous = 0
    jump_ahead = False
    for i, match in enumerate(matches):
        if jump_ahead:
            # the previous iteration consumed this match as its "following"
            jump_ahead = False
            continue
        try:
            current, following = getGematria(match.group(1)), getGematria(
                matches[i + 1].group(1))
        except IndexError:
            # no following match: cannot classify the last tag
            break
        if current - previous == 0:
            # double tag
            previous = current
            continue
        elif current - previous == 2 and following - current == 1:
            # missing tag
            error = {
                u'type': u'missing',
                u'from_sequence': sequence_code,
                u'value': current - 1,
            }
            if i == 0:
                error[u'range'] = (0, match.start())
            else:
                error[u'range'] = (matches[i - 1].end(), match.start())
            errors.append(error)
            previous = current
            continue
        elif following - previous == 1 and current - previous != 1:
            # out of place
            errors.append({
                u'type': u'out_of_place',
                u'from_sequence': sequence_code,
                u'value': current,
                u'tag': match.group(),
                u'loc': match.start()
            })
            # the following tag is in sequence; adopt it and skip it next turn
            previous = following
            jump_ahead = True
        else:
            previous = current
    return errors
def check_vav(st):
    """Return the gematria of ``st`` minus a leading vav, or False.

    A value is produced only when the word starts with 'ו' and the remainder
    is a valid Hebrew numeral; empty strings and everything else yield False.
    """
    if st and st[0] == u'ו' and is_hebrew_number(st[1:]):
        return getGematria(st[1:])
    return False
def scrape_wiki():
    """Scrape the Hebrew Wikipedia mitzvot-count tables into link records.

    Each table row pairs a Sefer HaChinukh siman with a Rambam citation;
    qualifying rows become Sifrei-Mitzvot link dicts. Returns the list.
    """
    url = u"https://he.wikipedia.org/wiki/%D7%9E%D7%A0%D7%99%D7%99%D7%9F_%D7%94%D7%9E%D7%A6%D7%95%D7%95%D7%AA_%D7%A2%D7%9C_%D7%A4%D7%99_%D7%A1%D7%A4%D7%A8_%D7%94%D7%97%D7%99%D7%A0%D7%95%D7%9A"
    page = requests.get(url)
    soup_body = BeautifulSoup(page.text, "lxml")
    tables = soup_body.select(".mw-parser-output > table")
    pairs = []
    links = []
    for table in tables:
        table_tr = table.select("tr")
        for col in table_tr:
            # (chinukh numeral cell, rambam citation cell) per row
            pairs.append((col.contents[1].text.strip(), re.sub(u'</?td>', u'', col.contents[-1].text).strip()))
    for pair in pairs:
        # skip header / book-title rows that carry no mitzvah number
        if re.search(u'ספר|מספר', pair[0]):
            continue
        neg_pos = u"Negative Mitzvot" if re.search(
            u"לאו", pair[1]) else u'Positive Mitzvot'
        rambam = getGematria(re.sub(u'עשה|לאו', u'', pair[1]).strip())
        chinukh = getGematria(pair[0])
        print chinukh, rambam
        # link spans every segment of the Chinukh siman
        chinukh_simanlen = len(
            Ref(u'Sefer HaChinukh.{}'.format(chinukh)).all_segment_refs())
        print neg_pos
        link = ({
            "refs": [
                u'Sefer HaChinukh.{}.{}-{}'.format(chinukh, 1,
                                                   chinukh_simanlen),
                u'Mishneh Torah, {}.{}'.format(neg_pos, rambam)
            ],
            "type": "Sifrei Mitzvot",
            "auto": True,
            "generated_by": "chinukh_rambam_sfm_linker"  # _sfm_linker what is this parametor intended to be?
        })
        print link['refs']
        links.append(link)
    return links
def parse(file_name):
    """Parse a @00-chaptered file, special-casing chapters 7 and 9.

    Ordinary chapters collect cleaned lines into flat comment lists; chapter
    7 is split by @01 into an intro plus shorashim sub-lists, and chapter 9
    is split by @01 into sub-lists. Returns the assembled ``section`` list.
    """
    chapter_number = regex.compile('@00([\u05d0-\u05ea]{1,2})')
    chapter_index = 1
    section, comment = [], []
    seven, shorashim, nine = [], [], []
    chapter_seven_intro = True
    with codecs.open(file_name, 'r', 'utf-8') as the_file:
        for each_line in the_file:
            if "@00" in each_line:
                # chapter boundary: flush whatever the previous chapter built
                if chapter_index != 7 and chapter_index != 9:
                    section.append(comment)
                    comment = []
                elif chapter_index == 7:
                    shorashim.append(comment)
                    seven.append(shorashim)
                    section.append(seven)
                    comment = []
                elif chapter_index == 9:
                    nine.append(comment)
                    section.append(nine)
                    comment = []
                match_object = chapter_number.search(each_line)
                chapter_index = util.getGematria(match_object.group(1))
            elif chapter_index != 7 and chapter_index != 9:
                each_line = clean_up(each_line)
                comment.append(each_line)
            elif chapter_index == 7:
                if "@01" in each_line:
                    # first @01 ends the chapter-7 intro; later ones split shorashim
                    if chapter_seven_intro:
                        seven.append(comment)
                        comment = []
                        chapter_seven_intro = False
                    else:
                        shorashim.append(comment)
                        comment = []
                else:
                    comment.append(each_line)
            elif chapter_index == 9:
                if "@01" in each_line:
                    nine.append(comment)
                    comment = []
                else:
                    comment.append(each_line)
        # flush the final chapter's trailing comment list
        section.append(comment)
    return section
def link_hg(hg_ja, hagahot_dict_lst, ja_raph):
    """Link each Haggahot Chadashot note to its Semak or Raph segment.

    Walks the haggahot of every siman with two running pointers (``pts`` into
    the Semak letter list, ``ptr`` into the Raph letter list) and emits an
    inline-commentary link for whichever source's letter matches the @11 tag
    of the note. Returns the list of link dicts.
    NOTE(review): ``ja_raph`` is accepted but never read in this block —
    confirm against the caller. Indentation reconstructed from collapsed
    source; the pointer bookkeeping is order-sensitive.
    """
    def link_hg_smk_or_raph(siman, smk_seg, hg, place_smk_hg, base_text):
        # build one inline-reference link record
        link = (
            {
                "refs": [
                    u"{} {}:{}".format(base_text, siman, smk_seg),
                    "Haggahot Chadashot on Sefer Mitzvot Katan {}:{}".format(siman, hg),  # really should be a ref link to the whole raph
                ],
                "type": "commentary",
                'inline_reference': {
                    'data-commentator': 'Haggahot Chadashot on Sefer Mitzvot Katan',
                    'data-order': place_smk_hg
                },
                "auto": True,
                "generated_by": "semak_parser"
            })
        return link
    # linking
    links = []
    smks = []
    raphs = []
    for dict in hagahot_dict_lst:
        smks += dict["smk"]
        raphs += dict["raph"]
    pts = 0
    ptr = 0
    link = None
    for dict in hagahot_dict_lst:
        # link all the haghot in a siman to the correct Semak segment
        pts_0 = 0
        ptr_0 = 0
        sim = getGematria(dict["siman"])
        # print sim
        for j, hgha in enumerate(hg_ja[sim-1]):
            # when the same letter heads both queues, prefer the Raph side
            # if this siman actually has that Raph letter
            smk_first = True
            if ptr < len(raphs) and smks[pts][0] == raphs[ptr][0]:
                if dict["raph"] and any([re.search(raphs[ptr][0], letter[0]) for letter in dict["raph"]]):
                    smk_first = False
            if smk_first and re.search(u"@11\({}\)".format(smks[pts][0]), hgha):  # pts < len(smks)
                link = link_hg_smk_or_raph(sim, smks[pts][1], j+1, pts_0+1, "Sefer Mitzvot Katan")
                pts += 1
                pts_0 += 1
            elif ptr < len(raphs) and re.search(u"@11\({}\)".format(raphs[ptr][0]), hgha):
                link = link_hg_smk_or_raph(sim, raphs[ptr][1], j+1, ptr_0+1, 'Haggahot Rabbeinu Peretz on Sefer Mitzvot Katan')
                ptr += 1
                ptr_0 += 1
            else:
                print u"error {}: something with the numbering is wrong...".format(dict["siman"])
            if link:
                links.append(link)
    return links
def fill_in_missing_sections_and_updated_last(each_line, base_list, this_regex, filler, last_index):
    """Variant filler for lines whose regex group holds a whitespace-separated
    list of mitzvot; the first entry determines the current index.

    Appends ``filler`` to ``base_list`` once per skipped section (in-place)
    and returns the new current index.
    """
    found = this_regex.search(each_line)
    first_mitzvah = found.group(1).strip().split()[0]
    current_index = util.getGematria(first_mitzvah)
    for _ in range(current_index - last_index - 1):
        base_list.append(filler)
    return current_index
def identify_star_locations(filename):
    """Find runs of @11(*) star markers and their surrounding seifim.

    Maintains the current siman (@12) and seif (@11 letter) while reading the
    file; when a star run ends (at the next non-star tag, or at EOF) a record
    of its location and length is appended. Returns the list of records.
    """
    def get_regex():
        # one alternation of named groups; lastgroup says which tag matched
        partial_regexes = [u'@12([\u05d0-\u05ea]{1,3})', u'@11([\u05d0-\u05ea])', u'@11(\*)']
        names = [u'siman', u'seif', u'star']
        my_full_regexes = [u'(?P<{}>{})'.format(*i) for i in zip(names, partial_regexes)]
        return re.compile(u'|'.join(my_full_regexes))
    with codecs.open(filename, 'r', 'utf-8') as infile:
        lines = infile.readlines()
    siman, seif_index, seif_letter, num_stars = -1, -1, None, 0
    star_locations, current_star = [], {}
    line_regex = get_regex()
    for line in lines:
        line_data = line_regex.search(line)
        if line_data is None:
            continue
        elif line_data.lastgroup == u'star':
            num_stars += 1
            current_star = {
                u'siman_num': siman,
                u'preceding_index': seif_index,
                u'preceding_letter': seif_letter
            }
        else:
            if line_data.lastgroup == u'seif':
                seif_index += 1
                # the letter capture immediately follows the named group
                seif_letter = line_data.group(line_data.lastindex+1)
            elif line_data.lastgroup == u'siman':
                siman = getGematria(line_data.group(line_data.lastindex+1))
                seif_index = -1
                seif_letter = None
            else:
                raise LookupError(u"Expecting seif or siman, got {}".format(line_data.lastgroup))
            # any non-star tag terminates a pending star run
            if num_stars >= 1:
                current_star[u'star_count'] = num_stars
                current_star[u'following_index'] = seif_index
                current_star[u'following_letter'] = seif_letter
                star_locations.append(current_star)
                num_stars = 0
    else:
        # EOF reached while a star run was still open
        if num_stars >= 1:
            current_star[u'star_count'] = num_stars
            current_star[u'following_index'] = 0
            current_star[u'following_letter'] = None
            star_locations.append(current_star)
    return star_locations
def rasag_exctractor(text):
    """Extract the commandment kind and siman numbers from a Rasag citation.

    The first whitespace-separated token names the list (negative/positive
    commandments); every following token is a Hebrew numeral.

    :param text: citation string, e.g. u'לאוין א ב ג'.
    :return: (kind, simanim) where kind is u'Negative Commandments',
        u'Positive Commandments' or None, and simanim is a list of ints.
    """
    split = re.split(u"\s", text)
    kind = None
    # BUG FIX: the original patterns used (:?...) — a typo for the
    # non-capturing group (?:...). With re.search the matches happened to
    # coincide, but the pattern was wrong as written.
    if re.search(u"(?:לאוין|לא תעשה)", split[0]):
        kind = u'Negative Commandments'
    elif re.search(u"(?:עשין|עשה)", split[0]):
        kind = u'Positive Commandments'
    simanim = [getGematria(word) for word in split[1:]]
    return kind, simanim
def xmlify(filename): """ create an xml representation of the text files :param filename: str name of file """ with codecs.open(filename, 'r', 'utf-8') as infile: raw_rambam = infile.read() chap_index = [ getGematria(i.group(1)) for i in re.finditer( ur'@00\u05e4\u05e8\u05e7 ([\u05d0-\u05ea]{1,2})', raw_rambam) ] chapters = re.split(ur'@00\u05e4\u05e8\u05e7 [\u05d0-\u05ea]{1,2}', raw_rambam)[1:] assert len(chap_index) == len(chapters) soup = BeautifulSoup(u'<root></root>', 'xml') for index, chapter in zip(chap_index, chapters): x_chapter = soup.new_tag('chapter', num=unicode(index)) soup.root.append(x_chapter) v_indices = [ getGematria(i.group(1)) for i in re.finditer(ur'@22([\u05d0-\u05ea]{1,2})', chapter) ] verses = re.split(ur'@22[\u05d0-\u05ea]{1,2}', chapter)[1:] assert len(v_indices) == len(verses) for v_index, verse in zip(v_indices, verses): x_verse = soup.new_tag('verse', num=unicode(v_index)) comments = verse.splitlines() for i, comment in enumerate(comments[1:]): x_comment = soup.new_tag('comment', num=unicode(i + 1)) x_comment.append(comment) x_verse.append(x_comment) x_chapter.append(x_verse) with codecs.open('./xml/{}'.format(filename.replace('.txt', '.xml')), 'w', 'utf-8') as outfile: outfile.write(unicode(soup.prettify()))
def produce_parsed_data(filename):
    """Parse *filename* into a section-aligned jagged array.

    Reads the file twice: once to build the raw 3-level jagged array and once
    to grab the section headers, then pads missing sections so the array
    indices line up with the section numbers.
    """
    with codecs.open(filename, 'r', 'utf-8') as datafile:
        raw_ja = util.file_to_ja(3, datafile, (m_pattern, comment_pattern), nothing)
        datafile.seek(0)  # rewind for the second pass over the same handle
        section_names = util.grab_section_names(m_pattern, datafile, 1)
    section_numbers = [int(util.getGematria(name)) for name in section_names]
    aligned = util.simple_to_complex(section_numbers, raw_ja.array())
    return util.convert_dict_to_array(aligned)
def _collect_hebrew_segments(self, soup):
    """Collect the Hebrew <p> segments belonging to this chapter.

    Walks every LMH-classed paragraph, turns collection on when the title
    paragraph for ``self.number`` is seen and off at the next chapter title,
    then merges paragraphs that were split mid-sentence and strips all markup.
    Sets ``self.hebrew_segments`` (list of plain-text strings).

    :param soup: parsed document (BeautifulSoup).
    """
    assert isinstance(soup, BeautifulSoup)
    he_reg = re.compile(u'^LMH')
    all_he_ps = soup.find_all('p', attrs={'class': he_reg})
    segments = []
    started = False
    # Matches a chapter-title line; the 'chapter' group is the Hebrew numeral.
    chapter_reg = re.compile(ur'''\u05dc\u05d9\u05e7\u05d5\u05d8\u05d9 \u05de\u05d5\u05d4\u05e8[\u05f4"]\u05df\s(\u05ea\u05e0\u05d9\u05e0\u05d0\s)?\u05e1\u05d9\u05de\u05df\s(?P<chapter>[\u05d0-\u05ea"]{1,4})''')
    for he_p in all_he_ps:
        if he_p['class'] == u'LMH-styles_LMH-title':
            if not he_p.string:
                raise AssertionError
            chapter_match = chapter_reg.match(he_p.string)
            if chapter_match:
                if getGematria(chapter_match.group('chapter')) == self.number:
                    started = True  # entering our chapter
                elif started:
                    break  # next chapter's title -> done collecting
        elif started:
            # NOTE(review): 'Rashbam'-classed paragraphs are deliberately
            # skipped — presumably a different commentary interleaved here.
            if re.search(u'Rashbam', he_p['class']):
                continue
            segments.append(he_p)
        else:
            continue
    # if current segment ends on a Hebrew char, combine with the next segment
    bad_indices = []
    for i, (cur_segment, next_segment) in enumerate(zip(segments, segments[1:])):
        segment_text = cur_segment.text
        stripped_text = re.sub(u"[\u05b0-\u05C7]", u'', segment_text)  # strip nikkud
        if re.search(u'[\u05d0-\u05ea]\s*$', stripped_text):
            # merge this segment into this one
            bad_indices.append(i)
            # Flatten cur_segment's children, fold it into the start of
            # next_segment, then dissolve the now-empty wrapper tag.
            for child in cur_segment.find_all(True):
                child.unwrap()
            cur_segment.string = u'{} '.format(u' '.join(cur_segment.contents))
            next_segment.insert(0, cur_segment)
            cur_segment.unwrap()
        elif not segment_text:
            bad_indices.append(i)  # drop empty paragraphs
    # Pop from the end so earlier indices stay valid.
    for i in reversed(bad_indices):
        segments.pop(i)
    assert len(segments) > 0
    self.hebrew_segments = [bleach.clean(s, tags=[], attributes={}, strip=True) for s in segments]
def test_expression(pattern): """ test a regular expression object to see if how well it grabs all "springs" and "rivers" :param pattern: regular expression string :return: List of missed "rivers", expressed as a tuple: (spring, river) """ regex = re.compile(pattern) split = get_text().splitlines() matches = filter(None, [regex.search(match) for match in split]) issues = [] print u'last_match: {}'.format(matches[-1].group()) expected_spring, expected_river = 1, 1 for match in matches: spring, river = getGematria(match.group(1)), getGematria(match.group(3)) if spring > expected_spring: expected_river = 1 expected_spring = spring if river > expected_river: while river > expected_river: issues.append((expected_spring, expected_river)) expected_river += 1 expected_river += 1 return issues
def check_chapters(): with codecs.open('Minchat_Chinuch.txt', 'r', 'utf-8') as chinuch: test = TagTester(u'@30', chinuch, u'@30מצוה ([\u05d0-\u05ea"]{1,5})') index = 1 for header in test.grab_each_header(capture_group=1): header = header.replace(u'"', u'') count = util.getGematria(header) if count != index: print util.numToHeb(index) index = count index += 1
def produce_parsed_data(filename):
    """Parse *filename* into a section-aligned jagged array.

    Same two-pass scheme as elsewhere in the project: build the raw jagged
    array, rewind, grab the section headers, then pad the array so its
    indices match the section numbers.
    """
    with codecs.open(filename, 'r', 'utf-8') as datafile:
        raw_ja = util.file_to_ja([[[]]], datafile, (m_pattern, comment_pattern), nothing)
        datafile.seek(0)  # rewind for the header pass
        section_names = util.grab_section_names(m_pattern, datafile, 1)
    section_numbers = [int(util.getGematria(name)) for name in section_names]
    aligned = util.simple_to_complex(section_numbers, raw_ja.array())
    return util.convert_dict_to_array(aligned)
def fix_file(filepath, start_siman, test_mode=False): output_list = [] with codecs.open(filepath, 'r', 'utf-8') as fp: lines = fp.readlines() counter = 0 for line in lines: match = re.match(u'^@11([\u05d0-\u05ea]{1,3})$', line) if match and getGematria(match.group(1)) == 1: output_list.append(u'@00{}\n'.format( numToHeb(counter + start_siman))) counter += 1 output_list.append(line) if test_mode: filepath = re.sub(ur'\.txt$', u'_test.txt', filepath) with codecs.open(filepath, 'w', 'utf-8') as fp: fp.writelines(output_list)
def align_boaz_chapters(source_file, simple_array):
    """
    Boaz does not guarantee text for every chapter. Using the util library,
    this method will pad the parsed text with empty sections as necessary to
    accurately represent the data.
    :param source_file: File from which to derive chapter numbers
    :param simple_array: A "naive" parse of the data structured as a nested
    list.
    :return: Nested array, with proper padding to account for empty chapters.
    """
    # grab each chapter number from the source file
    chapter_names = util.grab_section_names(u'@00פרק ([\u05d0-\u05ea]{1,2})', source_file, 1)
    chapters = [util.getGematria(name) for name in chapter_names]
    return util.convert_dict_to_array(util.simple_to_complex(chapters, simple_array))
def parser(name):
    """Parse ``<name>.txt`` (tagged with @00 chapter and @22 mishnah markers)
    into a jagged array.

    NOTE(review): the body visible here ends after the @22 branch —
    ``parsed_text``/``links``/``comment`` are never used or returned, so this
    function appears truncated in this view; the remainder presumably handles
    comment lines and returns the parsed structure.
    """
    with codecs.open('{}.txt'.format(name), 'r', 'utf-8') as infile:
        lines = infile.readlines()
    parsed_text = JaggedArray([[[]]])
    links = []
    # 0-based running positions: chapter, mishnah and comment counters.
    chapter, mishnah, comment = -1, -1, -1
    for line in lines:
        if re.match(ur'@00\u05e4\u05e8\u05e7', line) is not None:
            # New chapter: advance chapter, restart comment count.
            chapter += 1
            comment = -1
            continue
        elif re.match(ur'@22', line) is not None:
            # Mishnah number is explicit in the tag (converted to 0-based).
            mishnah = getGematria(re.match(ur'@22([\u05d0-\u05ea]{1,2})', line).group(1)) - 1
            comment = -1
            continue
def parse_and_post(file_name):
    """Parse a perek/mishnah tagged file into a 3-level nested list and post it.

    @00 lines open a new perek, @22 lines open a new mishnah (gaps in the
    mishnah numbering are padded with empty lists), and every other line is
    split on '~' into cleaned comment segments.

    :param file_name: path to the utf-8 source file.
    :return: nested list [perek][mishnah][comment].
    """
    mishna_number_regex = regex.compile(u'([\u05d0-\u05ea]{1,3})')
    book, perek_level_list, mishna_level_list = [], [], []
    new_perek, first_perek = True, True
    last_mishna = 0
    with codecs.open(file_name, 'r', 'utf-8') as the_file:
        for each_line in the_file:
            if "@00" in each_line:
                if first_perek:
                    first_perek = False
                else:
                    # Close out the previous perek before starting a new one.
                    perek_level_list.append(mishna_level_list)
                    book.append(perek_level_list)
                    perek_level_list, mishna_level_list = [], []
                    new_perek = True
            elif "@22" in each_line:
                if new_perek:
                    # First mishnah of the perek: nothing to flush yet.
                    new_perek = False
                    last_mishna = 1
                else:
                    perek_level_list.append(mishna_level_list)
                    mishna_level_list = []
                    number_match = mishna_number_regex.search(each_line)
                    mishna_number = util.getGematria(number_match.group(1))
                    # Pad one empty mishnah per skipped number.
                    for _ in range(mishna_number - last_mishna - 1):
                        perek_level_list.append([])
                    last_mishna = mishna_number
            else:
                for segment in each_line.split(u'~'):
                    segment = segment.strip()
                    if segment:
                        mishna_level_list.append(clean_up_string(segment))
    book.append(perek_level_list)
    post_the_text(book)
    return book
def parse():
    """Parse gra_on_avot.txt into a 3-level nested list [perek][mishnah][comment].

    @00 lines open a new perek, @22 lines open a new mishnah (gaps in the
    mishnah numbering are padded with empty lists), and every other line is a
    cleaned comment appended to the current mishnah.
    """
    mishna_number_regex = regex.compile(u'([\u05d0-\u05ea]{1,3})')
    book, perek_level_list, mishna_level_list = [], [], []
    new_perek, first_perek = True, True
    last_mishna = 0
    with codecs.open('gra_on_avot.txt', 'r', 'utf-8') as the_file:
        for each_line in the_file:
            if "@00" in each_line:
                if first_perek:
                    first_perek = False
                else:
                    # Close out the previous perek before starting a new one.
                    perek_level_list.append(mishna_level_list)
                    book.append(perek_level_list)
                    perek_level_list, mishna_level_list = [], []
                    new_perek = True
            elif "@22" in each_line:
                if new_perek:
                    # First mishnah of the perek: nothing to flush yet.
                    new_perek = False
                    last_mishna = 1
                else:
                    perek_level_list.append(mishna_level_list)
                    mishna_level_list = []
                    number_match = mishna_number_regex.search(each_line)
                    mishna_number = util.getGematria(number_match.group(1))
                    # Pad one empty mishnah per skipped number.
                    for _ in range(mishna_number - last_mishna - 1):
                        perek_level_list.append([])
                    last_mishna = mishna_number
            else:
                mishna_level_list.append(clean_up_string(each_line))
    book.append(perek_level_list)
    return book
def regs_devide(lines, regs, eof=None):
    """Divide *lines* into a jagged array of simanim using the first regex in
    *regs* as the section delimiter.

    Lines are accumulated into ``letter``; when a delimiter (or the optional
    *eof* pattern) is hit, the accumulated text joins ``siman``, and when the
    delimiter's gematria restarts at 1 (or eof fires) the siman joins ``ja``.

    :param lines: iterable of text lines.
    :param regs: list of delimiter regexes; only regs[0] is used here.
    :param eof: optional end-of-section regex.
    :return: nested list of simanim (each a list of joined letter strings).
    """
    reg = regs[0]
    ja = []
    letter = []   # lines of the current letter, joined on flush
    siman = []    # letters of the current siman
    for line in lines:
        comb_letter = ' '.join(letter)
        if re.search(reg, line) or (eof and re.search(eof, line)):
            # Delimiter reached: flush the accumulated letter into the siman.
            siman.append(comb_letter)
            letter = []
            if re.search(reg, line):
                gim = getGematria(re.search(reg, line).group(1))
            # NOTE(review): `gim` may be unbound here if the very first
            # delimiter line matches only `eof` — presumably input always
            # starts with a `reg` match; confirm against callers.
            if gim == 1 or (eof and re.search(eof, line)):
                # Numbering restarted (or eof): close out the siman.
                ja.append(siman)
                if siman == ['']:
                    ja.pop()  # drop the empty siman produced by the first flush
                siman = []
        letter.append(line)
    # NOTE(review): without an `eof` match, the trailing letter/siman are
    # never flushed into `ja` — apparently relied upon by callers.
    return ja
def _set_he_section_transitions(self): transition_list = [] current_segment = 1 for seg_num, segment in enumerate(self._hebrew_segments): match = re.match(u'^([\u05d0-\u05d8]|[\u05d9-\u05dc][\u05d0-\u05d8]?|\u05d8[\u05d5\u05d6])\.\s', segment) if not match: continue next_segment = getGematria(match.group(1)) if next_segment == 1: pass elif next_segment - current_segment != 1: print "Bad hebrew section transition found in chapter {}".format(self.number) raise AssertionError else: transition_list.append(seg_num) current_segment = next_segment self._he_section_transitions = tuple(transition_list)