Example #1
def parse_hagahot_by_letter(filename):
    def cleaner(my_text):
        replace_dict = {u'@11\([\u05d0-\u05ea]{0,3}\)': u''}
        new = []
        for line in my_text:
            line = multiple_replace(line, replace_dict, using_regex=True)
            new.append(line)
        return new

    regs = [ur'@11\((?P<gim>[\u05d0-\u05ea]{1,3})\)']
    with codecs.open(filename, 'r', 'utf-8') as fp:
        lines = fp.readlines()
    starting = None
    # find the blank line that ends the legend; the body starts right after it
    # then drop lines containing $ and blank lines, splitting the rest on the @11(...) markers
    cleaned = []
    for line_num, line in enumerate(lines):
        if line == u'\n':
            starting = line_num + 1
            break
    for line_num, line in enumerate(lines[starting:]):
        if not re.search(u'\$', line) and not line.isspace():
            line = re.split(u'(@11\([\u05d0-\u05ea]{0,3}\))', line)
            if isinstance(line, basestring):
                cleaned.append(line)
            else:
                [cleaned.append(st.strip()) for st in line if st]
    try:
        ja = file_to_ja_g(2, cleaned, regs, cleaner, gimatria=True,  grab_all=[True, True], group_name='gim').array()
    except AttributeError:
        print 'there are more regs than levels...'
    new_ja = regs_devide(cleaned, regs, u'(נשלם מלאכת שבעת הימים)')
    ja_to_xml(new_ja, ['siman', 'letter', 'segments'], 'hagahot_letters.xml')

    return new_ja
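
A quick illustration of the re.split step above: because the @11(...) pattern is wrapped in a capturing group, the markers are kept as their own list items, and the "if st" / .strip() filtering then discards empty strings and trims whitespace. The sample line below is made up for illustration.

# -*- coding: utf-8 -*-
import re

sample = u'@11(א) דברי ההגהה כאן'  # hypothetical input line
parts = [st.strip() for st in re.split(u'(@11\([\u05d0-\u05ea]{0,3}\))', sample) if st]
print parts  # the @11(א) marker and the text that follows it end up as two separate items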
Example #2
def parse_hagahot_by_letter(filename):
    def cleaner(my_text):
        replace_dict = {u'@11\([\u05d0-\u05ea]{0,3}\)': u'', u'@77': u''}
        new = []
        for line in my_text:
            line = multiple_replace(line, replace_dict, using_regex=True)
            new.append(line)
        return new

    regs = [ur'@11\((?P<gim>[\u05d0-\u05ea]{1,3})\)']
    with codecs.open(filename, 'r', 'utf-8') as fp:
        lines = fp.readlines()
    starting = None
    # find the blank line that ends the legend; the body starts right after it
    # then drop lines containing $ and blank lines, splitting the rest on the @11(...) markers
    cleaned = []
    for line_num, line in enumerate(lines):
        if line == u'\n':
            starting = line_num + 1
            break
    for line_num, line in enumerate(lines[starting:]):
        if not re.search(u'\$', line) and not line.isspace():
            line = re.split(u'(@11\([\u05d0-\u05ea]{0,3}\))', line)
            if isinstance(line, basestring):
                cleaned.append(line)
            else:
                [cleaned.append(st.strip()) for st in line if st]
    try:
        ja = file_to_ja_g(2, cleaned, regs, cleaner, gimatria=True,  grab_all=[True, True], group_name='gim').array()
    except AttributeError:
        print 'there are more regs than levels...'
    new_ja = regs_devide(cleaned, regs, u'(נשלם מלאכת שבעת הימים)')
    ja_to_xml(new_ja, ['siman', 'letter', 'segments'], 'hagahot_letters.xml')

    return new_ja
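
multiple_replace comes from the project's utility module and is not shown on this page; the cleaners above only rely on it applying every pattern-to-replacement pair across a line. A minimal sketch of that behaviour, under that assumption, could look like this:

import re

def multiple_replace(text, replace_dict, using_regex=False):
    # apply each pattern -> replacement pair in turn; with using_regex=True the
    # keys are treated as regular expressions, as in the cleaners above
    for pattern, repl in replace_dict.items():
        if using_regex:
            text = re.sub(pattern, repl, text)
        else:
            text = text.replace(pattern, repl)
    return text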
Example #3
def parse_he(filename):
    """
    :returns a dictionary, key: name of book, value: JaggedArray obj of the ja for the book
    """
    replace_dict = {
        u'@(11|44|99)': u'<b>',
        u'@(33|55)': u'</b>',
        ur'@22\(([\u05d0-\u05ea]{1,3})\)': u'',
        ur'@(22|77)': u''
    }

    def cleaner(my_text):
        new = []
        for line in my_text:
            line = multiple_replace(line, replace_dict, using_regex=True)
            new.append(line)

        return new

    regs = [
        ur'@00(?P<gim>)', ur'@02(?P<gim>[\u05d0-\u05ea]{1,3})',
        ur'@22\((?P<gim>[\u05d0-\u05ea]{1,3})\)'
    ]  # ,ur'@77'
    with codecs.open(filename, 'r', 'utf-8') as fp:
        lines = fp.readlines()
    starting = None
    # find the blank line that ends the legend, then process the body
    # skip the @01 Parasha header lines and blank lines
    cleaned = []
    dh_list = []
    for line_num, line in enumerate(lines):
        if line == u'\n':
            starting = line_num + 1
        if starting and not re.search(u'@01', line) and not line.isspace():
            dh_recognize = re.compile(ur'@11(.*?)@33')
            if dh_recognize.search(line):
                dh_list.append(dh_recognize.search(line).group(1))
            line = re.sub(dh_recognize, ur'#<b>\1</b>', line)
            line = re.split(ur'#', line)
            if isinstance(line, basestring):
                cleaned.append(line)
            else:
                cleaned.extend(line)

    tt_ja = file_to_ja_g(4,
                         cleaned,
                         regs,
                         cleaner,
                         gimatria=True,
                         group_name='gim',
                         grab_all=[False, False, False]).array()
    Pentateuch = ['Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy']
    parsed_texts = dict({book: ja for book, ja in zip(Pentateuch, tt_ja)})

    for book, ja in zip(Pentateuch, tt_ja):
        ja_to_xml(ja, ['perek', 'pasuk', 'comment'], 'tur_{}.xml'.format(book))

    # for str in  dh_list:
    #     print str
    return parsed_texts
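
A minimal usage sketch for parse_he, assuming a source file in the @-tagged format the regexes above expect (the filename below is illustrative only):

parsed = parse_he('tur_on_torah.txt')  # hypothetical filename
for book in ['Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy']:
    ja = parsed[book]  # nested lists: perek -> pasuk -> comment
    print book, len(ja)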
Example #4
def test_file_to_ja_g():
    data = StringIO('''@22א\nfoo\nbar\n@22ג\nhello\nworld''')
    ja = util.file_to_ja_g(2, data, [r'@22(?P<gim>[\u05d0-\u05ea])'], lambda x: [c.rstrip() for c in x], True)
    assert ja.array() == [
        ['foo', 'bar'],
        [],
        ['hello', 'world']
    ]
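
The empty list in the middle of the expected output is the point of gimatria=True: the letters א and ג have the numeric values 1 and 3, so the ב slot between them is padded with an empty list. A standalone sketch of that letter-to-number mapping (not the project's implementation, just the assumption the test relies on):

# -*- coding: utf-8 -*-
GIMATRIA_VALUES = dict(zip(u'אבגדהוזחטיכלמנסעפצקרשת',
                           [1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                            20, 30, 40, 50, 60, 70, 80, 90,
                            100, 200, 300, 400]))

def gimatria(letters):
    # sum the values of each letter, e.g. u'יא' -> 11
    return sum(GIMATRIA_VALUES[c] for c in letters)

print gimatria(u'א'), gimatria(u'ג')  # -> 1 3, matching the filled slots in the test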
Example #5
def parse_semak(filename):
    def cleaner(my_text):
        replace_dict = {
            u'@11(.*?)@12': ur'<b>\1</b>',
            u'@33(.*?)@34': ur'<b>\1</b>',
            u'@66(.*?)@67': ur'\1',
            u"@44": u""
        }

        new = []
        for line in my_text:
            line = multiple_replace(line, replace_dict, using_regex=True)
            new.append(line)
        return new

    regs = [ur'@22(?P<gim>[\u05d0-\u05ea]{1,3})']  #, u'@(11|23|33)(?P<gim>)']
    with codecs.open(filename, 'r', 'utf-8') as fp:
        lines = fp.readlines()
    starting = None
    # find the blank line that ends the legend; the body starts right after it
    # record the @00 day markers in alt_day and skip those lines
    cleaned = []
    letter_section = []
    alt_day = []
    for line_num, line in enumerate(lines):
        if line == u'\n':
            starting = line_num + 1
            break
    for line_num, line in enumerate(lines[starting:]):
        if re.search(u'@00', line):
            alt_day.append(line_num)
        if not re.search(u'@00', line) and not line.isspace():
            if re.search(u'@22', line):
                line = re.split(u'(@22[\u05d0-\u05ea]{1,3})', line)
                if isinstance(line, basestring):
                    cleaned.append(line)
                else:
                    [cleaned.append(st) for st in line if st]
            else:
                cleaned.append(line)
    alt_day.append(len(lines))
    print alt_day
    try:
        smk_ja = file_to_ja_g(2,
                              cleaned,
                              regs,
                              cleaner,
                              gimatria=True,
                              grab_all=[False, True, True],
                              group_name='gim').array()
    except AttributeError:
        print 'there are more regs than levels...'
    ja_to_xml(smk_ja, ['letter', 'segments'], 'smk.xml')

    return smk_ja
Example #6
def parse_smk(filename):
    '''
    :param filename: smk source txt file
    :return: JA obj smk parsed to depth 2 [siman, segment] (including a citation segment at the top of each siman)
    '''

    def cleaner(my_text):
        replace_dict = {u'@11(.*?)@12': ur'<b>\1</b>', u'@33(.*?)@34': ur'<b>\1</b>',
                        u'@66(.*?)@67': ur'\1'}  # , u'@55[\u05d0-\u05ea]{1,3}' : u'<i-tags = >'}
        new = []
        for line in my_text:
            line = multiple_replace(line, replace_dict, using_regex=True)
            new.append(line)
        return new

    regs = [ur'@22(?P<gim>[\u05d0-\u05ea]{1,3})']  # , u'@(11|23|33)(?P<gim>)']
    with codecs.open(filename, 'r', 'utf-8') as fp:
        lines = fp.readlines()
    starting = None
    # find the blank line that ends the legend; the body starts right after it
    # record the @00 day markers in alt_day and skip those lines
    cleaned = []
    letter_section = []
    alt_day = []
    for line_num, line in enumerate(lines):
        if line == u'\n':
            starting = line_num + 1
            break
    for line_num, line in enumerate(lines[starting:]):
        if re.search(u'@00', line):
            alt_day.append(line_num)
        if not re.search(u'@00', line) and not line.isspace():
            if re.search(u'@22', line):
                line = re.split(u'(@22[\u05d0-\u05ea]{1,3})', line)
                if isinstance(line, basestring):
                    cleaned.append(line)
                else:
                    [cleaned.append(st) for st in line if st]
            else:
                cleaned.append(line)
    alt_day.append(len(lines))
    print alt_day
    try:
        smk_ja = file_to_ja_g(2, cleaned, regs, cleaner, gimatria=True, grab_all=[False, True, True],
                              group_name='gim').array()
    except AttributeError:
        print 'there are more regs than levels...'

    ja_to_xml(smk_ja, ['letter', 'segments'], 'smk.xml')

    return JaggedArray(smk_ja)
Example #7
def parse_Raph(filename):
    def cleaner(my_text):
        replace_dict = {
            u'@(?:11|77)[\u05d0-\u05ea]{0,3}': u'',
            u'@33': u''
        }  #{u'@11(.*?)@12': ur'<b>\1</b>', u'@33(.*?)@34': ur'<b>\1</b>', u'@33(.*?)@34': ur'<b>\1</b>', u'@66(.*?)@67': ur'\1'}
        new = []
        for line in my_text:
            line = multiple_replace(line, replace_dict, using_regex=True)
            new.append(line)
        return new

    regs = [
        ur'@77(?P<gim>[\u05d0-\u05ea]{0,3})',
        ur'@11(?P<gim>[\u05d0-\u05ea]{1,3})'
    ]  # (?P<gim>[\u05d0-\u05ea]{1,3})
    with codecs.open(filename, 'r', 'utf-8') as fp:
        lines = fp.readlines()
    starting = None
    # find the blank line that ends the legend; the body starts right after it
    # skip the @00 header lines and blank lines, splitting the rest on the @77/@11 markers
    cleaned = []
    letter_section = []
    for line_num, line in enumerate(lines):
        if line == u'\n':
            starting = line_num + 1
            break
    for line_num, line in enumerate(lines[starting:]):
        if not re.search(u'@00', line) and not line.isspace():
            line = re.split(u'(@(?:77|11)[\u05d0-\u05ea]{0,3})', line)
            if isinstance(line, basestring):
                cleaned.append(line)
            else:
                [cleaned.append(st.strip()) for st in line
                 if st]  #(st and not re.search(u'@(77)', st))]
            # else:
            #     cleaned.append(line)
    try:
        ja = file_to_ja_g(3,
                          cleaned,
                          regs,
                          cleaner,
                          gimatria=True,
                          grab_all=[False, False, True],
                          group_name='gim').array()
    except AttributeError:
        print 'there are more regs than levels...'

    ja_to_xml(ja, ['page', 'letter', 'segments'], 'raph.xml')

    return ja
Example #8
def parse_hagahot(filename, smk_ja, raph_ja):
    '''

    :param filename: hagahot source txt file
    :param smk_ja: smk JA obj [siman, segment]
    :param raph_ja: raph JA obj [siman, letter]
    :return: JA obj
    '''

    ja_hagahot = []
    def cleaner(my_text):
        #todo: deal with @44 and @00 (@00 maybe should be only in smk base text? - ask Shmuel)
        replace_dict = {u'@11\([\u05d0-\u05ea]{0,3}\)': u'', u'@(33|77|88|99)': u'', u'@55(.*?)@66': ur'<b>\1</b>'}
        new = []
        for line in my_text:
            line = multiple_replace(line, replace_dict, using_regex=True)
            new.append(line)
        return new

    regs = [ur'@11\((?P<gim>[\u05d0-\u05ea]{1,3})\)']
    with codecs.open(filename, 'r', 'utf-8') as fp:
        lines = fp.readlines()
    starting = None
    # find the blank line that ends the legend; the body starts right after it
    # skip the @00 header lines, splitting the rest on the @11(...) markers
    cleaned = []
    for line_num, line in enumerate(lines):
        if line == u'\n':
            starting = line_num + 1
            break
    for line_num, line in enumerate(lines[starting:]):
        if not re.search(u'@00', line):
            line = re.split(u'(@11\([\u05d0-\u05ea]{0,3}\))', line)
            if isinstance(line, basestring) and line != u'':
                cleaned.append(line)
            else:
                [cleaned.append(st.strip()) for st in line if st]
    try:
        ja = file_to_ja_g(2, cleaned, regs, cleaner, gimatria=True, grab_all=[False], group_name='gim').array()
    except AttributeError:
        print 'there are more regs than levels...'
    ja_to_xml(ja, ['siman', 'letter'], 'hagahot_letters_25.xml') #, 'segments'

    # for hghds in
    return JaggedArray(ja_hagahot)
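
Taken together with the earlier examples, the docstring suggests these parsers are meant to be chained; a hedged driver sketch (the filenames are illustrative, and parse_hagahot above is still a work in progress) would be:

smk_ja = parse_smk('semak.txt')      # JaggedArray of [siman, segment]
raph_ja = parse_Raph('raph.txt')     # nested lists of [page, letter, segments]
hagahot_ja = parse_hagahot('hagahot.txt', smk_ja, raph_ja)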
Example #9
def parse_Raph(filename):
    def cleaner(my_text):
        replace_dict = {u'@(?:11|77)[\u05d0-\u05ea]{0,3}': u'', u'@33': u''}#{u'@11(.*?)@12': ur'<b>\1</b>', u'@33(.*?)@34': ur'<b>\1</b>', u'@33(.*?)@34': ur'<b>\1</b>', u'@66(.*?)@67': ur'\1'}
        new = []
        for line in my_text:
            line = multiple_replace(line,replace_dict,using_regex=True)
            new.append(line)
        return new

    regs = [ur'@77(?P<gim>[\u05d0-\u05ea]{0,3})', ur'@11(?P<gim>[\u05d0-\u05ea]{1,3})']  # (?P<gim>[\u05d0-\u05ea]{1,3})
    with codecs.open(filename, 'r', 'utf-8') as fp:
        lines = fp.readlines()
    starting = None
    # find the blank line that ends the legend; the body starts right after it
    # skip the @00 header lines and blank lines, splitting the rest on the @77/@11 markers
    cleaned = []
    letter_section = []
    for line_num, line in enumerate(lines):
        if line == u'\n':
            starting = line_num + 1
            break
    for line_num, line in enumerate(lines[starting:]):
        if not re.search(u'@00', line) and not line.isspace():
            line = re.split(u'(@(?:77|11)[\u05d0-\u05ea]{0,3})', line)
            if isinstance(line, basestring):
                cleaned.append(line)
            else:
                [cleaned.append(st.strip()) for st in line if st]#(st and not re.search(u'@(77)', st))]
            # else:
            #     cleaned.append(line)
    try:
        ja = file_to_ja_g(3, cleaned, regs, cleaner, gimatria=True,  grab_all=[False, False, True], group_name='gim').array()
    except AttributeError:
        print 'there are more regs than levels...'

    ja_to_xml(ja, ['page', 'letter', 'segments'], 'raph.xml')

    return ja
Example #10
def clean_segments(segments):
    return [clean_segment(s) for s in segments if clean_segment(s)]

processed = {}
sefarim = set()
regexes = [chapter_regex, halacha_regex]
for the_file in [x for x in os.listdir(folder) if "xml" not in x]:
    name = the_file.replace(".txt", "").split("-")[2]
    sefer = the_file.split("-")[1]

    if sefer == u"ספר קרבנות":
        sefer = u"ספר קורבנות"
    if sefer == u"ספר קנין":
        sefer = u"ספר קניין"

    sefarim.add(sefer)
    if name in processed:
        # Skip second version of Nashim
        continue
    file_path = os.path.join(folder, the_file)
    with codecs.open(file_path, "r", "utf-8") as infile:
        j = file_to_ja_g(3, infile, regexes, clean_segments, gimatria=True)
        processed[name] = {"cat": sefer, "text": j.array()}
        ja_to_xml(j.array(), ["Chapter", "Halacha", "Comment"], file_path.replace("txt", "xml"))

processed[u"הלכות תפילה וברכת כהנים"] = {
    "cat": processed[u"הלכות תפלה"]["cat"],
    "text": processed[u"הלכות תפלה"]["text"][:13] + processed[u"הלכות נשיאת כפים"]["text"][13:]
}
del processed[u"הלכות תפלה"]
del processed[u"הלכות נשיאת כפים"]

processed[u"הלכות שופר וסוכה ולולב"] = {
    "cat": processed[u"הלכות שופר"]["cat"],
    "text": processed[u"הלכות שופר"]["text"][:3] + processed[u"הלכות סוכה"]["text"][3:6] + processed[u"הלכות לולב"]["text"][6:]
}
del processed[u"הלכות שופר"]
del processed[u"הלכות סוכה"]
del processed[u"הלכות לולב"]

processed[u"הלכות מגילה וחנוכה"] = {
    "cat": processed[u"הלכות מגילה"]["cat"],