Пример #1
0
def run_parser(text):
    parsed_text=[]
    cur_chapter = 1
    cur_verse = 1
    text=re.sub(ur'@20.{6}','',text)
    chapters = re.split(ur'@10([^@]*)', text)
    for chapter_num, chapter in zip(chapters[1::2], chapters[2::2]):
        if chapter_num.strip() != '':
            chapter_num = re.sub("[\(,\)]", "", chapter_num)
            cur_chapter = hebrew.heb_string_to_int(chapter_num.strip())
            parsed_chapter = []
            expand_list_assign(parsed_text, cur_chapter - 1, parsed_chapter)
            psukim = re.split(ur'@11([^@]*)', chapter)
            for pasuk_num, pasuk in zip(psukim[1::2], psukim[2::2]):
                 if pasuk.strip() != '':
                    parsed_verse = []
                    pasuk_num = re.sub("[\(,\)]", "", pasuk_num)
                    cur_verse = hebrew.heb_string_to_int(pasuk_num.strip())
                    expand_list_assign(parsed_chapter, cur_verse - 1, parsed_verse)
                    DH = pasuk.split(ur'@12')[1:]
                    for dibur_hamatchil in DH:
                        comments = dibur_hamatchil.split('@33')[1:]
                        comment1=""
                        for comment in comments:
                            if comment.strip() != '':
                                key = re.split(ur'@00', comment)
                                if len(key) > 1:
                                    comment = '<b>' + key[0] + '</b>' + key[1]
                                    comment1 = comment1 + comment
                        parsed_verse.append(comment1)
    return parsed_text
def run_parser(text):
    parasha_num = 0
    cur_chapter = 1
    cur_verse = 1
    chapters = re.split(ur'@09([^@]*)', text)
    parashot = [[], [], [], [], []]
    for chapter_num, chapter in zip(chapters[1::2], chapters[2::2]):
        if chapter_num.strip() != '':
            cur_chapter = hebrew.heb_string_to_int(chapter_num.strip())
            names = ['Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy']
            parsed_chapter = []
            if cur_chapter == 1:
                parasha_num += 1
                parasha_name = names[parasha_num - 1]
            expand_list_assign(parashot[parasha_num - 1], cur_chapter - 1, parsed_chapter)
            psukim = re.split(ur'@97([^@]*)', chapter)
            for pasuk_num, pasuk in zip(psukim[1::2], psukim[2::2]):
                if pasuk.strip() != '':
                    parsed_verse = []
                    pasuk_num = re.sub("[\(,\)]", "", pasuk_num)
                    cur_verse = hebrew.heb_string_to_int(pasuk_num.strip())
                    expand_list_assign(parsed_chapter, cur_verse - 1, parsed_verse)
                comments = pasuk.split('@98')[1:]
                for comment in comments:
                    if comment.strip() != '':
                        key = re.split(ur'@87', comment)
                        if len(key) > 1:
                            comment = '<b>' + key[0] + '</b>' + key[1]
                        parsed_verse.append(comment)
    return parashot
Пример #3
0
def run_parser(text):
    parsed_text = []
    cur_chapter = 1
    cur_verse = 1
    text = re.sub(ur"@20.{6}", "", text)
    chapters = re.split(ur"@10([^@]*)", text)
    for chapter_num, chapter in zip(chapters[1::2], chapters[2::2]):
        if chapter_num.strip() != "":
            chapter_num = re.sub("[\(,\)]", "", chapter_num)
            cur_chapter = hebrew.heb_string_to_int(chapter_num.strip())
            parsed_chapter = []
            expand_list_assign(parsed_text, cur_chapter - 1, parsed_chapter)
            psukim = re.split(ur"@11([^@]*)", chapter)
            for pasuk_num, pasuk in zip(psukim[1::2], psukim[2::2]):
                if pasuk.strip() != "":
                    parsed_verse = []
                    pasuk_num = re.sub("[\(,\)]", "", pasuk_num)
                    cur_verse = hebrew.heb_string_to_int(pasuk_num.strip())
                    expand_list_assign(parsed_chapter, cur_verse - 1, parsed_verse)
                comments = pasuk.split("@33")[1:]
                for comment in comments:
                    if comment.strip() != "":
                        key = re.split(ur"@00", comment)
                        if len(key) > 1:
                            comment = "<b>" + key[0] + "</b>" + key[1]
                        parsed_verse.append(comment)
    return parsed_text
Пример #4
0
def run_parser():
    """Parse source/Radak_on_Genesis.txt into chapter -> verse -> comment lists.

    Reads the file as UTF-8, normalises some markup, splits on "@22"
    (chapter labels) and on "@44(<one or two Hebrew letters>)" (verse
    labels), and collects every non-blank "@55"-delimited comment. The
    result is passed to pretty_print and save_parsed_text (both defined
    elsewhere); nothing is returned.
    """
    print "running parser"
    parsed_text = []
    cur_chapter = 1
    cur_verse = 1
    #regex = re.compile(ur'@11(.*)@22',re.UNICODE)
    with open("source/Radak_on_Genesis.txt", 'r') as filep:
        file_text = filep.read()
        ucd_text = unicode(file_text, 'utf-8').strip()

    # Normalise markup before splitting:
    #  - "@11...@33" becomes "@55..." so the enclosed text is split off as a comment;
    #  - "@00" runs up to the end of the line are deleted;
    #  - "@44(שם)@55" loses its tags so it is not mistaken for a verse label.
    ucd_text = re.sub(ur'@11(.*?)@33', ur'@55\1', ucd_text)
    ucd_text = re.sub(ur'@00([^@]*)\n', '', ucd_text)
    ucd_text = ucd_text.replace(u'@44(שם)@55', u'(שם)')
    # Split according to chapter. The capture group keeps the chapter letters in the results.
    chapters = re.split(ur'@22([^@]*)', ucd_text)
    for chapter_num, chapter in zip(chapters[1::2], chapters[2::2]):
        if chapter_num.strip() != '':
            cur_chapter = hebrew.heb_string_to_int(chapter_num.strip())
            parsed_chapter = []
            expand_list_assign(parsed_text, cur_chapter - 1, parsed_chapter)
        # NOTE(review): the verse loop runs even when the chapter label is blank,
        # so verses then land in the previous parsed_chapter (or raise NameError
        # if no chapter has been seen yet) -- confirm this is intended.
        # Now split on verse numbers, capturing the verse numbers as well.
        verses = re.split(ur'@44\(([\u0590-\u05ea]{1,2})\)', chapter)
        for verse_num, verse in zip(verses[1::2], verses[2::2]):
            if verse_num.strip() != '':
                parsed_verse = []
                cur_verse = hebrew.heb_string_to_int(verse_num.strip())
                expand_list_assign(parsed_chapter, cur_verse - 1, parsed_verse)
            # Text before the first "@55" is discarded by the [1:] slice.
            comments = verse.split('@55')[1:]
            for comment in comments:
                if comment.strip() != '':
                    parsed_verse.append(comment)

    pretty_print(parsed_text)
    save_parsed_text(parsed_text)
Пример #5
0
def run_parser(text):
    parasha_num = 0
    cur_chapter = 1
    cur_verse = 1
    chapters = re.split(ur'@09([^@]*)', text)
    parashot = [[], [], [], [], []]
    for chapter_num, chapter in zip(chapters[1::2], chapters[2::2]):
        if chapter_num.strip() != '':
            cur_chapter = hebrew.heb_string_to_int(chapter_num.strip())
            names = [
                'Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy'
            ]
            parsed_chapter = []
            if cur_chapter == 1:
                parasha_num += 1
                parasha_name = names[parasha_num - 1]
            expand_list_assign(parashot[parasha_num - 1], cur_chapter - 1,
                               parsed_chapter)
            psukim = re.split(ur'@97([^@]*)', chapter)
            for pasuk_num, pasuk in zip(psukim[1::2], psukim[2::2]):
                if pasuk.strip() != '':
                    parsed_verse = []
                    pasuk_num = re.sub("[\(,\)]", "", pasuk_num)
                    cur_verse = hebrew.heb_string_to_int(pasuk_num.strip())
                    expand_list_assign(parsed_chapter, cur_verse - 1,
                                       parsed_verse)
                comments = pasuk.split('@98')[1:]
                for comment in comments:
                    if comment.strip() != '':
                        key = re.split(ur'@87', comment)
                        if len(key) > 1:
                            comment = '<b>' + key[0] + '</b>' + key[1]
                        parsed_verse.append(comment)
    return parashot
Пример #6
0
def search2(parsed, part):
    """Scan parsed text for "@44[...]@55....' references and record Talmud links.

    parsed is iterated as a list of lists of strings; part is inserted into
    the "Rosh on ..." reference string. For each reference the daf number and
    amud are derived from the bracketed text, matchobj (defined elsewhere) is
    asked to locate the quoted text, and on a positive line number a link is
    appended to the module-level links list. Nothing is returned.
    """
    for k, seif in enumerate(parsed):
        for i, pasuk in enumerate(seif):
            # group(1): text inside [...] or (...); group(2): text up to the next "."
            found = re.finditer(ur'@44[\[\(](.*?)[\]\)]@55(.*?)\.', pasuk)
            for find in found:
                daf = find.group(1)
                # Reference starting with "מנחות" and fewer than six words:
                # the daf token is the last word of a 2- or 3-word reference.
                if daf.strip().split(' ')[0] == u"מנחות" and len(
                        daf.strip().split(' ')) < 6:
                    if len(daf.strip().split(' ')) == 3:
                        daf = daf.strip().split(' ')[2]
                    elif len(daf.strip().split(' ')) == 2:
                        daf = daf.strip().split(' ')[1]
                    # Trailing "." means amud a, trailing ":" means amud b.
                    if daf[-1] == ".":
                        amud = "a"
                    elif daf[-1] == ":":
                        amud = "b"
                    daf_num = hebrew.heb_string_to_int(daf[0:-1])
                    #print daf_num, amud
                # Reference starting with "דף": the daf token is the second word.
                elif daf.strip().split(' ')[0] == u"דף":
                    daf = daf.strip().split(' ')[1]
                    if daf[-1] == ".":
                        amud = "a"
                    elif daf[-1] == ":":
                        amud = "b"
                    daf_num = hebrew.heb_string_to_int(daf[0:-1])
                    #print daf_num, amud
                # Any other reference leaves daf_num/amud at the values set by
                # an earlier reference.
                # NOTE(review): presumably deliberate for "שם" (ibid.) -- confirm.
                elif ur"שם" not in daf and ur"דף" in daf:
                    #print daf
                    pass
                else:
                    pass
                    #print daf
                text = find.group(2)

                # The broad except also swallows the NameError raised when no
                # earlier reference has set daf_num/amud; it is only printed.
                try:
                    print str(k + 1), str(i + 1), daf_num, amud
                    # Rebinding `found` does not disturb the running loop: the
                    # iterator was already taken from the original object.
                    found = matchobj(daf_num, amud, text)
                    line = found[1][0]
                    if line > 0:
                        #print "Rosh on {}".format(masechet), daf_num, amud, found[1][0], str(i+1), ",", str(j+1) + ",", str(k+1)
                        talmud = "{}".format(masechet) + "." + str(
                            daf_num) + amud + "." + str(line)
                        # The last component is the constant 1; i is only printed above.
                        roash = "Rosh on {}".format(
                            masechet) + ", " + part + "." + str(
                                k + 1) + "." + str(1)
                        links.append(makeLink(talmud, roash))
                except Exception as e:
                    print e
Пример #7
0
def parse(text):
    """Group "@77"-separated pieces of text into per-siman lists.

    Each piece contributes one string, "(n)<b>head</b>tail", built from its
    "@11head@33tail" match. A piece containing "@22" closes the current
    group, pads the result with empty lists for skipped siman numbers, and
    starts a new group. Returns the list of groups without its first
    element (the group collected before the first "@22").
    """
    old_num =0
    dibbur =""
    #simanim = re.finditer(ur'(@[0-9][0-9])\n?(@[0-9][0-9])(.*\n*)', text)
    simanim = re.split("@77",text)
    bayit_chadash = []
    perek =[]
    # Running number of the comment within the current siman.
    i=1
    for siman in simanim:
        simans = re.finditer("@11(.*)@33(.*)",siman)

        # Only the last match of a piece survives in dibbur; if the piece has
        # no match, dibbur keeps the value from the previous piece.
        for s in simans:
            dibbur ="(" + str(i) + ")" +  "<b>" + s.group(1) + '</b>'+ s.group(2)
            print i
            i = i +1
        if "@22" not in siman:

            perek.append(dibbur)
        elif "@22" in siman:
             #i = 1
             # The text after the first "@22" on its line is the siman number.
             num = re.findall("@22(.*)",siman) [0]
             new_num = hebrew.heb_string_to_int(num.strip())
             #print new_num
             # Insert one empty list per siman number skipped since the last one.
             if new_num - old_num != 1:
                 for k in range(1,new_num - old_num):
                    bayit_chadash.append([])
             old_num= new_num
             bayit_chadash.append(perek)
             perek =[]

             perek.append(dibbur)
             # Numbering restarts only after this piece's dibbur was built.
             i=1
    bayit_chadash.append(perek)
    #print len(bayit_chadash)
    return  bayit_chadash[1:len(bayit_chadash)]
Пример #8
0
def search(text, shas):
    """Match each siman of *text* against the Talmud pages in *shas*.

    text is a list of lists of strings. Inside a siman,
    "@44<reference>@55|@11<quote>" marks a quote with a page reference. The
    second word of the reference is the daf (converted with
    hebrew.heb_string_to_int) and the third word gives the amud: when its
    third character is "א" the amud is 'a', otherwise 'b'. The page index is
    (daf - 2) * 2 + 1, minus one for amud 'a'.

    A siman longer than eight characters with no "@44" in its first ten
    characters is matched, via matching() (defined elsewhere), against the
    page of the most recently parsed reference. Before any reference has
    been parsed there is no such page, so that siman is skipped; previously
    this raised UnboundLocalError.

    Returns None; results are produced by matching().
    """
    # Page of the most recently parsed reference; None until one is seen.
    index = daf = amud = None
    for i, seif in enumerate(text):
        for j, siman in enumerate(seif):
            if siman.endswith(u'5 '):
                print("yes")
            linked = re.finditer(u'@44(.*?)@(?:55|11)(.*?)(?=(@44|$))', siman)
            if '@44' not in siman[0:10] and len(siman) > 8:
                # Strip brackets, asterisks, "@xx" tags and "#" before matching.
                start = re.sub(r'([\[\*\]]|@..|#)', "", siman)
                start_of_siman = re.split(" ", start)
                if index is not None:
                    matching(start_of_siman, shas, i, j, index, daf, amud)
            for match in linked:
                lookfor = match.group(2)
                tagged = re.split(" ", lookfor.strip())
                daf_amud = re.split(u' ', match.group(1).strip())
                daf = hebrew.heb_string_to_int(daf_amud[1])
                amud = daf_amud[2]
                index = ((daf - 2) * 2) + 1
                if amud[2].strip() == u'א':
                    amud = 'a'
                    index = index - 1
                else:
                    amud = 'b'
                if len(lookfor) < 5:
                    # Too short to match; also abandons the rest of this siman.
                    print("short %s %s" % (daf, amud))
                    break
                else:
                    matching(tagged, shas, i, j, index, daf, amud)
Пример #9
0
def search2(parsed):
    for i,perek in enumerate(parsed):
       for j, pasuk in enumerate(perek):
           for k, seif in enumerate(pasuk):
               found =  re.finditer(ur'@44\[דף(.*?)\](.*?)\.', seif)
               for find in found:
                   daf = find.group(1)
                   text =  find.group(2)
                   if daf.strip().split(" ")[1] == u'ע"א':
                       amud = 'a'
                   elif daf.strip().split(" ")[1] == u'ע"ב':
                       amud = 'b'

                   new_daf = daf.strip().split(" ")[0]
                   try:
                       daf_num = hebrew.heb_string_to_int(new_daf)
                       #print str(daf_num) + amud
                       found = matchobj(daf_num, amud, text)
                       line = found[1][0]
                       if line >0:
                        #print "Rosh on {}".format(masechet), daf_num, amud, found[1][0], str(i+1), ",", str(j+1) + ",", str(k+1)
                        talmud = "{}".format(masechet) +  "." + str(daf_num) + amud + "." + str(line)
                        roash = "Rosh on {}".format(masechet) +"." + str(i+1) + "." + str(j+1) + "." + str(k+1)

                        links.append(link(talmud,roash))
                   except KeyError:
                       pass
Пример #10
0
def parse(text):
    """Split *text* into simanim and number the bolded comments in each.

    First pass (diagnostics only): every "@22<label>@11" header is converted
    with hebrew.heb_string_to_int; any label that is not exactly one more
    than the previous one is printed, and the last number is printed at the
    end. If no header matches, nothing is printed for the last number
    (previously this raised NameError).

    Second pass: text is split on "@22"; each part is split on "@66", and
    every "@11head@33tail" match becomes "[n]<b>head</b>tail" with n
    counting from 1 within the part.

    Returns the list of per-part lists. When the first part holds fewer
    than two entries, the first and the last parts are both left out.
    """
    old_num = 0
    new_num = None
    # u'' with doubled backslashes is the same pattern as the former ur''
    # literal and is valid on both Python 2 and Python 3.
    for siman in re.finditer(u'@22\\n*(.*)\\n*@11(\\n?.*)', text):
        new_num = hebrew.heb_string_to_int(siman.group(1).strip())
        if new_num - old_num != 1:
            print(siman.group(1))
            print(new_num)
        old_num = new_num
    if new_num is not None:
        print(new_num)

    bet_yosef = []
    for siman in re.split("@22", text):
        seif = []
        i = 1
        # The loop variable no longer rebinds the `text` parameter.
        for chunk in re.split("@66", siman):
            for bold in re.finditer(u'@11(.*)@33(.*)', chunk):
                seif.append("[" + str(i) + "]" + "<b>" + bold.group(1) +
                            '</b>' + bold.group(2))
                i += 1
        bet_yosef.append(seif)
    print("length bet yosef %d" % len(bet_yosef))
    if len(bet_yosef[0]) < 2:
        # NOTE(review): this slice drops the last siman as well as the
        # leading preamble -- confirm that is intended.
        return bet_yosef[1:len(bet_yosef) - 1]
    else:
        return bet_yosef
Пример #11
0
def parse(text):
     old_num =0
     simanim = re.finditer(ur'@22\n*(.*)\n*@11(\n?.*)', text)
     for siman in simanim:
        si =  siman.group(1)
        num = re.split(" ", si.strip())[1]
        num=re.sub("'","",num)
        new_num = hebrew.heb_string_to_int(num.strip())
        if new_num - old_num !=1:
            print new_num
        old_num = new_num
     prisha=[]
     simanim = re.split("@22", text)
     for siman in simanim:
         seif =[]
         dh = re.split(ur"\(\S\S?\)]", siman)
         print dh[0]
         i = 1
         for text in dh:
             bolded_dh = re.finditer(ur'@11(.*)@33(.*)',text)
             for bold in bolded_dh:
                #new_dh = "{" + str(i) +"}" + "<b>"  + bold.group(1) + '</b>' + bold.group(2)
                new_dh =  "<b>"  + bold.group(1) + '</b>' + bold.group(2)
                seif.append(new_dh)
                i += 1
         prisha.append(seif)
     print "length prisha", len(prisha)
     #print bet_yosef[len(bet_yosef)-1][0]
     if len(prisha[0])<2:
         return prisha[1:len(prisha)-1]
     else:
        return prisha
Пример #12
0
def search(text, shas):
    """Match each siman of *text* against the Talmud pages in *shas*.

    text is a list of lists of strings. Inside a siman,
    "@44<reference>@55|@11<quote>" marks a quote with a page reference. The
    second word of the reference is the daf (converted with
    hebrew.heb_string_to_int) and the third word gives the amud: when its
    third character is "א" the amud is 'a', otherwise 'b'. The page index is
    (daf - 2) * 2 + 1, minus one for amud 'a'.

    A siman longer than eight characters with no "@44" in its first ten
    characters is matched, via matching() (defined elsewhere), against the
    page of the most recently parsed reference. Before any reference has
    been parsed there is no such page, so that siman is skipped; previously
    this raised UnboundLocalError.

    Returns None; results are produced by matching().
    """
    # Page of the most recently parsed reference; None until one is seen.
    index = daf = amud = None
    for i, seif in enumerate(text):
        for j, siman in enumerate(seif):
            if siman.endswith(u'5 '):
                print("yes")
            linked = re.finditer(u'@44(.*?)@(?:55|11)(.*?)(?=(@44|$))', siman)
            if '@44' not in siman[0:10] and len(siman) > 8:
                # Strip brackets, asterisks, "@xx" tags and "#" before matching.
                start = re.sub(r'([\[\*\]]|@..|#)', "", siman)
                start_of_siman = re.split(" ", start)
                if index is not None:
                    matching(start_of_siman, shas, i, j, index, daf, amud)
            for match in linked:
                lookfor = match.group(2)
                tagged = re.split(" ", lookfor.strip())
                daf_amud = re.split(u' ', match.group(1).strip())
                daf = hebrew.heb_string_to_int(daf_amud[1])
                amud = daf_amud[2]
                index = ((daf - 2) * 2) + 1
                if amud[2].strip() == u'א':
                    amud = 'a'
                    index = index - 1
                else:
                    amud = 'b'
                if len(lookfor) < 5:
                    # Too short to match; also abandons the rest of this siman.
                    print("short %s %s" % (daf, amud))
                    break
                else:
                    matching(tagged, shas, i, j, index, daf, amud)
Пример #13
0
def search1(text, shas):
    """Match each siman of a three-level *text* against the pages in *shas*.

    text is walked as perek -> seif -> siman (strings). Inside a siman,
    "@44<reference>@55|@11<quote>" marks a quote with a page reference; the
    words of each quote are handed to matching1() (defined elsewhere)
    together with the page index computed from the reference. A siman that
    does not start with a reference is matched against the page of the most
    recently parsed reference. Malformed references abandon the rest of the
    siman. Returns None.
    """
    for k, perek in enumerate(text):
        for i, seif in enumerate(perek):
            for j, siman in enumerate(seif):
                if siman.endswith(ur'5 '):
                    print "yes"
                # group(1): reference text; group(2): quote up to the next "@44" or end.
                linked = re.finditer(ur'@44(.*?)@(?:55|11)(.*?)(?=(@44|$))',
                                     siman)
                if '@44' not in siman[0:10] and len(siman) > 8:
                    # Strip brackets, asterisks, "@xx" tags and "#" before matching.
                    start = re.sub('([\[\*\]]|@..|#)', "", siman)
                    start_of_siman = re.split(" ", start)
                    # index/daf/amud exist only once a reference has been parsed.
                    if 'index' in locals():
                        # This break leaves the siman loop, skipping the rest of this seif.
                        if index >= len(shas):
                            break
            #            print "line number: 203", daf, amud
                        matching1(start_of_siman, shas, i, j, k, index, daf,
                                  amud)
                for match in linked:
                    lookfor = match.group(2)
                    # print "lookfor is", lookfor
                    tagged = re.split(" ", lookfor.strip())
                    # A quote of at most one word ends processing of this siman.
                    if len(tagged) <= 1:
                        break
                    daf_amud = re.split(ur' ', match.group(1).strip())
                    #print daf_amud[0]
                    if len(daf_amud) >= 2:
                        # Keep only the Hebrew letters of the second word.
                        daf = re.sub(ur'[^א-ת]', "", daf_amud[1])
                    else:
                        #   print "no  daf "
                        break
            #       print daf
                    daf = hebrew.heb_string_to_int(daf)
                    # NOTE(review): breaking here leaves daf updated while index
                    # and amud still belong to the previous reference.
                    if daf > len(shas) or len(daf_amud) < 3:
                        break
                    else:
                        #  print len(daf_amud)
                        # print  daf_amud
                        amud = daf_amud[2]
                    # Index of the amud-b page; amud a is one less (see below).
                    index = ((daf - 2) * 2) + 1
                    #if index > len(shas):
                    #    break
                    # NOTE(review): breaking here leaves amud holding the raw
                    # third word rather than 'a'/'b'.
                    if len(amud) < 3:
                        #print "short amud"
                        break
                    else:
                        #print amud
                        # The third character of the amud word selects the side.
                        if amud[2].strip() == ur'א':
                            amud = 'a'
                            index = index - 1
                        else:
                            amud = 'b'
                        #print daf, amud
                        # Pages beyond the end of shas are silently skipped.
                        if index >= len(shas):
                            #print "short", daf, amud, lookfor
                            #return
                            pass
                        else:
                            #print "else"
                            #    print "tagged is", tagged
                            matching1(tagged, shas, i, j, k, index, daf, amud)
Пример #14
0
def read_rashi():
    for f in os.listdir(u'%s' % commentator):
        if masechet_he in f:
            pf = os.path.join(u'%s' % commentator, f)
            print pf
            split = re.split("_",f.strip())
            if "_" in masechet_he:
                daf_he =  split[2]
                amud_he = split[3][0]
            else:
                daf_he =  split[1]
                amud_he = split[2][0]
            daf =hebrew.heb_string_to_int(daf_he)
            if amud_he ==u'א':
                amud= "a"
            elif amud_he ==u'ב':
                amud="b"
            else:
                print "we have a problam"
            index = convert_daf_to_index(daf,amud)
            print index
            print daf
            print amud
            with open(pf, 'r') as filep:
                file_text = filep.read()
                list = re.split("\n",file_text)
                for liner in list:
                    if "-" in liner or "–" in liner:
                        #print line
                        dh = re.split("(?:-|–)",liner)[0]
                        match(dh.decode('utf-8'),shas[index],index,liner.decode('utf-8'))
Пример #15
0
def parse(text):
    """Split *text* into simanim and number the bolded comments in each.

    First pass (diagnostics only): every "@22<label>@11" header is converted
    with hebrew.heb_string_to_int; any label that is not exactly one more
    than the previous one is printed, and the last number is printed at the
    end. If no header matches, nothing is printed for the last number
    (previously this raised NameError).

    Second pass: text is split on "@22"; each part is split on "@66", and
    every "@11head@33tail" match becomes "[n]<b>head</b>tail" with n
    counting from 1 within the part.

    Returns the list of per-part lists. When the first part holds fewer
    than two entries, the first and the last parts are both left out.
    """
    old_num = 0
    new_num = None
    # u'' with doubled backslashes is the same pattern as the former ur''
    # literal and is valid on both Python 2 and Python 3.
    for siman in re.finditer(u'@22\\n*(.*)\\n*@11(\\n?.*)', text):
        new_num = hebrew.heb_string_to_int(siman.group(1).strip())
        if new_num - old_num != 1:
            print(siman.group(1))
            print(new_num)
        old_num = new_num
    if new_num is not None:
        print(new_num)

    bet_yosef = []
    for siman in re.split("@22", text):
        seif = []
        i = 1
        # The loop variable no longer rebinds the `text` parameter.
        for chunk in re.split("@66", siman):
            for bold in re.finditer(u'@11(.*)@33(.*)', chunk):
                seif.append("[" + str(i) + "]" + "<b>" + bold.group(1) +
                            '</b>' + bold.group(2))
                i += 1
        bet_yosef.append(seif)
    print("length bet yosef %d" % len(bet_yosef))
    if len(bet_yosef[0]) < 2:
        # NOTE(review): this slice drops the last siman as well as the
        # leading preamble -- confirm that is intended.
        return bet_yosef[1:len(bet_yosef) - 1]
    else:
        return bet_yosef
Пример #16
0
def search1(parsed, part):
    for i, perek in enumerate(parsed):
        for k, seif in enumerate(perek):
            found = re.finditer(ur'@44\[דף(.*?)\](.*?)\.', seif)
            for find in found:
                daf = find.group(1)
                text = find.group(2)
                if daf.strip().split(" ")[1] == u'ע"א':
                    amud = 'a'
                elif daf.strip().split(" ")[1] == u'ע"ב':
                    amud = 'b'

                new_daf = daf.strip().split(" ")[0]
                try:
                    daf_num = hebrew.heb_string_to_int(new_daf)
                    #print str(daf_num) + amud
                    found = matchobj(daf_num, amud, text)
                    line = found[1][0]
                    if line > 0:
                        #print "Rosh on {}".format(masechet), daf_num, amud, found[1][0], str(i+1), ",", str(j+1) + ",", str(k+1)
                        talmud = "{}".format(masechet) + "." + str(
                            daf_num) + amud + "." + str(line)
                        roash = "Rosh on {}".format(
                            masechet) + ", " + part + "." + str(
                                i + 1) + "." + str(k + 1)

                        links.append(link(talmud, roash))
                except KeyError:
                    pass
Пример #17
0
def search(parsed):
    """Record a link for every "(דף <daf><marker>)" reference in *parsed*.

    parsed is walked two levels deep (perek, pasuk). The last character of
    the trimmed reference is the amud marker: "." selects amud 'a' and ":"
    selects amud 'b'. The rest is converted with hebrew.heb_string_to_int.
    Each resolved reference appends link(...) to the module-level links
    list; a KeyError raised while building the link is ignored.

    The reference is trimmed before it is inspected, so the marker is found
    regardless of surrounding whitespace. Previously the slice only removed
    the marker when the reference had exactly one leading space. Empty
    references and references without a recognised marker are skipped;
    previously they raised IndexError or reused the amud of an earlier
    reference.
    """
    # Same pattern as the former ur'' literal, valid on Python 2 and 3.
    pattern = u'\\(דף(.*?)\\)'
    for i, perek in enumerate(parsed):
        for j, pasuk in enumerate(perek):
            for find in re.finditer(pattern, pasuk):
                daf = find.group(1).strip()
                if not daf:
                    continue
                marker = daf[-1]
                if marker == '.':
                    amud = 'a'
                elif marker == ':':
                    amud = 'b'
                else:
                    # No amud marker: the page cannot be determined.
                    continue

                new_daf = daf[:-1].strip()
                try:
                    daf_num = hebrew.heb_string_to_int(new_daf)
                    # NOTE(review): the perek component is str(i), not
                    # str(i + 1) as in the sibling search functions --
                    # left unchanged; confirm the numbering is intended.
                    links.append(
                        link(
                            "{}".format(masechet) + "." + str(daf_num) + amud,
                            "Rosh on {}, Hilchot Seder Avodat Yom HaKippurim".
                            format(masechet) + "." + str(i) + "." +
                            str(j + 1)))
                except KeyError:
                    pass
Пример #18
0
def read_rashi():
    for f in os.listdir(u'%s' % commentator):
        if masechet_he in f:
            pf = os.path.join(u'%s' % commentator, f)
            print pf
            split = re.split("_", f.strip())
            if "_" in masechet_he:
                daf_he = split[2]
                amud_he = split[3][0]
            else:
                daf_he = split[1]
                amud_he = split[2][0]
            daf = hebrew.heb_string_to_int(daf_he)
            if amud_he == u'א':
                amud = "a"
            elif amud_he == u'ב':
                amud = "b"
            else:
                print "we have a problam"
            index = convert_daf_to_index(daf, amud)
            print index
            print daf
            print amud
            with open(pf, 'r') as filep:
                file_text = filep.read()
                list = re.split("\n", file_text)
                for liner in list:
                    if "-" in liner or "–" in liner:
                        if commentator in "Rashi":
                            dh = re.split("(?:-|–)", liner)[0]
                        elif "Tosafot" in commentator:
                            dh = re.split(ur"\.", liner)[0]
                        match(dh.decode('utf-8'), shas[index], index,
                              liner.decode('utf-8'))
Пример #19
0
def search1(text, shas):
    """Match each siman of a three-level *text* against the pages in *shas*.

    text is walked as perek -> seif -> siman (strings). Inside a siman,
    "@44<reference>@55|@11<quote>" marks a quote with a page reference; the
    words of each quote are handed to matching1() (defined elsewhere)
    together with the page index computed from the reference. A siman that
    does not start with a reference is matched against the page of the most
    recently parsed reference. Malformed references abandon the rest of the
    siman. Returns None.
    """
    for k, perek in enumerate(text):
        for i, seif in enumerate(perek):
            for j, siman in enumerate(seif):
                if siman.endswith(ur'5 '):
                    print "yes"
                # group(1): reference text; group(2): quote up to the next "@44" or end.
                linked = re.finditer(ur'@44(.*?)@(?:55|11)(.*?)(?=(@44|$))', siman)
                if '@44' not in siman[0:10] and len(siman) > 8:
                    # Strip brackets, asterisks, "@xx" tags and "#" before matching.
                    start = re.sub('([\[\*\]]|@..|#)',"",siman)
                    start_of_siman = re.split(" ", start)
                    # index/daf/amud exist only once a reference has been parsed.
                    if 'index' in locals():
                        # This break leaves the siman loop, skipping the rest of this seif.
                        if index >= len(shas):
                            break
            #            print "line number: 203", daf, amud
                        matching1(start_of_siman, shas, i, j, k, index, daf, amud)
                for match in linked:
                    lookfor = match.group(2)
                   # print "lookfor is", lookfor
                    tagged = re.split(" ", lookfor.strip())
                    # A quote of at most one word ends processing of this siman.
                    if len(tagged)<=1:
                        break
                    daf_amud = re.split(ur' ', match.group(1).strip())
                    #print daf_amud[0]
                    if len(daf_amud)  >=2:
                        # Keep only the Hebrew letters of the second word.
                        daf = re.sub(ur'[^א-ת]',"",daf_amud[1])
                    else:
                     #   print "no  daf "
                        break
             #       print daf
                    daf =  hebrew.heb_string_to_int(daf)
                    # NOTE(review): breaking here leaves daf updated while index
                    # and amud still belong to the previous reference.
                    if daf > len(shas) or len(daf_amud) < 3:
                        break
                    else:
                      #  print len(daf_amud)
                       # print  daf_amud
                        amud = daf_amud[2]
                    # Index of the amud-b page; amud a is one less (see below).
                    index = ((daf-2)*2)+1
                    #if index > len(shas):
                    #    break
                    # NOTE(review): breaking here leaves amud holding the raw
                    # third word rather than 'a'/'b'.
                    if len(amud) < 3:
                        #print "short amud"
                        break
                    else:
                        #print amud
                        # The third character of the amud word selects the side.
                        if amud[2].strip() == ur'א':
                            amud = 'a'
                            index = index - 1
                        else:
                            amud = 'b'
                        #print daf, amud
                        # Pages beyond the end of shas are silently skipped.
                        if index >= len(shas):
                            #print "short", daf, amud, lookfor
                            #return
                            pass
                        else:
                            #print "else"
                        #    print "tagged is", tagged
                            matching1(tagged, shas, i, j, k, index, daf, amud)
Пример #20
0
def parse(text):
    """Split *text* into simanim, number their "@66" markers and record links.

    text is split on "@00" into parts; inside each part every
    "<prefix>@22<label>@11<body>" line is a siman. The label is reduced to
    its Hebrew letters and converted with hebrew.heb_string_to_int; the body
    is bolded up to "@33" when there is exactly one "@33". Links are added
    through addlink() to the module-level links list, using the module-level
    karo for the number of links per siman (both defined elsewhere).

    Returns the list of simanim, each a one-element list holding its text.
    """
    older_siman = 0
    # arbaturim receives tur at the end but is never returned or read.
    arbaturim=[]
    tur = []
    hilchos = re.split(ur'@00', text) #split to names of parts
    for halacha in hilchos:
        if len(halacha) >0:
            # First line of the part; assigned but not used below.
            halacha_name = halacha.splitlines()[0]
            #print halacha_name #get the name of the part
        simanim = re.finditer(ur'(@?[0-9]?[0-9]?@?[0-9]?[0-9]?)@22(.*)@11(.*)',halacha) #cut the text to simanim, get the letter of siman and tags to commentary
        i = 1
        for simans in simanim:
             localbet_yosef = 0
             siman = simans.group(2)
             # Drop bracketed/parenthesised notes, then everything that is not a Hebrew letter.
             siman = re.sub(ur'[\(\[].*?[\)\]]',"", siman)
             siman = re.sub(ur'[^\u05d0-\u05ea]',"", siman)
             if len(siman)> 4:
                # print simans.group(2)
                 #print simans.group(3)
                pass
             roman_siman = hebrew.heb_string_to_int(siman.strip())
             bold = re.split(ur'@33',simans.group(3))
             # Exactly one "@33": bold the text before it. Note this rebinds the `text` parameter.
             if len(bold) ==2:
                 text = simans.group(1) +"<b>" + bold[0] + "</b>" + bold[1]
             else:
                 text =simans.group(1) + simans.group(3)
             #text1 = re.split(u"(.*?[.:])", text)
             #text1 = filter(None, text1)
             #taking care of links
             try:
                 # karo[len(tur)] raises IndexError once tur outgrows karo; see the except below.
                 for k in range(0,len(karo[len(tur)])):
                     #print len(tur)+1,k
                 #for k in range(1,len(re.findall("@66",simans.group(0)))):
                 #print simans.group(0)
                     if "@66" in simans.group(1):
                         links.append(addlink(len(tur)+1,1, k+1 ))
                     localbet_yosef += len(re.findall("@66",simans.group(2)))
                     if "@66" in simans.group(2):
                         links.append(addlink(len(tur)+1,1, k+1 ))
                     # NOTE(review): text is a string here, so sifs is a single
                     # character, "@66" never matches it, and the inner loop
                     # never runs -- confirm what was intended.
                     for sif_num,sifs in enumerate(text, start =1):
                         for a in range(1,len(re.findall("@66", sifs))):
                             links.append(addlink(len(tur)+1,sif_num , a+k+1 ))
                 #if localbet_yosef - len(karo[len(tur)+1]) != -1:
                    #print simans.group(2),roman_siman,  localbet_yosef, len(karo[len(tur)+1])
                  #  pass
                 # Report any siman number that does not follow its predecessor by one.
                 if roman_siman - older_siman != 1:
                     print siman
                     print roman_siman
                 older_siman = roman_siman
                 # Replace each "@66" with "[1]", "[2]", ... in order of appearance.
                 text = re.sub(ur'@66', lambda m, c=count(1): '[{}]'.format(next(c)), text)
                 tur.append([text])
             except IndexError:
                 # The siman is dropped (not appended to tur) when this fires.
                 print "out of index"

    arbaturim.append(tur)
    # Nesting depth of a list: non-lists give False (0), each list level adds one.
    # max() raises ValueError on an empty list, so an empty tur fails here.
    depth = lambda L: isinstance(L, list) and max(map(depth, L))+1
    print depth(tur)
    return tur
Пример #21
0
def parse(text):
    if os.path.isfile("../source/Korban_Netanel_on_{}.txt".format(masechet)) or os.path.isfile(
        "../source/Pilpula_Charifta_on_{}.txt".format(masechet)
    ):
        # print "has korban netanel 2"
        nose_kelim = nosekelim.open_file()
        fixed = nosekelim.parse(nose_kelim)
        links_netanel = []
        netanel = 0
    rosh = []
    a = re.split(ur"@22([^@]*)", text)
    for seif, cont in zip(a[1::2], a[2::2]):
        si = []
        korban = []
        if ur"[*]" in seif and (
            os.path.isfile(
                "../source/Korban_Netanel_on_{}.txt".format(masechet)
                or os.path.isfile("../source/PilPula_Charifta_on_{}.txt".format(masechet))
            )
            and netanel <= len(fixed)
        ):
            if os.path.isfile("../source/Korban_Netanel_on_{}.txt".format(masechet)):
                commentator = "Korban Netanel on "
            if os.path.isfile("../source/PilPula_Charifta_on_{}.txt".format(masechet)):
                commentator = "Pilpula Charifta on "
            korban.append(fixed[netanel])
            # print len(links_netanel)
            roash = "Rosh on %s." % masechet + str(len(links_netanel) + 1) + ".1"
            netanelink = commentator + masechet + "." + str(len(links_netanel) + 1) + ".1"
            links.append(link(netanelink, roash))
            netanel += 1
            # print "netanel one seif", seif, netanel
            # print fixed[netanel]
        content = re.split("@66", cont)
        seif = re.sub(ur"[^א-ת]", "", seif)
        seif = hebrew.heb_string_to_int(seif.strip())
        for num, co in enumerate(content):
            if ur"[*]" in co:
                # print co
                a = re.findall("\[\*\](.{6})", co)
                for b in a:
                    if (
                        os.path.isfile("../source/Korban_netanel_on_{}.txt".format(masechet))
                        or os.path.isfile("../source/Pilpula_Charifta_on_{}.txt".format(masechet))
                    ) and netanel < len(fixed):
                        if os.path.isfile("../source/Korban_netanel_on_{}.txt".format(masechet)):
                            commentator = "Korban Netanel "
                        if os.path.isfile("../source/Pilpula_Charifta_on_{}.txt".format(masechet)):
                            commentator = "Pilpula Charifta "
                        korban.append(fixed[netanel])
                        roash = "Rosh on %s." % masechet + str(len(links_netanel) + 1) + "." + str(num + 1)
                        netanelink = (
                            commentator + "on " + masechet + "." + str(len(links_netanel) + 1) + "." + str(len(korban))
                        )
                        links.append(link(netanelink, roash))
                        netanel += 1
            si.append(co)
Пример #22
0
def parse1(text):
    if os.path.isfile('source/Korban_Netanel_on_{}.txt'.format(masechet)) or os.path.isfile('source/Pilpula_Charifta_on_{}.txt'.format(masechet)):
        nose_kelim = nosekelim.open_file()
        fixed = nosekelim.parse(nose_kelim)
        links_netanel = []
        netanel = 0
    rosh = []
    chapters = re.split(ur'@00', text)
    for chapter_num, chapter in enumerate(chapters):
        print chapter_num, chapter[0:10]
        if len(chapter)<=1:
            pass
        else:
            perek = []
            a = re.split(ur'@22([^@]*)', chapter)
            for seif, cont in zip(a[1::2], a[2::2]):
                si = []
                korban =[]
                print seif
                if ur'[*]' in seif:
                    print "hello1"
                if ur'[*]' in seif and (os.path.isfile('source/Korban_Netanel_on_{}.txt'.format(masechet)) or os.path.isfile('source/Pilpula_Charifta_on_{}.txt'.format(masechet))) and netanel <= len(fixed):

                    print "hello", seif, netanel, len(fixed)
                    if os.path.isfile('source/Korban_Netanel_on_{}.txt'.format(masechet)):
                        commentator = "Korban Netanel"
                    if os.path.isfile('source/PilPula_Charifta_on_{}.txt'.format(masechet)):
                        commentator = "Pilpula Charifta"
                    korban.append(fixed[netanel])
                    roash = "Rosh on %s." % masechet  +str(len(rosh)+2) + "." + str(len(perek)+1) + ".1"
                    netanelink = commentator + " on " +  masechet +"."+ str(len(links_netanel)+1) + ".1"
                    print roash, netanelink
                    links.append(link(netanelink, roash))
                    netanel += 1
                content = re.split('@66', cont)
                seif = re.sub(ur'[^א-ת]',"", seif)
                seif = hebrew.heb_string_to_int(seif.strip())
                for num, co in enumerate(content):
                    a = re.findall('\[\*\]', co)
                    for b in a:
                        print b, seif
                        if (os.path.isfile('source/Korban_netanel_on_{}.txt'.format(masechet)) or os.path.isfile('source/Pilpula_Charifta_on_{}.txt'.format(masechet))) and netanel < len(fixed):
                            if os.path.isfile('source/Korban_netanel_on_{}.txt'.format(masechet)):
                                commentator = "Korban Netanel "
                            if os.path.isfile('source/Pilpula_Charifta_on_{}.txt'.format(masechet)):
                                commentator = "Pilpula Charifta "
                            korban.append(fixed[netanel])
                            roash = "Rosh on %s." % masechet + str(len(rosh)+2) + "." + str(len(perek)+1) + "." + str(num+1)
                            netanelink = commentator + "on " + masechet + "." + str(len(links_netanel)+1)+ "."+ str(len(korban))
                            print roash, netanelink
                            links.append(link(netanelink, roash))
                            netanel +=1
                    si.append(co)
                if os.path.isfile('source/Korban_Netanel_on_{}.txt'.format(masechet)) or os.path.isfile('source/Pilpula_Charifta_on_{}.txt'.format(masechet)):
                    links_netanel.append(korban)
                perek.append(si)
            rosh.append(perek)
Пример #23
0
def parse1(text):
    if os.path.isfile('source/Korban_Netanel_on_{}.txt'.format(masechet)) or os.path.isfile('source/Pilpula_Charifta_on_{}.txt'.format(masechet)):
        nose_kelim = nosekelim.open_file()
        fixed = nosekelim.parse(nose_kelim)
        links_netanel = []
        netanel = 0
    rosh = []
    chapters = re.split(ur'@00', text)
    for chapter_num, chapter in enumerate(chapters):
        print chapter_num, chapter[0:10]
        if len(chapter)<=1:
            pass
        else:
            perek = []
            a = re.split(ur'@22([^@]*)', chapter)
            for seif, cont in zip(a[1::2], a[2::2]):
                si = []
                korban =[]
                print seif
                if ur'[*]' in seif:
                    print "hello1"
                if ur'[*]' in seif and (os.path.isfile('source/Korban_Netanel_on_{}.txt'.format(masechet)) or os.path.isfile('source/Pilpula_Charifta_on_{}.txt'.format(masechet))) and netanel <= len(fixed):

                    print "hello", seif, netanel, len(fixed)
                    if os.path.isfile('source/Korban_Netanel_on_{}.txt'.format(masechet)):
                        commentator = "Korban Netanel"
                    if os.path.isfile('source/PilPula_Charifta_on_{}.txt'.format(masechet)):
                        commentator = "Pilpula Charifta"
                    korban.append(fixed[netanel])
                    roash = "Rosh on %s." % masechet  +str(len(rosh)+2) + "." + str(len(perek)+1) + ".1"
                    netanelink = commentator + " on " +  masechet +"."+ str(len(links_netanel)+1) + ".1"
                    print roash, netanelink
                    links.append(link(netanelink, roash))
                    netanel += 1
                content = re.split('@66', cont)
                seif = re.sub(ur'[^א-ת]',"", seif)
                seif = hebrew.heb_string_to_int(seif.strip())
                for num, co in enumerate(content):
                    a = re.findall('\[\*\]', co)
                    for b in a:
                        print b, seif
                        if (os.path.isfile('source/Korban_netanel_on_{}.txt'.format(masechet)) or os.path.isfile('source/Pilpula_Charifta_on_{}.txt'.format(masechet))) and netanel < len(fixed):
                            if os.path.isfile('source/Korban_netanel_on_{}.txt'.format(masechet)):
                                commentator = "Korban Netanel "
                            if os.path.isfile('source/Pilpula_Charifta_on_{}.txt'.format(masechet)):
                                commentator = "Pilpula Charifta "
                            korban.append(fixed[netanel])
                            roash = "Rosh on %s." % masechet + str(len(rosh)+2) + "." + str(len(perek)+1) + "." + str(num+1)
                            netanelink = commentator + "on " + masechet + "." + str(len(links_netanel)+1)+ "."+ str(len(korban))
                            print roash, netanelink
                            links.append(link(netanelink, roash))
                            netanel +=1
                    si.append(co)
                if os.path.isfile('source/Korban_Netanel_on_{}.txt'.format(masechet)) or os.path.isfile('source/Pilpula_Charifta_on_{}.txt'.format(masechet)):
                    links_netanel.append(korban)
                perek.append(si)
            rosh.append(perek)
Пример #24
0
def parse_dapim(text):
    old_num = 1
    shas = 0
    count = 1
    ncount = 1
    tosafos = 0
    rashi = 0
    no_b = False
    amud_num = 'b'
    daf = re.split(ur'@[0-9][0-9](דף[^@]*)', text)
    print "length daf", len(daf)
    #print len(daf)
    chidushei_halachot = [[], []]
    for daf_num, content in zip(daf[1::2], daf[2::2]):
        #print daf_num
        count += 1
        cut_books = re.split(ur'@66([^@]*)', content)
        if len(re.findall(ur'[0-9][0-9]ע"ב', cut_books[0])) == 0:
            #print "is zero", daf_num
            no_b = True
        amudim = re.split(ur'(?:@44|@11)ע"ב', cut_books[0])
        for amud in amudim:
            if len(amudim) < 2:
                print len(amudim), daf_num
            DH = []
            if amud_num == 'b':
                amud_num = 'a'
            elif no_b == True:
                amud_num = 'a'
            else:
                amud_num = 'b'
            halachot = re.split(ur'@44', amud)
            for i, verse in enumerate(halachot):
                pverse = re.sub(ur'@..', "", verse)
                if len(pverse) < 3:
                    pverse = " "
                DH.append(pverse)
                if len(daf_num[3:]) > 3:
                    print "longer than 3", daf_num[3:], number
                number = hebrew.heb_string_to_int(
                    re.sub("'", "", daf_num[3:].strip()))
                if (number - old_num) < 0 or (number - old_num) > 1:
                    print "diff", number - old_num, daf_num
                old_num = number
                #print number
                if ur'רש"י' in verse[0:10]:
                    search_rashi(verse, number, amud_num, i + 1)
                    rashi += 1
                    pass
                elif ur'תוס' in verse[0:10]:
                    # search_tosafot(verse, number , amud_num, i+1)
                    tosafos += 1
                    pass
                else:
                    search_gemara(verse, number, amud_num, i + 1)
                    shas += 1
                    pass
Пример #25
0
def search2(parsed, part):
    for k, seif in enumerate(parsed):
        for i,pasuk in enumerate(seif):
            found =  re.finditer(ur'@44[\[\(](.*?)[\]\)]@55(.*?)\.', pasuk)
            for find in found:
                daf = find.group(1)
                if daf.strip().split(' ')[0] == u"מנחות"and len(daf.strip().split(' '))<6:
                    if len(daf.strip().split(' ')) ==3:
                        daf = daf.strip().split(' ')[2]
                    elif len(daf.strip().split(' ')) ==2:
                        daf = daf.strip().split(' ')[1]
                    if daf[-1] ==".":
                        amud ="a"
                    elif daf[-1] == ":":
                        amud = "b"
                    daf_num = hebrew.heb_string_to_int(daf[0:-1])
                    #print daf_num, amud
                elif daf.strip().split(' ')[0] == u"דף":
                    daf = daf.strip().split(' ')[1]
                    if daf[-1] ==".":
                        amud ="a"
                    elif daf[-1] == ":":
                        amud = "b"
                    daf_num = hebrew.heb_string_to_int(daf[0:-1])
                    #print daf_num, amud
                elif ur"שם" not in daf and ur"דף" in daf:
                    #print daf
                    pass
                else:
                    pass
                    #print daf
                text =  find.group(2)

                try:
                    print str(k+1), str(i+1),daf_num, amud
                    found = matchobj(daf_num, amud, text)
                    line = found[1][0]
                    if line >0:
                        #print "Rosh on {}".format(masechet), daf_num, amud, found[1][0], str(i+1), ",", str(j+1) + ",", str(k+1)
                        talmud = "{}".format(masechet) +  "." + str(daf_num) + amud + "." + str(line)
                        roash = "Rosh on {}".format(masechet) + ", " + part + "." + str(k+1) + "." + str(1)
                        links.append(makeLink(talmud,roash))
                except Exception as e:
                    print e
Пример #26
0
def parse_dapim(text):
    old_num = 1
    shas = 0
    count = 1
    ncount = 1
    tosafos = 0
    rashi = 0
    no_b=False
    amud_num = 'b'
    daf = re.split(ur'@[0-9][0-9](דף[^@]*)', text)
    print "length daf", len(daf)
    #print len(daf)
    chidushei_halachot = [[],[]]
    for daf_num, content in zip(daf[1::2], daf[2::2]):
        #print daf_num
        count+=1
        cut_books = re.split(ur'@66([^@]*)', content)
        if len(re.findall(ur'[0-9][0-9]ע"ב',cut_books[0] ))==0:
            #print "is zero", daf_num
            no_b= True
        amudim = re.split(ur'(?:@44|@11)ע"ב', cut_books[0])
        for amud in amudim:
            if len(amudim)<2:
                print len(amudim), daf_num
            DH = []
            if amud_num == 'b':
                amud_num = 'a'
            elif no_b == True:
                amud_num = 'a'
            else:
                amud_num='b'
            halachot = re.split(ur'@44',amud)
            for i, verse in enumerate(halachot):
                pverse = re.sub(ur'@..', "", verse)
                if len(pverse)<3:
                    pverse= " "
                DH.append(pverse)
                if len(daf_num[3:])>3:
                    print "longer than 3", daf_num[3:], number
                number = hebrew.heb_string_to_int(re.sub("'","",daf_num[3:].strip()))
                if (number - old_num) <0 or (number - old_num) >1:
                    print "diff", number - old_num, daf_num
                old_num = number
                #print number
                if ur'רש"י' in verse[0:10]:
                    search_rashi(verse, number , amud_num, i+1)
                    rashi += 1
                    pass
                elif ur'תוס' in verse[0:10]:
                   # search_tosafot(verse, number , amud_num, i+1)
                    tosafos += 1
                    pass
                else:
                   search_gemara(verse, number , amud_num, i+1)
                   shas += 1
                   pass
Пример #27
0
def divrey_chamuot2(text):
    chamudotlinks = []
    count = 0
    file = tiferet_shmuel.open_file(record="chamudot")
    parsed = tiferet_shmuel.parse(file)
    Helper.createBookRecord(tiferet_shmuel.book_record(record="chamudot"))
    tiferet_shmuel.save_parsed_text(parsed, record="chamudot")
    tiferet_shmuel.run_post_to_api(record="chamudot")
    commentator = "Divrey Chamudot"
    rosh = []
    chapters = re.split(ur"(?:@00|@99)", text)
    for chapter_num, chapter in zip(chapters[1::2], chapters[2::2]):
        print chapter_num
        if len(chapter) <= 1:
            pass
        else:
            perek = []
            a = re.split(ur"@22([^@]*)", chapter)
            for seif, cont in zip(a[1::2], a[2::2]):
                si = []
                print seif
                if ur"(*)" in seif:
                    print "hello1"
                if ur"(*)" in seif:

                    count += 1
                    roash = "Rosh on %s." % masechet + str(len(rosh) + 1) + "." + str(len(perek) + 1) + ".1"
                    shmuel = commentator + " on " + masechet + "." + str(count)
                    chamudotlinks.append(link(roash, shmuel))
                    print roash, shmuel
                content = re.split("@66", cont)
                seif = re.sub(ur"[^א-ת]", "", seif)
                seif = hebrew.heb_string_to_int(seif.strip())
                for num, co in enumerate(content):
                    a = re.findall("\(\*\)", co)
                    for b in a:
                        count += 1
                        roash = (
                            "Rosh on %s." % masechet
                            + str(len(rosh) + 1)
                            + "."
                            + str(len(perek) + 1)
                            + "."
                            + str(num + 1)
                        )
                        shmuel = commentator + " on " + masechet + "." + str(count)
                        print roash, shmuel
                        chamudotlinks.append(link(shmuel, roash))
                    # print parsed[count]
                    si.append(co)
                perek.append(si)
            rosh.append(perek)
Пример #28
0
def divrey_chamuot2(text):
    chamudotlinks = []
    count = 0
    file = tiferet_shmuel.open_file(record="chamudot")
    parsed = tiferet_shmuel.parse(file)
    Helper.createBookRecord(tiferet_shmuel.book_record(record="chamudot"))
    tiferet_shmuel.save_parsed_text(parsed, record="chamudot")
    tiferet_shmuel.run_post_to_api(record="chamudot")
    commentator = "Divrey Chamudot"
    rosh = []
    chapters = re.split(ur'(?:@00|@99)', text)
    for chapter_num, chapter in zip(chapters[1::2], chapters[2::2]):
        print chapter_num
        if len(chapter) <= 1:
            pass
        else:
            perek = []
            a = re.split(ur'@22([^@]*)', chapter)
            for seif, cont in zip(a[1::2], a[2::2]):
                si = []
                print seif
                if ur'(*)' in seif:
                    print "hello1"
                if ur'(*)' in seif:

                    count += 1
                    roash = "Rosh on %s." % masechet + str(
                        len(rosh) + 1) + "." + str(len(perek) + 1) + ".1"
                    shmuel = commentator + " on " + masechet + "." + str(count)
                    chamudotlinks.append(link(roash, shmuel))
                    print roash, shmuel
                content = re.split('@66', cont)
                seif = re.sub(ur'[^א-ת]', "", seif)
                seif = hebrew.heb_string_to_int(seif.strip())
                for num, co in enumerate(content):
                    a = re.findall('\(\*\)', co)
                    for b in a:
                        count += 1
                        roash = "Rosh on %s." % masechet + str(
                            len(rosh) + 1) + "." + str(len(perek) +
                                                       1) + "." + str(num + 1)
                        shmuel = commentator + " on " + masechet + "." + str(
                            count)
                        print roash, shmuel
                        chamudotlinks.append(link(shmuel, roash))
                    #print parsed[count]
                    si.append(co)
                perek.append(si)
            rosh.append(perek)
Пример #29
0
def links(clean_text, shas):
    for i, page in enumerate (clean_text):
        for j,chapter in enumerate(page):
            a = re.finditer(ur"@88(.+?)@77(.+)", chapter)
            for link in a:
                heb_links= re.split(" ",link.group(1))
                daf =heb_links[1]
                amud = heb_links[2]
                if amud[2]==ur'א':
                      eng_amud = 'a'
                elif amud[2]==ur'ב':
                    eng_amud=ur'b'
                daf = hebrew.heb_string_to_int(daf)
                quote = re.split(" ",link.group(2).strip())
                matching(quote, daf, eng_amud,i,j, shas,words =len(quote), ratio = 70)
def run_parser():
    print "running parser"
    parsed_text = []
    cur_chapter = 1
    cur_verse = 1
    #regex = re.compile(ur'@11(.*)@22',re.UNICODE)
    with open("source/Radak_on_Genesis.txt", 'r') as filep:
        file_text = filep.read()
        ucd_text = unicode(file_text, 'utf-8').strip()

    #get rid of some unhelpful markup
    ucd_text = re.sub(ur'@11(.*?)@33', ur'@55\1', ucd_text)
    ucd_text = re.sub(ur'@00([^@]*)\n', '', ucd_text)
    ucd_text = ucd_text.replace(u'@44(שם)@55', u'(שם)')
    #split according to chapter. Will also include the chapter letters in the results.
    chapters = re.split(ur'@22([^@]*)', ucd_text)
    for chapter_num, chapter in zip(chapters[1::2],chapters[2::2]):
        if chapter_num.strip() != '':
            cur_chapter = hebrew.heb_string_to_int(chapter_num.strip())
            parsed_chapter = []
            expand_list_assign(parsed_text, cur_chapter-1, parsed_chapter)
        #now split on verse numbers, capturing the verse numbers as well
        verses = re.split(ur'@44\(([\u0590-\u05ea]{1,2})\)',chapter)
        for verse_num, verse in zip(verses[1::2], verses[2::2]):
            if verse_num.strip() != '':
                parsed_verse = []
                cur_verse = hebrew.heb_string_to_int(verse_num.strip())
                expand_list_assign(parsed_chapter, cur_verse-1, parsed_verse)
            comments = verse.split('@55')[1:]
            for comment in comments:
                if comment.strip() != '':
                    parsed_verse.append(comment)


    pretty_print(parsed_text)
    save_parsed_text(parsed_text)
Пример #31
0
def link_to_link(link):
    """Convert a two-word daf reference into "<masechet>.<daf><amud>".

    A reference qualifies when it starts with the letter 'ד' and, once
    stripped, consists of exactly two space-separated words whose second
    word ends in "." (amud "a") or ":" (amud "b"), e.g. u"דף כא.".  The rest
    of the second word is converted with ``hebrew.heb_string_to_int``.

    Returns the reference string, or None for anything that does not
    qualify.

    Fixes relative to the previous revision: an empty *link* raised
    IndexError, and a reference without a trailing "." / ":" raised
    UnboundLocalError; both now return None like every other unrecognised
    reference.
    """
    amud_by_marker = {":": "b", ".": "a"}
    words = link.strip().split(" ")
    if not link or link[0] != u'ד' or len(words) != 2:
        return None
    daf_amud = words[1]
    amud = amud_by_marker.get(daf_amud[-1:])
    if amud is None:
        # No amud marker: nothing reliable to link to.
        return None
    roman_daf = hebrew.heb_string_to_int(daf_amud[:-1].strip())
    return masechet + "." + str(roman_daf) + amud
Пример #32
0
def link_to_link(link):
    """Convert a two-word daf reference into "<masechet>.<daf><amud>".

    A reference qualifies when it starts with the letter 'ד' and, once
    stripped, consists of exactly two space-separated words whose second
    word ends in "." (amud "a") or ":" (amud "b"), e.g. u"דף כא.".  The rest
    of the second word is converted with ``hebrew.heb_string_to_int``.

    Returns the reference string, or None for anything that does not
    qualify.

    Fixes relative to the previous revision: an empty *link* raised
    IndexError, and a reference without a trailing "." / ":" raised
    UnboundLocalError; both now return None like every other unrecognised
    reference.
    """
    amud_by_marker = {":": "b", ".": "a"}
    words = link.strip().split(" ")
    if not link or link[0] != u'ד' or len(words) != 2:
        return None
    daf_amud = words[1]
    amud = amud_by_marker.get(daf_amud[-1:])
    if amud is None:
        # No amud marker: nothing reliable to link to.
        return None
    roman_daf = hebrew.heb_string_to_int(daf_amud[:-1].strip())
    return masechet + "." + str(roman_daf) + amud
Пример #33
0
def search(text, shas):
    """Feed the tagged and untagged parts of every siman to ``matching``.

    *text* is iterated as a list of seifim, each a list of siman strings.
    Inside a siman, every "@44<reference>@55|@11<quote>" span supplies a daf
    (second word of the reference) and an amud (third word); the quote's
    words are passed to ``matching`` together with an index derived from
    them.  A siman longer than 8 characters that has no "@44" in its first
    ten characters is first matched as a whole, reusing the index, daf and
    amud left over from the most recent tagged span.

    The function returns as soon as a tagged span yields an index that is
    not below ``len(shas)``.
    """
    for i, seif in enumerate(text):
        for j, siman in enumerate(seif):
            if siman.endswith(ur'5 '):
               print "yes"
            # The iterator is created here but only consumed after the
            # untagged-opening check below.
            linked = re.finditer(ur'@44(.*?)@(?:55|11)(.*?)(?=(@44|$))', siman)
            if '@44' not in siman[0:10] and len(siman) > 8:
                # Drop '[', '*', ']', '#' and every three-character "@xx" tag.
                start = re.sub('([\[\*\]]|@..|#)',"",siman)
                start_of_siman = re.split(" ", start)
                # `index` exists only once an earlier tagged span has set it.
                if 'index' in locals():
                    if index > len(shas):
                        # Leaves the loop over the simanim of this seif.
                        break

                    matching(start_of_siman, shas, i, j, index, daf, amud)
            for match in linked:
                lookfor = match.group(2)
                tagged = re.split(" ", lookfor.strip())
                daf_amud = re.split(ur' ', match.group(1).strip())
                # Keep only Hebrew letters of the second word.
                daf = re.sub(ur'[^א-ת]',"",daf_amud[1])
                daf =  hebrew.heb_string_to_int(daf)
                if daf > len(shas) or len(daf_amud) <= 2:
                    # Stop reading tagged spans of this siman.
                    break
                else:
                    print daf_amud[0]
                    amud = daf_amud[2]
                # Daf 2 maps to index 1 here and to index 0 after the
                # adjustment for 'א' below.
                index = ((daf-2)*2)+1
                # The third character of the third word selects the amud.
                if amud[2].strip() == ur'א':
                    amud = 'a'
                    index = index - 1
                else:
                    amud = 'b'
                if index >= len(shas):
                    # Ends the whole search, not just this siman.
                    return
                else:
                    matching(tagged, shas, i, j, index, daf, amud)
Пример #34
0
def search(text, shas):
    """Feed the tagged and untagged parts of every siman to ``matching``.

    *text* is iterated as a list of seifim, each a list of siman strings.
    Inside a siman, every "@44<reference>@55|@11<quote>" span supplies a daf
    (second word of the reference) and an amud (third word); the quote's
    words are passed to ``matching`` together with an index derived from
    them.  A siman longer than 8 characters that has no "@44" in its first
    ten characters is first matched as a whole, reusing the index, daf and
    amud left over from the most recent tagged span.

    A tagged span whose index is not below ``len(shas)`` is skipped.
    """
    for i, seif in enumerate(text):
        for j, siman in enumerate(seif):
            if siman.endswith(ur'5 '):
                print "yes"
            # The iterator is created here but only consumed after the
            # untagged-opening check below.
            linked = re.finditer(ur'@44(.*?)@(?:55|11)(.*?)(?=(@44|$))', siman)
            if '@44' not in siman[0:10] and len(siman) > 8:
                # Drop '[', '*', ']', '#' and every three-character "@xx" tag.
                start = re.sub('([\[\*\]]|@..|#)', "", siman)
                start_of_siman = re.split(" ", start)
                # `index` exists only once an earlier tagged span has set it.
                if 'index' in locals():
                    if index > len(shas):
                        # Leaves the loop over the simanim of this seif.
                        break

                    matching(start_of_siman, shas, i, j, index, daf, amud)
            for match in linked:
                lookfor = match.group(2)
                tagged = re.split(" ", lookfor.strip())
                daf_amud = re.split(ur' ', match.group(1).strip())
                # Keep only Hebrew letters of the second word.
                daf = re.sub(ur'[^א-ת]', "", daf_amud[1])
                daf = hebrew.heb_string_to_int(daf)
                if daf > len(shas) or len(daf_amud) <= 2:
                    # Stop reading tagged spans of this siman.
                    break
                else:
                    print daf_amud[0]
                    amud = daf_amud[2]
                # Daf 2 maps to index 1 here and to index 0 after the
                # adjustment for 'א' below.
                index = ((daf - 2) * 2) + 1
                # The third character of the third word selects the amud.
                if amud[2].strip() == ur'א':
                    amud = 'a'
                    index = index - 1
                else:
                    amud = 'b'
                if index >= len(shas):
                    # Out of range: skip this span and carry on.
                    pass
                else:
                    matching(tagged, shas, i, j, index, daf, amud)
Пример #35
0
def yomtov2(text):
    chamudotlinks = []
    count = 0
    file = tiferet_shmuel.open_file(record = "yomtov")
    parsed = tiferet_shmuel.parse(file)
    Helper.createBookRecord(tiferet_shmuel.book_record(record = "yomtov"))
    tiferet_shmuel.save_parsed_text(parsed, record = "yomtov")
    tiferet_shmuel.run_post_to_api(record = "yomtov")
    commentator = "Maadaney Yom Tov"
    rosh = []
    chapters = re.split(ur'(?:@00|@99)', text)
    for chapter_num, chapter in enumerate(chapters):
        if len(chapter)<=1:
            pass
        else:
            perek = []
            a = re.split(ur'@22([^@]*)', chapter)
            for seif, cont in zip(a[1::2], a[2::2]):
                si = []
                print seif
                if ur'[*]' in seif:
                    print "hello1"
                if ur'[*]' in seif:

                    count+=1
                    roash = "Rosh on %s." % masechet  +str(len(rosh)+1) + "." + str(len(perek)+1) + ".1"
                    shmuel = commentator + " on " +  masechet +"."+ str(count)
                    chamudotlinks.append(link(roash, shmuel))
                    print roash, shmuel
                content = re.split('@66', cont)
                seif = re.sub(ur'[^א-ת]',"", seif)
                seif = hebrew.heb_string_to_int(seif.strip())
                for num, co in enumerate(content):
                    a = re.findall('\[\*\]', co)
                    for b in a:
                        count+=1
                        roash = "Rosh on %s." % masechet + str(len(rosh)+1) + "." + str(len(perek)+1) + "." + str(num+1)
                        shmuel = commentator + " on " + masechet + "." + str(count)
                        print roash, shmuel
                        chamudotlinks.append(link(shmuel, roash))
                    #print parsed[count]
                    si.append(co)
                perek.append(si)
            rosh.append(perek)
Пример #36
0
def parse(text):
    if os.path.isfile('source/Korban_Netanel_on_{}.txt'.format(masechet)) or os.path.isfile('source/Pilpula_Charifta_on_{}.txt'.format(masechet)):
       # print "has korban netanel 2"
        nose_kelim = nosekelim.open_file()
        fixed = nosekelim.parse(nose_kelim)
        links_netanel = []
        netanel = 0
    rosh = []
    a = re.split(ur'@22([^@]*)', text)
    for seif, cont in zip(a[1::2], a[2::2]):
        si = []
        korban =[]
        if ur'[*]' in seif and (os.path.isfile('source/Korban_Netanel_on_{}.txt'.format(masechet) or os.path.isfile('source/PilPula_Charifta_on_{}.txt'.format(masechet))) and netanel <= len(fixed)):
            if os.path.isfile('source/Korban_Netanel_on_{}.txt'.format(masechet)):
                commentator = "Korban Netanel on "
            if os.path.isfile('source/PilPula_Charifta_on_{}.txt'.format(masechet)):
                commentator = "Pilpula Charifta on "
            korban.append(fixed[netanel])
            #print len(links_netanel)
            roash = "Rosh on %s." % masechet + str(len(links_netanel)+1) + ".1"
            netanelink = commentator + masechet +"."+ str(len(links_netanel)+1) + ".1"
            links.append(link(netanelink, roash))
            netanel += 1
            #print "netanel one seif", seif, netanel
            #print fixed[netanel]
        content = re.split('@66', cont)
        seif = re.sub(ur'[^א-ת]',"", seif)
        seif = hebrew.heb_string_to_int(seif.strip())
        for num, co in enumerate(content):
            if ur'[*]' in co:
                print co
                a = re.findall('\[\*\](.{6})', co)
                for b in a:
                    if (os.path.isfile('source/Korban_netanel_on_{}.txt'.format(masechet)) or os.path.isfile('source/Pilpula_Charifta_on_{}.txt'.format(masechet))) and netanel < len(fixed):
                        if os.path.isfile('source/Korban_netanel_on_{}.txt'.format(masechet)):
                            commentator = "Korban Netanel "
                        if os.path.isfile('source/Pilpula_Charifta_on_{}.txt'.format(masechet)):
                            commentator = "Pilpula Charifta "
                        korban.append(fixed[netanel])
                        roash = "Rosh on %s." % masechet + str(len(links_netanel)+1) + "." + str(num+1)
                        netanelink = commentator + "on " +masechet + "." + str(len(links_netanel)+1)+ "."+ str(len(korban))
                        links.append(link(netanelink, roash))
                        netanel +=1
            si.append(co)
Пример #37
0
def search(parsed):
    for i in parsed:
        for j in i:
            found =  re.finditer(ur'\(דף(.*?)\)(.*?)\(', j)
            for find in found:
                daf = find.group(1)
                text =  find.group(2)
                if daf[len(daf)-1] == '.':
                    #print daf
                    amud = 'a'
                elif daf[len(daf)-1] == ':':
                    amud = 'b'

                new_daf = daf[0:len(daf.strip())].strip()
                #print new_daf
                try:
                    daf_num = hebrew.heb_string_to_int(new_daf)
                    #print str(daf_num) + amud
                    match(daf_num, amud, text)
                except KeyError:
                    pass
Пример #38
0
def search1(parsed):
    for i in parsed:
       for k in i:
           for j in k:
               found =  re.finditer(ur'@44\[דף(.*?)\](.*?)\(', j)
               for find in found:
                   daf = find.group(1)

                   text =  find.group(2)

                   if daf.strip().split(" ")[1] == u'ע"א':
                       amud = 'a'
                   elif daf.strip().split(" ")[1] == u'ע"ב':
                       amud = 'b'

                   new_daf = daf.strip().split(" ")[0]
                   try:
                       daf_num = hebrew.heb_string_to_int(new_daf)
                       #print str(daf_num) + amud
                       match(daf_num, amud, text)
                   except KeyError:
                       pass
Пример #39
0
def links(clean_text, shas):
    for i, page in enumerate(clean_text):
        for j, chapter in enumerate(page):
            a = re.finditer(ur"@88(.+?)@77(.+)", chapter)
            for link in a:
                heb_links = re.split(" ", link.group(1))
                daf = heb_links[1]
                amud = heb_links[2]
                if amud[2] == ur'א':
                    eng_amud = 'a'
                elif amud[2] == ur'ב':
                    eng_amud = ur'b'
                daf = hebrew.heb_string_to_int(daf)
                quote = re.split(" ", link.group(2).strip())
                matching(quote,
                         daf,
                         eng_amud,
                         i,
                         j,
                         shas,
                         words=len(quote),
                         ratio=70)
Пример #40
0
def search(parsed):
    for i,perek in enumerate(parsed):
        for j,pasuk in enumerate(perek):
            found =  re.finditer(ur'\(דף(.*?)\)', pasuk)
            for find in found:
                daf = find.group(1)
                #text =  find.group(2)
                if daf[len(daf)-1] == '.':
                    #print daf
                    amud = 'a'
                elif daf[len(daf)-1] == ':':
                    amud = 'b'

                new_daf = daf[0:len(daf.strip())].strip()
                #print new_daf
                try:
                    daf_num = hebrew.heb_string_to_int(new_daf)
                    #print str(daf_num) + amud +  " " + str(i) + " " + str(j+1)
                    links.append(link("{}".format(masechet) + "." + str(daf_num) + amud, "Rosh on {}, Hilchot Seder Avodat Yom HaKippurim".format(masechet) + "." + str(i) + "." + str(j+1)))
                    #links.append(link(
                    #match(daf_num, amud, text)
                except KeyError:
                    pass
Пример #41
0
def parse(text):
    links_netanel = []
    netanel = 0
    rosh = []
    a = re.split(ur'@22([^@]*)', text)
    for seif, cont in zip(a[1::2], a[2::2]):
        si = []
        if ur'[*]' in seif:
                print seif
                netanel += 1
        #si.append(seif)
        content = re.split('@66', cont)
        seif = re.sub(ur'[\s\[\*\]]',"", seif)
        seif = hebrew.heb_string_to_int(seif.strip())
        for num, co in enumerate(content):
            a = re.findall('\[\*\](.{6})', co)
            #for b in a:
                #print b
            netanel +=len(a)
            #print seif, num, netanel - len(a), netanel
            #print len(a)
            si.append(co)
        rosh.append(si)
Пример #42
0
def parse(text):
    links_netanel = []
    netanel = 0
    rosh = []
    a = re.split(ur'@22([^@]*)', text)
    for seif, cont in zip(a[1::2], a[2::2]):
        si = []
        if ur'[*]' in seif:
            print seif
            netanel += 1
        #si.append(seif)
        content = re.split('@66', cont)
        seif = re.sub(ur'[\s\[\*\]]', "", seif)
        seif = hebrew.heb_string_to_int(seif.strip())
        for num, co in enumerate(content):
            a = re.findall('\[\*\](.{6})', co)
            #for b in a:
            #print b
            netanel += len(a)
            #print seif, num, netanel - len(a), netanel
            #print len(a)
            si.append(co)
        rosh.append(si)
Пример #43
0
def parse(text):
    old_num = 0
    dibbur = ""
    #simanim = re.finditer(ur'(@[0-9][0-9])\n?(@[0-9][0-9])(.*\n*)', text)
    simanim = re.split("@77", text)
    bayit_chadash = []
    perek = []
    i = 1
    for siman in simanim:
        simans = re.finditer("@11(.*)@33(.*)", siman)

        for s in simans:
            dibbur = "(" + str(i) + ")" + "<b>" + s.group(
                1) + '</b>' + s.group(2)
            print i
            i = i + 1
        if "@22" not in siman:

            perek.append(dibbur)
        elif "@22" in siman:
            #i = 1
            num = re.findall("@22(.*)", siman)[0]
            new_num = hebrew.heb_string_to_int(num.strip())
            #print new_num
            if new_num - old_num != 1:
                for k in range(1, new_num - old_num):
                    bayit_chadash.append([])
            old_num = new_num
            bayit_chadash.append(perek)
            perek = []

            perek.append(dibbur)
            i = 1
    bayit_chadash.append(perek)
    #print len(bayit_chadash)
    return bayit_chadash[1:len(bayit_chadash)]
Пример #44
0
def parse(text):
    """Split text into daf sections and collect bolded seifim per amud.

    The text is split on two-digit '@NN' tags followed by a fixed Hebrew
    abbreviation; the captured header runs up to the next '@'.  Headers with
    more than four space-separated tokens supply the daf number (token 2),
    the amud marker (token 3) and a heading word (token 4); other headers
    are skipped.  Results accumulate in ``agadot``, which starts with two
    empty lists.

    NOTE(review): the visible body has no return statement and never appends
    the last ``seifim`` of a daf to ``agadot`` unless an amud-b heading is
    seen; the snippet may be truncated -- confirm against the full file.
    """
    agadot=[[],[]]
    old_number = 1
    dappim = re.split(ur'@[0-9][0-9]ח"א([^@]*)', text)
    # Odd indices hold the captured headers, even indices the text after them.
    for daf, content in zip(dappim[1::2],dappim[2::2]):
        # NOTE(review): `same` and `ab` are set below but never read in the
        # visible code.
        same = False
        ab = False
        seifim =[]
        if len(daf.split(" ")) > 4:
            # Always true inside this branch (more than 4 tokens means at
            # least 5), so `string` is bound whenever it is used further down.
            if len(daf.split(" "))>=5:
                string= daf.split(" ")[4]
                #print string
            daf_n = daf.split(" ")[2]
            print daf_n
            amud = daf.split(" ")[3].strip()
            # The third character of the amud token selects the side.
            if amud[2].strip()== ur"א":
                amuds = 'a'
            elif amud[2].strip() == ur"ב":
                amuds ='b'
            else:
                print "did it get here", amud
        else:
            continue
        number =  hebrew.heb_string_to_int(daf_n)
        if number - old_number==0:
            same =True
        # Two empty lists are added for every daf number that was skipped
        # (presumably one per amud -- TODO confirm).
        if number - old_number>1:
            for i in range(1,number-old_number):
                agadot.append([])
                agadot.append([])
        old_number = number
        # One match per line: a leading '@NN' tag or digit, then the rest of
        # the line.
        simanim = re.finditer(ur'(?:@[0-9][0-9]|[0-9])(.*)',content)
        for match in simanim:
            # Stop at the first line containing a digit directly followed by
            # the Hebrew word for daf.
            if re.search(ur'[0-9]דף', match.group(0)) is not None:
                print match.group(0)
                break
            siman = match.group(0)
            siman = re.split("@77([^(?:@|[0-9]]*)",siman)
            for simans in siman:
                if simans != "":
                    # Rebinding `simanim` does not disturb the enclosing
                    # loop, which already holds its own iterator.
                    simanim = re.split('(?:@[0-9][0-9]|[0-9])',simans)
                    # Text before the first tag gets the header's heading
                    # word in bold.
                    if len(simanim[0])>1:
                        simanim_string = "<b>" +string + " " + '</b>'+ simanim[0]
                        seifim.append(simanim_string)
                        # A bold amud-b heading flushes the seifim collected
                        # so far and starts a new list with this entry.
                        if ur'<b>ע"ב' in simanim_string:
                            amuds="b"
                            print simanim_string
                            agadot.append(seifim)
                            seifim=[]
                            seifim.append(simanim_string)
                            ab = True
                    if len(simanim) > 1:
                       # Odd-indexed pieces become bold headings for the
                       # piece that follows each of them.
                       for i in range(1,len(simanim)-1,2):
                            simanim_string = ur'<b>' + simanim[i] + " " + ur'</b>' + simanim[i+1]
                            if u'<b>ע"ב' in simanim_string:
                                print daf_n
                                amuds="b"
                                print simanim_string
                                agadot.append(seifim)
                                seifim=[]
                                ab = True
                            if len(simanim_string) > 1:
                                seifim.append(simanim_string)
Пример #45
0
def parse(text):
    i = 0
    kb = re.split(ur"@00הלכות כלאי בגדים(.*?)@00פרק תשיעי", text)
    begadim = kb[1]
    ending = re.split(ur"@00הלכות מקוואות", kb[2])
    bdy = kb[0] + ending[0]
    mikva = ending[1]
    old_numeri = 0
    rosh = []
    kileiggadim = []
    hilchotmikvaot = []
    chapters = re.split(ur'(?:@00|@99)([^@]*)', bdy)
    for chapter_num, chapter in zip(chapters[1::2], chapters[2::2]):
        mispar = chapter_num.strip().split(" ")[1]
        if mispar.encode('utf-8') in misparim.keys():
            mispar_numeri = misparim[mispar.encode('utf-8')]
            print mispar_numeri
            if mispar_numeri - old_numeri > 1:
                for i in range(1, mispar_numeri - old_numeri):
                    rosh.append([])
            old_numeri = mispar_numeri
        print mispar
        #if len(chapter)<=1:
        #   pass
        #else:
        perek = []
        a = re.split(ur'@22([^@]*)', chapter)
        for seif, cont in zip(a[1::2], a[2::2]):
            si = []
            content = re.split('@66', cont)
            seif = re.sub(ur'[^א-ת]', "", seif)
            seif = hebrew.heb_string_to_int(seif.strip())
            for num, co in enumerate(content):
                a = re.findall('\[\*\]', co)
                #for b in a:
                #print b, seif
                si.append(co)
            perek.append(si)
        if len(perek) is not 0:
            rosh.append(perek)
        # print len(rosh)
    search2(rosh)

    #take care of begadim
    b = re.split(ur'@22([^@]*)', begadim)
    for sei, con in zip(b[1::2], b[2::2]):
        si = []
        conten = re.split('@66', con)
        sei = re.sub(ur'[^א-ת]', "", sei)
        sei = hebrew.heb_string_to_int(sei.strip())
        for num, co in enumerate(conten):
            b = re.findall('\[\*\]', co)
            #for c in b:
            #print c, sei
            si.append(co)
        kileiggadim.append(si)
    b = re.split(ur'@22([^@]*)', mikva)
    for sei, con in zip(b[1::2], b[2::2]):
        si = []
        conten = re.split('@66', con)
        sei = re.sub(ur'[^א-ת]', "", sei)
        sei = hebrew.heb_string_to_int(sei.strip())
        for num, co in enumerate(conten):
            b = re.findall('\[\*\]', co)
            for c in b:
                print c, sei
            si.append(co)
        hilchotmikvaot.append(si)
    #take care of mikva
    return rosh, kileiggadim, hilchotmikvaot
Пример #46
0
def parse(text):
    i=0
    kb = re.split(ur"@00הלכות כלאי בגדים(.*?)@00פרק תשיעי", text)
    begadim = kb[1]
    ending = re.split(ur"@00הלכות מקוואות",kb[2])
    bdy = kb[0] + ending[0]
    mikva = ending[1]
    old_numeri = 0
    rosh = []
    kileiggadim =[]
    hilchotmikvaot =[]
    chapters = re.split(ur'(?:@00|@99)([^@]*)', bdy)
    for chapter_num, chapter in zip(chapters[1::2], chapters[2::2]):
        mispar = chapter_num.strip().split(" ")[1]
        if mispar.encode('utf-8') in misparim.keys():
            mispar_numeri = misparim[mispar.encode('utf-8')]
            print mispar_numeri
            if mispar_numeri - old_numeri > 1:
               for i in range(1,mispar_numeri-old_numeri):
                    rosh.append([])
            old_numeri = mispar_numeri
        print mispar
        #if len(chapter)<=1:
         #   pass
        #else:
        perek = []
        a = re.split(ur'@22([^@]*)', chapter)
        for seif, cont in zip(a[1::2], a[2::2]):
            si = []
            content = re.split('@66', cont)
            seif = re.sub(ur'[^א-ת]',"", seif)
            seif = hebrew.heb_string_to_int(seif.strip())
            for num, co in enumerate(content):
                a = re.findall('\[\*\]', co)
                #for b in a:
                    #print b, seif
                si.append(co)
            perek.append(si)
        if len(perek) is not 0:
            rosh.append(perek)
           # print len(rosh)
    search2(rosh)

    #take care of begadim
    b  = re.split(ur'@22([^@]*)',begadim )
    for sei, con in zip(b[1::2], b[2::2]):
        si = []
        conten = re.split('@66', con)
        sei = re.sub(ur'[^א-ת]',"", sei)
        sei = hebrew.heb_string_to_int(sei.strip())
        for num, co in enumerate(conten):
            b = re.findall('\[\*\]', co)
            #for c in b:
                #print c, sei
            si.append(co)
        kileiggadim.append(si)
    b  = re.split(ur'@22([^@]*)', mikva)
    for sei, con in zip(b[1::2], b[2::2]):
        si = []
        conten = re.split('@66', con)
        sei = re.sub(ur'[^א-ת]',"", sei)
        sei = hebrew.heb_string_to_int(sei.strip())
        for num, co in enumerate(conten):
            b = re.findall('\[\*\]', co)
            for c in b:
                print c, sei
            si.append(co)
        hilchotmikvaot.append(si)
    #take care of mikva
    return rosh , kileiggadim,hilchotmikvaot
Пример #47
0
def parse1(text):
    old_numeri = 0
    if os.path.isfile('source/Korban_Netanel_on_{}.txt'.format(masechet)) or os.path.isfile('source/Pilpula_Charifta_on_{}.txt'.format(masechet)):
        nose_kelim = nosekelim.open_file()
        fixed = nosekelim.parse(nose_kelim)
        links_netanel = []
        netanel = 0
    rosh = []
    chapters = re.split(ur'(?:@00|@99)([^@]*)', text)
    for chapter_num, chapter in zip(chapters[1::2], chapters[2::2]):
        mispar = chapter_num.strip().split(" ")[1]
        if mispar.encode('utf-8') in misparim.keys():
            mispar_numeri = misparim[mispar.encode('utf-8')]
            print mispar_numeri
            if mispar_numeri - old_numeri > 1:
               for i in range(1,mispar_numeri-old_numeri):
                    rosh.append([])
                    #print "length of rosh", len(rosh)
            old_numeri = mispar_numeri
        print mispar
        #if len(chapter)<=1:
         #   pass
        #else:
        perek = []
        a = re.split(ur'@22([^@]*)', chapter)
        for seif, cont in zip(a[1::2], a[2::2]):
            si = []
            korban =[]
            #print seif
            if ur'[*]' in seif and (os.path.isfile('source/Korban_Netanel_on_{}.txt'.format(masechet)) or os.path.isfile('source/Pilpula_Charifta_on_{}.txt'.format(masechet))) and netanel < len(fixed):
               # print "hello", seif, netanel, len(fixed)
                if os.path.isfile('source/Korban_Netanel_on_{}.txt'.format(masechet)):
                    commentator = "Korban Netanel"
                if os.path.isfile('source/PilPula_Charifta_on_{}.txt'.format(masechet)):
                    commentator = "Pilpula Charifta"
                korban.append(fixed[netanel])
                roash = "Rosh on %s." % masechet  +str(len(rosh)+1) + "." + str(len(perek)+1) + ".1"
                netanelink = commentator + " on " +  masechet +"."+ str(len(links_netanel)+1) + ".1"
                #print roash, netanelink
                links.append(link(netanelink, roash))
                netanel += 1
            content = re.split('@66', cont)
            seif = re.sub(ur'[^א-ת]',"", seif)
            seif = hebrew.heb_string_to_int(seif.strip())
            for num, co in enumerate(content):
                a = re.findall('\[\*\]', co)
                for b in a:
                 #   print b, seif
                    if (os.path.isfile('source/Korban_netanel_on_{}.txt'.format(masechet)) or os.path.isfile('source/Pilpula_Charifta_on_{}.txt'.format(masechet))) and netanel < len(fixed):
                        if os.path.isfile('source/Korban_netanel_on_{}.txt'.format(masechet)):
                            commentator = "Korban Netanel "
                        if os.path.isfile('source/Pilpula_Charifta_on_{}.txt'.format(masechet)):
                            commentator = "Pilpula Charifta "
                        korban.append(fixed[netanel])
                        roash = "Rosh on %s." % masechet + str(len(rosh)+1) + "." + str(len(perek)+1) + "." + str(num+1)
                        netanelink = commentator + "on " + masechet + "." + str(len(links_netanel)+1)+ "."+ str(len(korban))
                        #print roash, netanelink
                        links.append(link(netanelink, roash))
                        netanel +=1
                si.append(co)
            if os.path.isfile('source/Korban_Netanel_on_{}.txt'.format(masechet)) or os.path.isfile('source/Pilpula_Charifta_on_{}.txt'.format(masechet)):
                links_netanel.append(korban)
            perek.append(si)
        if len(perek) is not 0:
            rosh.append(perek)
Пример #48
0
def parse(text):
    """Split text into daf sections and collect bolded seifim per amud.

    The text is split on two-digit '@NN' tags followed by a fixed Hebrew
    abbreviation; the captured header runs up to the next '@'.  Headers with
    more than four space-separated tokens supply the daf number (token 2),
    the amud marker (token 3) and a heading word (token 4); other headers
    are skipped.  Results accumulate in ``agadot``, which starts with two
    empty lists.

    NOTE(review): the visible body has no return statement and never appends
    the last ``seifim`` of a daf to ``agadot`` unless an amud-b heading is
    seen; the snippet may be truncated -- confirm against the full file.
    """
    agadot=[[],[]]
    old_number = 1
    dappim = re.split(ur'@[0-9][0-9]ח"א([^@]*)', text)
    # Odd indices hold the captured headers, even indices the text after them.
    for daf, content in zip(dappim[1::2],dappim[2::2]):
        # NOTE(review): `same` and `ab` are set below but never read in the
        # visible code.
        same = False
        ab = False
        seifim =[]
        if len(daf.split(" ")) > 4:
            # Always true inside this branch (more than 4 tokens means at
            # least 5), so `string` is bound whenever it is used further down.
            if len(daf.split(" "))>=5:
                string= daf.split(" ")[4]
                #print string
            daf_n = daf.split(" ")[2]
            print daf_n
            amud = daf.split(" ")[3].strip()
            # The third character of the amud token selects the side.
            if amud[2].strip()== ur"א":
                amuds = 'a'
            elif amud[2].strip() == ur"ב":
                amuds ='b'
            else:
                print "did it get here", amud
        else:
            continue
        number =  hebrew.heb_string_to_int(daf_n)
        if number - old_number==0:
            same =True
        # Two empty lists are added for every daf number that was skipped
        # (presumably one per amud -- TODO confirm).
        if number - old_number>1:
            for i in range(1,number-old_number):
                agadot.append([])
                agadot.append([])
        old_number = number
        # One match per line: a leading '@NN' tag or digit, then the rest of
        # the line.
        simanim = re.finditer(ur'(?:@[0-9][0-9]|[0-9])(.*)',content)
        for match in simanim:
            # Stop at the first line containing a digit directly followed by
            # the Hebrew word for daf.
            if re.search(ur'[0-9]דף', match.group(0)) is not None:
                print match.group(0)
                break
            siman = match.group(0)
            siman = re.split("@77([^(?:@|[0-9]]*)",siman)
            for simans in siman:
                if simans != "":
                    # Rebinding `simanim` does not disturb the enclosing
                    # loop, which already holds its own iterator.
                    simanim = re.split('(?:@[0-9][0-9]|[0-9])',simans)
                    # Text before the first tag gets the header's heading
                    # word in bold.
                    if len(simanim[0])>1:
                        simanim_string = "<b>" +string + " " + '</b>'+ simanim[0]
                        seifim.append(simanim_string)
                        # A bold amud-b heading flushes the seifim collected
                        # so far and starts a new list with this entry.
                        if ur'<b>ע"ב' in simanim_string:
                            amuds="b"
                            print simanim_string
                            agadot.append(seifim)
                            seifim=[]
                            seifim.append(simanim_string)
                            ab = True
                    if len(simanim) > 1:
                       # Odd-indexed pieces become bold headings for the
                       # piece that follows each of them.
                       for i in range(1,len(simanim)-1,2):
                            simanim_string = ur'<b>' + simanim[i] + " " + ur'</b>' + simanim[i+1]
                            if u'<b>ע"ב' in simanim_string:
                                print daf_n
                                amuds="b"
                                print simanim_string
                                agadot.append(seifim)
                                seifim=[]
                                ab = True
                            if len(simanim_string) > 1:
                                seifim.append(simanim_string)
Пример #49
0
def parse(text):
    """Parse text into a list of simanim and register commentary links.

    The text is split into parts on '@00'.  Within each part every line
    matching "<tags>@22<siman label>@11<body>" becomes one siman: the label
    is reduced to Hebrew letters and converted with
    hebrew.heb_string_to_int, the body is bolded up to '@33' when exactly
    one '@33' is present, and every '@66' in the result is replaced by a
    running "[n]" counter.  Links are appended to the module-level ``links``
    list via ``addlink`` while iterating over the module-level ``karo``.

    Returns ``tur``, a list holding one single-element list per siman.
    """
    older_siman = 0
    arbaturim = []
    tur = []
    hilchos = re.split(ur'@00', text)  # split into the named parts
    for halacha in hilchos:
        if len(halacha) > 0:
            # NOTE(review): halacha_name is assigned but never read in this
            # function.
            halacha_name = halacha.splitlines()[0]
            #print halacha_name #get the name of the part
        simanim = re.finditer(
            ur'(@?[0-9]?[0-9]?@?[0-9]?[0-9]?)@22(.*)@11(.*)', halacha
        )  # cut the text into simanim: leading tags, siman letter, and body with commentary tags
        i = 1
        for simans in simanim:
            localbet_yosef = 0
            siman = simans.group(2)
            # Drop bracketed/parenthesised notes, then everything that is
            # not a Hebrew letter.
            siman = re.sub(ur'[\(\[].*?[\)\]]', "", siman)
            siman = re.sub(ur'[^\u05d0-\u05ea]', "", siman)
            if len(siman) > 4:
                # print simans.group(2)
                #print simans.group(3)
                pass
            roman_siman = hebrew.heb_string_to_int(siman.strip())
            bold = re.split(ur'@33', simans.group(3))
            # From here on `text` no longer refers to the function argument;
            # the outer loops already hold their own iterators.
            if len(bold) == 2:
                text = simans.group(1) + "<b>" + bold[0] + "</b>" + bold[1]
            else:
                text = simans.group(1) + simans.group(3)
            #text1 = re.split(u"(.*?[.:])", text)
            #text1 = filter(None, text1)
            # taking care of links
            try:
                # karo[len(tur)] raises IndexError once tur has outgrown
                # karo; the handler below then skips the siman.
                for k in range(0, len(karo[len(tur)])):
                    #print len(tur)+1,k
                    #for k in range(1,len(re.findall("@66",simans.group(0)))):
                    #print simans.group(0)
                    if "@66" in simans.group(1):
                        links.append(addlink(len(tur) + 1, 1, k + 1))
                    localbet_yosef += len(re.findall("@66", simans.group(2)))
                    if "@66" in simans.group(2):
                        links.append(addlink(len(tur) + 1, 1, k + 1))
                    # NOTE(review): `text` is a string here, so `sifs` is a
                    # single character and can never contain "@66"; the
                    # inner range is therefore always empty -- confirm the
                    # intent.
                    for sif_num, sifs in enumerate(text, start=1):
                        for a in range(1, len(re.findall("@66", sifs))):
                            links.append(
                                addlink(len(tur) + 1, sif_num, a + k + 1))
                #if localbet_yosef - len(karo[len(tur)+1]) != -1:
                #print simans.group(2),roman_siman,  localbet_yosef, len(karo[len(tur)+1])
                #  pass
                # Report simanim that do not follow the previous one directly.
                if roman_siman - older_siman != 1:
                    print siman
                    print roman_siman
                older_siman = roman_siman
                # `count` is not defined in this block (presumably
                # itertools.count -- TODO confirm); the default-argument
                # counter numbers the '@66' tags within this siman from 1.
                text = re.sub(ur'@66',
                              lambda m, c=count(1): '[{}]'.format(next(c)),
                              text)
                tur.append([text])
            except IndexError:
                print "out of index"

    # NOTE(review): arbaturim is filled but not returned.
    arbaturim.append(tur)
    # Recursive nesting depth of a list; non-lists yield False.
    # NOTE(review): max() of an empty sequence raises ValueError, so this
    # debug print fails when tur is empty.
    depth = lambda L: isinstance(L, list) and max(map(depth, L)) + 1
    print depth(tur)
    return tur
Пример #50
0
def parse1(text):
    old_numeri = 0
    if os.path.isfile('source/Korban_Netanel_on_{}.txt'.format(
            masechet)) or os.path.isfile(
                'source/Pilpula_Charifta_on_{}.txt'.format(masechet)):
        nose_kelim = nosekelim.open_file()
        fixed = nosekelim.parse(nose_kelim)
        links_netanel = []
        netanel = 0
    rosh = []
    chapters = re.split(ur'(?:@00|@99)([^@]*)', text)
    for chapter_num, chapter in zip(chapters[1::2], chapters[2::2]):
        mispar = chapter_num.strip().split(" ")[1]
        if mispar.encode('utf-8') in misparim.keys():
            mispar_numeri = misparim[mispar.encode('utf-8')]
            print mispar_numeri
            if mispar_numeri - old_numeri > 1:
                for i in range(1, mispar_numeri - old_numeri):
                    rosh.append([])
                    #print "length of rosh", len(rosh)
            old_numeri = mispar_numeri
        print mispar
        #if len(chapter)<=1:
        #   pass
        #else:
        perek = []
        a = re.split(ur'@22([^@]*)', chapter)
        for seif, cont in zip(a[1::2], a[2::2]):
            si = []
            korban = []
            #print seif
            if ur'[*]' in seif and (
                    os.path.isfile(
                        'source/Korban_Netanel_on_{}.txt'.format(masechet))
                    or os.path.isfile(
                        'source/Pilpula_Charifta_on_{}.txt'.format(masechet))
            ) and netanel < len(fixed):
                # print "hello", seif, netanel, len(fixed)
                if os.path.isfile(
                        'source/Korban_Netanel_on_{}.txt'.format(masechet)):
                    commentator = "Korban Netanel"
                if os.path.isfile(
                        'source/PilPula_Charifta_on_{}.txt'.format(masechet)):
                    commentator = "Pilpula Charifta"
                korban.append(fixed[netanel])
                roash = "Rosh on %s." % masechet + str(
                    len(rosh) + 1) + "." + str(len(perek) + 1) + ".1"
                netanelink = commentator + " on " + masechet + "." + str(
                    len(links_netanel) + 1) + ".1"
                #print roash, netanelink
                links.append(link(netanelink, roash))
                netanel += 1
            content = re.split('@66', cont)
            seif = re.sub(ur'[^א-ת]', "", seif)
            seif = hebrew.heb_string_to_int(seif.strip())
            for num, co in enumerate(content):
                a = re.findall('\[\*\]', co)
                for b in a:
                    #   print b, seif
                    if (os.path.isfile(
                            'source/Korban_netanel_on_{}.txt'.format(masechet))
                            or os.path.isfile(
                                'source/Pilpula_Charifta_on_{}.txt'.format(
                                    masechet))) and netanel < len(fixed):
                        if os.path.isfile(
                                'source/Korban_netanel_on_{}.txt'.format(
                                    masechet)):
                            commentator = "Korban Netanel "
                        if os.path.isfile(
                                'source/Pilpula_Charifta_on_{}.txt'.format(
                                    masechet)):
                            commentator = "Pilpula Charifta "
                        korban.append(fixed[netanel])
                        roash = "Rosh on %s." % masechet + str(
                            len(rosh) + 1) + "." + str(len(perek) +
                                                       1) + "." + str(num + 1)
                        netanelink = commentator + "on " + masechet + "." + str(
                            len(links_netanel) + 1) + "." + str(len(korban))
                        #print roash, netanelink
                        links.append(link(netanelink, roash))
                        netanel += 1
                si.append(co)
            if os.path.isfile('source/Korban_Netanel_on_{}.txt'.format(
                    masechet)) or os.path.isfile(
                        'source/Pilpula_Charifta_on_{}.txt'.format(masechet)):
                links_netanel.append(korban)
            perek.append(si)
        if len(perek) is not 0:
            rosh.append(perek)