示例#1
0
 def Gemara(self, daf, gemara_in_order):
     """Link the next matched gemara line (or range) to this Maharam comment.

     Advances the running Maharam and gemara counters, parses the matched
     line or "start-end" range for this comment out of ``gemara_in_order``,
     and queues a commentary link between the gemara ref and the
     corresponding "Maharam on <masechet>" comment.  Relies on the
     module-level ``masechet``.
     """
     self.maharam_line += 1
     self.gemara_line += 1
     # "0:" marks an unmatched prefix in the match results; strip it.
     gemara_in_order[self.gemara_line] = gemara_in_order[
         self.gemara_line].replace('0:', '')
     if gemara_in_order[self.gemara_line].find('-') >= 0:
         # Matched a range of lines, e.g. "3-5".
         in_order, out_order = gemara_in_order[self.gemara_line].split('-')
     else:
         in_order = gemara_in_order[self.gemara_line]
         out_order = in_order
     masechet_daf_line_start = masechet + " " + AddressTalmud.toStr(
         "en", daf) + ":" + in_order
     masechet_daf_line_end = masechet + " " + AddressTalmud.toStr(
         "en", daf) + ":" + out_order
     try:
         masechet_daf_line = Ref(masechet_daf_line_start).to(
             Ref(masechet_daf_line_end)).normal()
     except Exception:
         # Was a bare except; narrowed so KeyboardInterrupt/SystemExit
         # still escape.  Fall back to the start ref when the range cannot
         # be normalized.
         masechet_daf_line = masechet_daf_line_start
     self.links_to_post.append({
         "refs": [
             masechet_daf_line, "Maharam on " + masechet + "." +
             AddressTalmud.toStr("en", daf) + "." + str(self.maharam_line)
         ],
         "type":
         "commentary",
         "auto":
         True,
         "generated_by":
         "Maharam on " + masechet + " linker",
     })
示例#2
0
def post(text, dh_dict, tractate):
    """Upload the Ramban text for *tractate* and link each comment to its
    matched gemara line."""
    payload = {
        "text": convertDictToArray(text),
        "versionTitle": "Ramban on Talmud",
        "versionSource": "http://www.sefaria.org",
        "language": "he",
    }
    post_text("Chiddushei Ramban on " + tractate, payload)

    matcher = Match(in_order=True,
                    min_ratio=80,
                    guess=False,
                    range=True,
                    can_expand=False)
    he_dafs = get_text_plus(tractate)['he']
    links = []
    for daf in sorted(dh_dict.keys()):
        daf_str = AddressTalmud.toStr("en", daf)
        matches = matcher.match_list(dh_dict[daf], he_dafs[daf - 1],
                                     tractate + " " + daf_str)
        for comment_n, matched in matches.iteritems():
            # "0:" marks an unmatched prefix; strip it before building refs.
            matched = matched.replace("0:", "")
            links.append({
                'refs': [
                    tractate + "." + daf_str + "." + matched,
                    "Chiddushei_Ramban_on_" + tractate + "." + daf_str +
                    "." + str(comment_n),
                ],
                'type': 'commentary',
                'auto': 'True',
                'generated_by': "ramban" + tractate,
            })
    post_link(links)
示例#3
0
def post(text, dh_dict, tractate):
    """Post the Ramban text for *tractate* and link comments to gemara lines.

    Relies on a module-level ``match`` (a Match instance) being configured.
    """
    text_array = convertDictToArray(text)
    send_text = {
        "text": text_array,
        "versionTitle": "Chiddushei HaRamban, Jerusalem 1928-29",
        "versionSource":
        "http://primo.nli.org.il/primo_library/libweb/action/dlDisplay.do?vid=NLI&docId=NNL_ALEPH001294828",
        "language": "he"
    }
    post_text("Chiddushei Ramban on " + tractate, send_text)
    links_to_post = []

    for daf in sorted(dh_dict.keys()):
        dh_list = dh_dict[daf]
        daf_text = Ref(tractate + " " +
                       AddressTalmud.toStr("en", daf)).text('he').text
        results = match.match_list(
            dh_list, daf_text, tractate + " " + AddressTalmud.toStr("en", daf))
        for key, value in results.iteritems():
            # "0:" marks an unmatched prefix in the match result; strip it.
            value = value.replace("0:", "")
            # A duplicated, identical talmud_end assignment was removed here.
            talmud_end = tractate + "." + AddressTalmud.toStr(
                "en", daf) + "." + value
            ramban_end = "Chiddushei_Ramban_on_" + tractate + "." + AddressTalmud.toStr(
                "en", daf) + "." + str(key)
            links_to_post.append({
                'refs': [talmud_end, ramban_end],
                'type': 'commentary',
                'auto': 'True',
                'generated_by': "ramban" + tractate
            })
    post_link(links_to_post)
示例#4
0
def match_and_link(dhs, masechet):
    def base_tokenizer(str):
        str = re.sub(ur"\([^\(\)]+\)", u"", str)
        word_list = re.split(ur"\s+", str)
        word_list = [w for w in word_list if w]  # remove empty strings
        return word_list

    links = []

    for daf in dhs:
        talmud_text = TextChunk(Ref(masechet + "." +
                                    AddressTalmud.toStr("en", daf)),
                                lang="he")
        result = match_ref(talmud_text,
                           dhs[daf],
                           base_tokenizer=base_tokenizer,
                           create_ranges=True)['matches']
        if result != [None]:
            for count, line in enumerate(result):
                assert line is not None
                Ritva_end = "Ritva on " + masechet + "." + str(
                    AddressTalmud.toStr("en", daf)) + "." + str(count + 1)
                talmud_end = line.normal()
                links.append({
                    'refs': [Ritva_end, talmud_end],
                    'type': 'commentary',
                    'auto': 'True',
                    'generated_by': masechet + "Ritva"
                })
    post_link(links)
def print_out_refs(daf, line, segment, prev_daf, prev_line, prev_segment):
    second = "{} {}:{}:{}".format(title, AddressTalmud.toStr("en", daf + 1),
                                  line + 1, segment + 1)
    first = "{} {}:{}:{}".format(title,
                                 AddressTalmud.toStr("en", prev_daf + 1),
                                 prev_line + 1, prev_segment + 1)
    print "First: {}".format(first)
    print "Second: {}\n".format(second)
示例#6
0
def compileCommentaryIntoPage(title, daf):
    """Collect every Hebrew line of *title* on the given daf into one flat list.

    Walks section refs starting at "<title> <daf>.1" until the normalized
    ref no longer mentions the daf, or there is no further section.
    """
    page = []
    ref = Ref(title + " " + AddressTalmud.toStr("en", daf) + ".1")
    while ref is not None and ref.normal().find(AddressTalmud.toStr("en", daf)) >= 0:
        for line in ref.text('he').text:
            page.append(line)
        # next_section_ref() was previously called twice per iteration; hoist it.
        following = ref.next_section_ref()
        ref = following if following != ref else None
    return page
示例#7
0
def find_misshing_DH(max_length):

    """
    Run through Ritva Makkot, and search for lines with an unreasonable amount of words until the first period.
    :param max_length: largest acceptable number of words before the first period.
    :return: None -- problem lines are written to probs_ritva.txt and counts printed.
    """
    count, lines = 0, 0
    curr_daf = 0
    files = ["chiddushei one.txt", "chiddushei two.txt", "chiddushei three.txt",
             "chiddushei four.txt", "chiddushei five.txt"]
    # "with" blocks close the handles even on error (they were leaked before);
    # the unused "text" local was removed.
    with codecs.open('probs_ritva.txt', 'w', 'utf-8') as probs:
        for file in files:
            with codecs.open(file, 'r', 'utf-8') as open_file:
                for line in open_file:
                    line = line.replace('\n', '')
                    if len(line) == 0:
                        continue
                    if line.find(u"#") >= 0:
                        # Daf headers are delimited "#1 ... #2"; track the current daf.
                        start = line.find(u"#1")
                        end = line.find(u"#2")
                        if start > end or start == -1 or end == -1:
                            print '# error'
                        daf = line[start:end]
                        if daf.find(u'ע"ב') >= 0:
                            # Amud bet: advance one amud from the current daf.
                            curr_daf += 1
                        elif daf.find(u'דף') >= 0:
                            # Explicit daf: gematria value, amud-aleph numbering.
                            daf = daf.split(u" ")[1]
                            poss_daf = 2 * getGematria(daf) - 1
                            if poss_daf < curr_daf:
                                print 'daf error'
                            curr_daf = poss_daf
                        else:
                            print 'no daf'
                    line = line.replace('@1', '').replace('@2', '')
                    words = line.split()

                    for index, word in enumerate(words):

                        # NOTE(review): "lines" counts words examined, not
                        # lines -- kept as-is to preserve the printed totals.
                        lines += 1

                        if word.find(u'.') >= 0:
                            break

                        elif index > max_length:
                            # Period found too late: the dh is unreasonably long.
                            probs.write('file: ' + str(file) + "\n")
                            probs.write('current daf:' + AddressTalmud.toStr('en', curr_daf) + "\n")
                            probs.write('line without DH:\t' + ' '.join(words[:max_length]) + "\n\n\n")
                            count += 1
                            break

                    else:
                        # No period at all in the line: no dh present.
                        probs.write(u'file: ' + str(file) + u"\n")
                        probs.write(u'current daf:' + AddressTalmud.toStr('en', curr_daf) + u"\n")
                        probs.write(u'line without DH:\t' + u' '.join(words) + u"\n\n\n")
                        count += 1
    print count, lines
示例#8
0
def compileCommentaryIntoPage(title, daf):
    """Gather every Hebrew line of *title* on *daf* into a single list.

    Iterates section refs from "<title> <daf>.1" while the normalized ref
    still mentions the daf.
    """
    page = []
    ref = Ref(title + " " + AddressTalmud.toStr("en", daf) + ".1")
    while ref is not None and ref.normal().find(AddressTalmud.toStr("en", daf)) >= 0:
        for line in ref.text('he').text:
            page.append(line)
        # Hoisted: next_section_ref() used to be evaluated twice per loop.
        nxt = ref.next_section_ref()
        ref = nxt if nxt != ref else None
    return page
示例#9
0
def compileCommentaryIntoPage(title, daf):
    """Flatten all Hebrew lines of *title* on the given daf into one list,
    following the 'next' pointers returned by get_text_plus."""
    page = []
    next = title + " " + AddressTalmud.toStr("en", daf) + ".1"
    while next is not None and next.find(AddressTalmud.toStr("en", daf)) >= 0:
        response = get_text_plus(next)
        for he_line in response['he']:
            page.append(he_line)
        next = response['next']
    return page
def compileCommentaryIntoPage(title, daf):
    """Accumulate the Hebrew lines of *title* on *daf* across all of its
    sections into a single flat list."""
    page = []
    next = title + " " + AddressTalmud.toStr("en", daf) + ".1"
    while next is not None and next.find(AddressTalmud.toStr("en", daf)) >= 0:
        data = get_text_plus(next)
        page.extend(data['he'])
        next = data['next']
    return page
示例#11
0
def match_and_link(dhs, masechet):
    """Match the Ritva dhs of each daf against the gemara text and post the
    resulting commentary links."""
    matcher = Match(in_order=True, min_ratio=80, guess=False, range=True, can_expand=False)
    links = []
    for daf in dhs:
        daf_str = AddressTalmud.toStr("en", daf)
        talmud_text = get_text_plus(masechet + "." + daf_str)['he']
        result = matcher.match_list(dhs[daf], talmud_text)
        for line in result:
            # "0:" marks an unmatched prefix; strip it.
            talmud_range = result[line].replace("0:", "")
            links.append({
                'refs': [
                    "Ritva on " + masechet + "." + str(daf_str) + "." + str(line),
                    masechet + "." + daf_str + "." + talmud_range,
                ],
                'type': 'commentary',
                'auto': 'True',
                'generated_by': masechet + "Ritva",
            })
    post_link(links)
示例#12
0
def lookForLineInCommentary(title, daf, line_n):
    """Return "<section ref>.<local line>" for the line_n-th line of *title*
    on *daf*, or "" when the daf has fewer than line_n lines.
    """
    total_count = 0
    next = title + " " + AddressTalmud.toStr("en", daf) + ":1"
    # Guard against next being None: get_text_plus returns {'next': None} at
    # the end of the text, and the unguarded .find() raised AttributeError
    # (the Ref-based sibling of this function already had this check).
    while next is not None and next.find(AddressTalmud.toStr("en", daf)) >= 0:
        text = get_text_plus(next)
        local_count = 0
        for line in text['he']:
            local_count += 1
            total_count += 1
            if total_count == line_n:
                return next + "." + str(local_count)
        next = text['next']
    return ""
示例#13
0
def lookForLineInCommentary(title, daf, line_n):
    """Return the normalized ref of the line_n-th line of *title* on *daf*,
    or "" when the daf runs out of lines first."""
    seen = 0
    ref = Ref(title + " " + AddressTalmud.toStr("en", daf) + ":1")
    while ref is not None and ref.normal().find(AddressTalmud.toStr("en", daf)) >= 0:
        for position, _line in enumerate(ref.text('he').text, 1):
            seen += 1
            if seen == line_n:
                return ref.normal() + "." + str(position)
        ref = ref.next_section_ref() if ref.next_section_ref() != ref else None
    return ""
示例#14
0
def lookForLineInCommentary(title, daf, line_n):
    """Locate line number *line_n* of *title* on *daf* and return its ref
    ("<section>.<line>"), or "" if not found."""
    counted = 0
    ref = Ref(title + " " + AddressTalmud.toStr("en", daf) + ":1")
    while ref is not None and ref.normal().find(AddressTalmud.toStr("en", daf)) >= 0:
        for idx, _ in enumerate(ref.text('he').text):
            counted += 1
            if counted == line_n:
                return ref.normal() + "." + str(idx + 1)
        ref = ref.next_section_ref() if ref.next_section_ref() != ref else None
    return ""
def lookForLineInCommentary(title, daf, line_n):
    """Return "<section ref>.<local line>" for the line_n-th line of *title*
    on *daf*; "" if the daf has fewer than line_n lines.
    """
    total_count = 0
    next = title + " " + AddressTalmud.toStr("en", daf) + ":1"
    # Guard against a None 'next' from get_text_plus at the end of the text;
    # the unguarded .find() previously raised AttributeError there.
    while next is not None and next.find(AddressTalmud.toStr("en", daf)) >= 0:
        text = get_text_plus(next)
        local_count = 0
        for line in text['he']:
            local_count += 1
            total_count += 1
            if total_count == line_n:
                return next + "." + str(local_count)
        next = text['next']
    return ""
示例#16
0
    def postLinks(self):
        """Match Maharam dhs (tosafot/rashi/gemara) per daf and post the links.

        For every daf at or past 179, matches the stored dh lists against the
        corresponding Tosafot / Rashi / gemara texts, then dispatches each dh
        by category to build self.links_to_post, which is posted at the end.
        Relies on the module-level ``masechet``.
        """
        def base_tokenizer(str):
            # Drop parenthesized glosses, then split on whitespace.
            str = re.sub(ur"\([^\(\)]+\)", u"", str)
            word_list = re.split(ur"\s+", str)
            word_list = [w for w in word_list if w]  # remove empty strings
            return word_list

        mishnah_in_order = {}
        mishnah_out_order = {}
        links_to_post = []
        for daf in sorted(self.dh1_dict.keys()):
            # NOTE(review): dafs below 179 are skipped -- presumably already
            # handled in an earlier run; confirm before reusing this script.
            if daf < 179:
                continue
            print daf
            # Per-daf counters consumed by the category handlers below.
            self.maharam_line = 0
            self.rashi_line = -1
            self.tosafot_line = -1
            self.gemara_line = -1
            mishnah_line = 0
            tosafot1_arr = self.tosafot1_dict[daf]
            rashi1_arr = self.rashi1_dict[daf]
            gemara1_arr = self.gemara1_dict[daf]
            print "matching tosafot"+str(len(tosafot1_arr))
            tosafot_text = Ref("Tosafot on "+masechet+"."+AddressTalmud.toStr("en", daf)).text('he')
            tosafot1_arr = [text.decode('utf-8') for text in tosafot1_arr]
            tosafot_in_order = match_ref(tosafot_text, tosafot1_arr, base_tokenizer, self.dh_extract_method, verbose=True)
            tosafot_in_order = self.convertToOldFormat(tosafot_in_order)
            # Bava Batra past daf 57 is skipped for Rashi -- presumably
            # because Rashbam takes over there (cf. the getTC fallback).
            if not (masechet == "Bava Batra" and daf > 57):
                print "matching rashi"+str(len(rashi1_arr))
                rashi_text = Ref("Rashi on "+masechet+"."+AddressTalmud.toStr("en", daf)).text('he')
                rashi1_arr = [text.decode('utf-8') for text in rashi1_arr]
                rashi_in_order = match_ref(rashi_text, rashi1_arr, base_tokenizer, self.dh_extract_method, verbose=True)
                rashi_in_order = self.convertToOldFormat(rashi_in_order)
            print "matching gemara"+str(len(gemara1_arr))
            gemara_text = Ref(masechet+" "+AddressTalmud.toStr("en", daf)).text('he')
            gemara1_arr = [text.decode('utf-8') for text in gemara1_arr]
            gemara_in_order = match_ref(gemara_text, gemara1_arr, base_tokenizer, self.dh_extract_method, verbose=True)
            gemara_in_order = self.convertToOldFormat(gemara_in_order)
            dh1_arr = self.dh1_dict[daf]
            print "done matching"
            for category, dh in self.dh1_dict[daf]:
                print category
                # NOTE(review): if the Rashi branch above was skipped,
                # rashi_in_order here is stale or undefined -- verify the
                # Bava Batra > 57 case.
                if category == 'rashi' or category == 'tosafot':
                    self.RashiOrTosafot(daf, category, rashi_in_order, tosafot_in_order)
                elif category == 'gemara':
                    self.Gemara(daf, gemara_in_order)
                #elif category == "mishnah":
                #    self.Mishnah(daf, mishnah_in_order)
                elif category == 'paragraph' and self.maharam_line == 0:
                    self.maharam_line+=1
        post_link(self.links_to_post)
示例#17
0
 def Gemara(self, daf, results):
     """Queue a link between the next gemara match and this commentary line.

     A match of '0' means the dh was not found; the commentary ref is then
     recorded in self.missing_ones instead.
     """
     self.maharam_line += 1
     self.which_line['gemara'] += 1
     base_ref = results['gemara'][self.which_line['gemara']]
     comm_ref = self.title + " on " + self.masechet + "." + AddressTalmud.toStr("en", daf) + "." + str(self.maharam_line)
     if base_ref == '0':
         self.missing_ones.append(comm_ref)
     else:
         self.links_to_post.append({
             "refs": [base_ref, comm_ref],
             "type": "commentary",
             "auto": True,
             "generated_by": self.title + self.masechet + " linker",
         })
示例#18
0
 def Gemara(self, daf, gemara_in_order):
     """Link the next matched gemara line to this Maharam comment, or log the
     comment as missing when the match is '0'.  Uses the module-level
     ``masechet``."""
     self.maharam_line += 1
     self.gemara_line += 1
     matched = gemara_in_order[self.gemara_line]
     maharam_ref = "Maharam on " + masechet + "." + AddressTalmud.toStr("en", daf) + "." + str(self.maharam_line)
     if matched == '0':
         self.missing_ones.append(maharam_ref)
     else:
         self.links_to_post.append({
             "refs": [matched, maharam_ref],
             "type": "commentary",
             "auto": True,
             "generated_by": "Maharam on " + masechet + " linker",
         })
示例#19
0
 def getTC(self, category, daf, masechet):
     """Return the Hebrew TextChunk for *category* on the given daf.

     For 'rashi', falls back to Rashbam when the Rashi text is empty.
     Returns None for an unrecognized category.
     """
     daf_str = AddressTalmud.toStr("en", daf)
     if category == "tosafot":
         return Ref("Tosafot on " + masechet + "." + daf_str).text('he')
     if category == "gemara":
         return Ref(masechet + " " + daf_str).text('he')
     if category == "rashi":
         rashi = Ref("Rashi on " + masechet + "." + daf_str).text('he')
         if len(rashi.text) == 0:
             return Ref("Rashbam on " + masechet + "." + daf_str).text('he')
         return rashi
示例#20
0
 def Gemara(self, daf, results):
     """Record a gemara->commentary link for the current line, or mark the
     commentary ref as missing when the match is '0'."""
     self.maharam_line += 1
     self.which_line['gemara'] += 1
     source = results['gemara'][self.which_line['gemara']]
     target = self.title + " on " + self.masechet + "." + AddressTalmud.toStr("en", daf) + "." + str(self.maharam_line)
     if source == '0':
         self.missing_ones.append(target)
         return
     self.links_to_post.append({
         "refs": [source, target],
         "type": "commentary",
         "auto": True,
         "generated_by": self.title + self.masechet + " linker",
     })
def getLog(siman, result, dh_dict, comm):
    """Build human-readable log entries for unmatched and ambiguous dh
    matches on one siman/daf.  Uses the module-level ``title_book`` and
    ``title_comm`` for the sefaria.org URLs."""
    log = []
    for key in result:
        line_n = result[key]
        if line_n[0] == 0:
            # No match at all for this dh.
            entry = ("did not find dh:\n" + str(dh_dict[siman][key - 1]) +
                     "\n in " + title_book + ", Daf " +
                     AddressTalmud.toStr("en", siman) + ":")
            entry += ("\nwww.sefaria.org/" + title_book.replace(" ", "_") +
                      "." + AddressTalmud.toStr("en", siman))
            entry += ("\ntext:<b>" + str(dh_dict[siman][key - 1]) + ".</b> " +
                      str(comm[siman][key - 1]) + "\n\n")
            log.append(entry)
        elif len(line_n) > 1:
            # Ambiguous: several candidate lines, first one is the best guess.
            best = line_n[0]
            entry = ("looked for dh:\n" + str(dh_dict[siman][key - 1]) +
                     "\n in " + title_book + ", Daf " +
                     AddressTalmud.toStr("en", siman))
            entry += " and guessed the dh matches to line " + str(best) + ":"
            entry += ("\nwww.sefaria.org/" + title_comm.replace(" ", "_") +
                      "." + AddressTalmud.toStr("en", siman) + "." + str(best))
            entry += "\nbut other options include:\n"
            for guess in line_n:
                if guess != line_n[0]:
                    entry += ("line " + str(guess) + ": www.sefaria.org/" +
                              title_book.replace(" ", "_") + "." +
                              AddressTalmud.toStr("en", siman) + "." +
                              str(guess) + " ,\n")
            entry = entry[0:-1]  # drop the trailing newline of the last option
            log.append(entry + "\n\n")
    return log
示例#22
0
 def Commentary(self, daf, category, results):
     """Link the next *category* match to this commentary line and to the
     underlying gemara.

     A match of '0' means the dh was not found; the commentary ref is then
     recorded in self.missing_ones instead of being linked.
     """
     self.maharam_line += 1
     self.which_line[category] += 1
     # Removed an unused local ("title") that was computed but never read.
     base_ref = results[category][self.which_line[category]]
     comm_ref = self.title + " on " + self.masechet + "." + AddressTalmud.toStr(
         "en", daf) + "." + str(self.maharam_line)
     if base_ref == '0':
         self.missing_ones.append(comm_ref)
     else:
         self.links_to_post.append({
             "refs": [base_ref, comm_ref],
             "type":
             "commentary",
             "auto":
             True,
             "generated_by":
             self.title + self.masechet + " linker"
         })
         # Also link the commentary straight to the underlying gemara ref.
         gemara_ref = self.getGemaraRef(base_ref)
         self.links_to_post.append({
             "refs": [comm_ref, gemara_ref],
             "type":
             "commentary",
             "auto":
             True,
             "generated_by":
             self.title + self.masechet + " linker"
         })
示例#23
0
 def Commentary(self, daf, category, results):
     """Link the next *category* match to this commentary line and to the
     underlying gemara; a '0' match is recorded in self.missing_ones.
     """
     self.maharam_line += 1
     self.which_line[category] += 1
     # Removed an unused local ("title") that was computed but never read.
     base_ref = results[category][self.which_line[category]]
     comm_ref = self.title+" on "+self.masechet+"."+AddressTalmud.toStr("en", daf)+"."+str(self.maharam_line)
     if base_ref == '0':
         self.missing_ones.append(comm_ref)
     else:
         self.links_to_post.append({
             "refs": [base_ref, comm_ref],
             "type": "commentary",
             "auto": True,
             "generated_by": self.title+self.masechet+" linker"
         })
         # Second link: commentary directly to the gemara behind the base ref.
         gemara_ref = self.getGemaraRef(base_ref)
         self.links_to_post.append({
             "refs": [comm_ref, gemara_ref],
             "type": "commentary",
             "auto": True,
             "generated_by": self.title+self.masechet+" linker"
         })
示例#24
0
def find_matches(gemara, tosafot, rashi):
    """Match the dh dicts for gemara/rashi/tosafot on Ketubot and build ranges.

    For each daf, runs match_ref over the base text; segments opening with
    בא"ד ("in the same dibur") inherit the previous segment's match, as they
    genuinely have no dh of their own.  The matched results then replace the
    dh lists in place, and per-daf ranges are created from them.

    :return: the three (mutated) dicts: gemara, tosafot, rashi.
    """
    # Removed unused counters ("nones = total = 0") and normalized the stray
    # indentation in the second daf loop.
    for pairs in [(tosafot, "Tosafot on Ketubot"), (gemara, "Ketubot"),
                  (rashi, "Rashi on Ketubot")]:
        orig_dict = dict(pairs[0])
        which_dict = pairs[0]
        which_text = pairs[1]
        for daf in which_dict.keys():
            actual_daf = AddressTalmud.toStr("en", daf)
            base_text = TextChunk(Ref("{} {}".format(which_text, actual_daf)), lang='he')
            if not base_text.text:
                continue
            comments = which_dict[daf]
            results = match_ref(base_text, comments, lambda x: x.split(),
                                dh_extract_method=dh_extract)
            for i, result_comment in enumerate(zip(results["matches"], comments)):
                result, comment = result_comment
                comment_wout_bold = comment.replace("<b>", "").replace("</b>", "")
                # A comment opening with בא"ד continues the previous dibur,
                # so it takes the previous segment's match.
                if u"""בא"ד""" in u" ".join(comment_wout_bold.split()[0:3]) \
                        or u"""שם בא"ד""" in u" ".join(comment_wout_bold.split()[0:3]):
                    results["matches"][i] = results["matches"][i - 1]
            which_dict[daf] = results["matches"]

        for daf in which_dict.keys():
            if which_dict[daf] and orig_dict[daf]:
                which_dict[daf] = create_ranges(orig_dict, which_dict, which_text, daf)
    return gemara, tosafot, rashi
示例#25
0
    def getDaf(self, line, current_daf, len_masechet, prev_line):
        """Parse *line* for a daf header and update self.current_daf.

        Lines whose first word contains דף set the daf explicitly (gematria
        value, doubled for amud bet, minus one for amud aleph); any other
        line just advances the daf by one amud.  The header is stripped from
        the line into self.actual_text, per-daf dicts are seeded, and dafs
        that go backwards flag self.dont_post.

        NOTE(review): the *current_daf* and *prev_line* parameters are never
        read -- the state lives entirely on self.
        """
        prev_num = self.current_daf  # remembered to detect out-of-order dafs
        orig_line = line
        line = line.replace("@11 ", "@11")
        if line.split(" ")[0].find('דף') >= 0:
            # Explicit daf header: second word is the daf number in gematria.
            daf_value = getGematria(
                line.split(" ")[1].replace('"', '').replace("'", ''))
            if line.split(" ")[2].find(self.amud_bet) >= 0:
                self.current_daf = 2 * daf_value
            else:
                self.current_daf = 2 * daf_value - 1
            actual_text = ""
            start_at = 3
            # If the third word is not an amud marker, it is already body text.
            if line.split(" ")[2] not in ['ע"ב', 'ע"א']:
                start_at = 2
            for count, word in enumerate(line.split(" ")):
                if count >= start_at:
                    actual_text += word + " "
        else:
            # No header: advance one amud and drop the 3-character marker.
            self.current_daf += 1
            actual_text = line[3:]

        if self.current_daf <= prev_num:
            # Daf went backwards (or repeated): report it and block posting.
            he_current = AddressTalmud.toStr("he", self.current_daf)
            he_prev = AddressTalmud.toStr("he", prev_num)
            #prev_line = " ".join(prev_line.split(" ")[0:5])
            #orig_line = " ".join(orig_line.split(" ")[0:5])
            print u"{} before {}\n".format(he_prev, he_current)
            self.dont_post = True
            #print u"The line starting: {} is {}\n".format(prev_line, he_prev)
            #print u"It came before the line starting {}, which is {}\n\n".format(orig_line, he_current)

        if not self.current_daf in self.dh1_dict:
            # First time this daf is seen: seed the per-category dh lists.
            self.dh1_dict[self.current_daf] = []
            for each_cat in self.categories:
                self.dh_by_cat[each_cat][self.current_daf] = []
        self.actual_text = actual_text
        if self.current_daf > len_masechet:
            # Daf exceeds the tractate length: report only, keep going.
            print "DAF EXTRA {} > {} in {} {}".format(self.current_daf,
                                                      len_masechet, self.title,
                                                      self.masechet)
            pass
        self.list_of_dafs.append(self.current_daf)

        return self.current_daf
示例#26
0
    def postLinks(self):
        def base_tokenizer(str):
            str = re.sub(ur"\([^\(\)]+\)", u"", str)
            word_list = re.split(ur"\s+", str)
            word_list = [w for w in word_list if w]  # remove empty strings
            return word_list

        def dh_extract_method(str):
            str = str.replace(u'בד"ה', u'').replace(u'וכו', u'')
            return str

        '''
        1. strip out "" from dhs with list comprehension
        2. make dictionary where each dh str is key and the value is its index in the array
        '''
        links = []
        for daf in self.text:
            dhs_arr = [dh for dh in self.dhs[daf] if len(dh) > 0]
            gemara_text = Ref("{} {}".format(self.tractate,
                                             AddressTalmud.toStr(
                                                 "en", daf))).text('he')
            results = match_ref(gemara_text,
                                dhs_arr,
                                base_tokenizer,
                                dh_extract_method=dh_extract_method,
                                verbose=False)['matches']
            self.makeDicts(daf)
            rashba_refs = []
            for dh in dhs_arr:
                rashba_refs.append("Rashba on {} {}.{}".format(
                    self.tractate, AddressTalmud.toStr("en", daf),
                    self.dh_dict[daf][dh] + 1))
            link_pairs = zip(rashba_refs, results)
            for link_pair in link_pairs:
                if link_pair[1]:
                    links.append({
                        "refs": [link_pair[0], link_pair[1].normal()],
                        "type":
                        "commentary",
                        "auto":
                        True,
                        "generated_by":
                        "rashba{}".format(self.tractate)
                    })
        post_link(links, server=self.server)
示例#27
0
    def getDaf(self, line, current_daf, len_masechet, prev_line):
        """Parse a daf header out of *line* and advance self.current_daf.

        A leading word containing דף sets the daf explicitly (gematria value,
        doubled for amud bet, minus one for amud aleph); otherwise the daf
        advances by one amud.  The remaining text goes to self.actual_text,
        new dafs seed the per-category dicts, and a backwards daf flags
        self.dont_post.

        NOTE(review): *current_daf* and *prev_line* are unused parameters --
        all state lives on self.
        """
        prev_num = self.current_daf  # remembered to detect out-of-order dafs
        orig_line = line
        line = line.replace("@11 ", "@11")
        if line.split(" ")[0].find('דף')>=0:
            # Explicit header: second word is the daf number in gematria.
            daf_value = getGematria(line.split(" ")[1].replace('"', '').replace("'", ''))
            if line.split(" ")[2].find(self.amud_bet)>=0:
                self.current_daf = 2*daf_value
            else:
                self.current_daf = 2*daf_value - 1
            actual_text = ""
            start_at = 3
            # If the third word is not an amud marker, it is already body text.
            if line.split(" ")[2] not in ['ע"ב', 'ע"א']:
                start_at = 2
            for count, word in enumerate(line.split(" ")):
                if count >= start_at:
                    actual_text += word + " "
        else:
            # No header: advance one amud and drop the 3-character marker.
            self.current_daf += 1
            actual_text = line[3:]

        if self.current_daf <= prev_num:
            # Daf went backwards (or repeated): report it and block posting.
            he_current = AddressTalmud.toStr("he", self.current_daf)
            he_prev = AddressTalmud.toStr("he", prev_num)
            #prev_line = " ".join(prev_line.split(" ")[0:5])
            #orig_line = " ".join(orig_line.split(" ")[0:5])
            print u"{} before {}\n".format(he_prev, he_current)
            self.dont_post = True
            #print u"The line starting: {} is {}\n".format(prev_line, he_prev)
            #print u"It came before the line starting {}, which is {}\n\n".format(orig_line, he_current)


        if not self.current_daf in self.dh1_dict:
            # First time this daf is seen: seed the per-category dh lists.
            self.dh1_dict[self.current_daf] = []
            for each_cat in self.categories:
                self.dh_by_cat[each_cat][self.current_daf] = []
        self.actual_text = actual_text
        if self.current_daf > len_masechet:
            # Daf exceeds the tractate length: report only, keep going.
            print "DAF EXTRA {} > {} in {} {}".format(self.current_daf, len_masechet, self.title, self.masechet)
            pass
        self.list_of_dafs.append(self.current_daf)

        return self.current_daf
示例#28
0
    def RashiOrTosafot(self, daf, category, results):
        """Link the next rashi/tosafot match to this Maharshal comment.

        Unmatched ('0') results are recorded in self.missing_ones; otherwise
        two links are queued: base text <-> Maharshal, and Maharshal <-> the
        underlying gemara ref.  Uses the module-level ``masechet``.
        """
        self.maharam_line += 1
        # Removed unused "title" locals that were computed but never read.
        if category == 'rashi':
            self.rashi_line += 1
            ref = results[category][self.rashi_line]
        elif category == 'tosafot':
            self.tosafot_line += 1
            ref = results[category][self.tosafot_line]

        maharshal_ref = ("Maharshal on " + masechet + "." +
                         AddressTalmud.toStr("en", daf) + "." +
                         str(self.maharam_line))
        if ref == '0':
            self.missing_ones.append(maharshal_ref)
        else:
            self.links_to_post.append({
                "refs": [ref, maharshal_ref],
                "type":
                "commentary",
                "auto":
                True,
                "generated_by":
                "Maharshal on " + masechet + " linker"
            })
            # Second link: Maharshal straight to the gemara behind the ref.
            gemara_ref = self.getGemaraRef(ref)
            self.links_to_post.append({
                "refs": [maharshal_ref, gemara_ref],
                "type":
                "commentary",
                "auto":
                True,
                "generated_by":
                "Maharshal on " + masechet + " linker"
            })
示例#29
0
def create_link_text(source_index, line_number, comment_number):
    """Build a link dict between a Sanhedrin line and its Yad Ramah comment."""
    amud = AddressTalmud.toStr('en', source_index)
    talmud_ref = "Sanhedrin {}.{}".format(amud, line_number)
    commentary_ref = "Yad Ramah on Sanhedrin {}.{}".format(amud, comment_number)
    return {
        "refs": [talmud_ref, commentary_ref],
        "type": "commentary",
    }
示例#30
0
def post(text, dh_dict, tractate):
    """Upload the Ramban text and post links between each comment and its
    matched gemara line."""
    post_text("Chiddushei Ramban on " + tractate, {
        "text": convertDictToArray(text),
        "versionTitle": "Ramban on Talmud",
        "versionSource": "http://www.sefaria.org",
        "language": "he",
    })
    links_to_post = []
    daf_array = get_text_plus(tractate)['he']
    matcher = Match(in_order=True, min_ratio=80, guess=False, range=True,
                    can_expand=False)
    for daf in sorted(dh_dict.keys()):
        daf_str = AddressTalmud.toStr("en", daf)
        results = matcher.match_list(dh_dict[daf], daf_array[daf - 1],
                                     tractate + " " + daf_str)
        for key, value in results.iteritems():
            # "0:" marks an unmatched prefix; strip it.
            cleaned = value.replace("0:", "")
            links_to_post.append({
                'refs': [
                    tractate + "." + daf_str + "." + cleaned,
                    "Chiddushei_Ramban_on_" + tractate + "." + daf_str +
                    "." + str(key),
                ],
                'type': 'commentary',
                'auto': 'True',
                'generated_by': "ramban" + tractate,
            })
    post_link(links_to_post)
示例#31
0
def get_matches_for_dict_and_link(dh_dict, base_text_title, commentary_title, talmud=True, lang='he', word_threshold=0.27, server="", rashi_filter=None, dh_extract_method=lambda x: x):
    def base_tokenizer(str):
        str_list = str.split(" ")
        return [str for str in str_list if len(str) > 0]


    assert len(server) > 0, "Please specify a server"
    results = {}
    links = []
    matched = 0
    total = 0
    for daf in dh_dict:
        print daf
        dhs = dh_dict[daf]
        if talmud:
            base_text_ref = "{} {}".format(base_text_title, AddressTalmud.toStr("en", daf))
            comm_ref = "{} on {} {}".format(commentary_title, base_text_title, AddressTalmud.toStr("en", daf))
        else:
            base_text_ref = "{} {}".format(base_text_title, daf)
            comm_ref = "{} on {} {}".format(commentary_title, base_text_title, daf)
        base_text = TextChunk(Ref(base_text_ref), lang=lang)
        comm_text = TextChunk(Ref(comm_ref), lang=lang)
        results[daf] = match_ref(base_text, comm_text, base_tokenizer=base_tokenizer, word_threshold=word_threshold, rashi_filter=rashi_filter, dh_extract_method=dh_extract_method)["matches"]
        for count, link in enumerate(results[daf]):
            if link:
                base_end = link.normal()
                comm_end = "{} on {} {}:{}".format(commentary_title, base_text_title, AddressTalmud.toStr("en", daf), count+1)
                links.append({
                    "refs": [base_end, comm_end],
                    "auto": True,
                    "type": "commentary",
                    "generated_by": commentary_title+base_text_title
                })
                matched += 1
            total += 1
    print "Matched: {}".format(matched)
    print "Total {}".format(total)
    post_link(links, server=server)

    return results
示例#32
0
 def Rosh(self, perek, daf, dh, results):
     """Advance the line counters and, if this Rosh comment matched, link it."""
     self.maharam_line += 1
     self.rosh_line += 1
     match = results[perek-1][self.rosh_line]
     if not match:
         return
     comm_ref = self.title+" on "+self.masechet+"."+AddressTalmud.toStr("en", daf)+"."+str(self.maharam_line)
     self.links_to_post.append({
         "refs": [match.normal(), comm_ref],
         "type": "commentary",
         "auto": True,
         "generated_by": self.title+self.masechet+" linker",
     })
示例#33
0
 def RashiOrTosafot(self, daf, category, rashi_in_order, tosafot_in_order):
     """Link the current Maharam line to its Rashi or Tosafot counterpart,
     recording unmatched lines in self.missing_ones.

     NOTE(review): if *category* is neither 'rashi' nor 'tosafot', `in_order`
     is never bound and the comparison below raises a NameError -- confirm
     callers only pass these two values. `masechet` is read from module
     scope, and `title` is assigned but never used.
     """
     if category == 'rashi':
         self.maharam_line+=1
         self.rashi_line+=1
         title = 'Rashi on '+masechet
         in_order = rashi_in_order[self.rashi_line]
     elif category == 'tosafot':
         self.maharam_line+=1
         self.tosafot_line+=1
         title = 'Tosafot on '+masechet
         in_order = tosafot_in_order[self.tosafot_line]
     # '0' marks "no match was found" for this line.
     if in_order == '0':
         self.missing_ones.append("Maharam on "+masechet+"."+AddressTalmud.toStr("en", daf)+"."+str(self.maharam_line))
     else:
         self.links_to_post.append({
             "refs": [
                          in_order,
                         "Maharam on "+masechet+"."+AddressTalmud.toStr("en", daf)+"."+str(self.maharam_line)
                     ],
             "type": "commentary",
             "auto": True,
             "generated_by": "Maharam on "+masechet+" linker"})
示例#34
0
 def Rosh(self, perek, daf, dh, results):
     """Append a link for the current Rosh comment when a match exists."""
     self.maharam_line += 1
     self.rosh_line += 1
     result = results[perek-1][self.rosh_line]
     if result:
         maharam_ref = (self.title+" on "+self.masechet+"."
                        + AddressTalmud.toStr("en", daf)+"."+str(self.maharam_line))
         link = {
             "refs": [result.normal(), maharam_ref],
             "type": "commentary",
             "auto": True,
             "generated_by": self.title+self.masechet+" linker",
         }
         self.links_to_post.append(link)
示例#35
0
def post(text, dh_dict, tractate):
    """Upload Chiddushei Ramban on *tractate* and link each matched comment
    to its gemara line.

    Relies on a module-level ``match`` (a Match instance) being in scope.
    Fixed: the original assigned ``talmud_end`` twice with identical
    right-hand sides; the duplicate line is removed.
    """
    text_array = convertDictToArray(text)
    send_text = {
        "text": text_array,
        "versionTitle": "Chiddushei HaRamban, Jerusalem 1928-29",
        "versionSource": "http://primo.nli.org.il/primo_library/libweb/action/dlDisplay.do?vid=NLI&docId=NNL_ALEPH001294828",
        "language": "he"
    }
    post_text("Chiddushei Ramban on "+tractate, send_text)
    links_to_post = []

    for daf in sorted(dh_dict.keys()):
        dh_list = dh_dict[daf]
        daf_text = Ref(tractate+" "+AddressTalmud.toStr("en", daf)).text('he').text
        results = match.match_list(dh_list, daf_text, tractate+" "+AddressTalmud.toStr("en", daf))
        for key, value in results.iteritems():
            # Strip the "0:" no-match prefix; value is a line number or range.
            value = value.replace("0:", "")
            talmud_end = tractate + "." + AddressTalmud.toStr("en", daf) + "." + value
            ramban_end = "Chiddushei_Ramban_on_" + tractate + "." + AddressTalmud.toStr("en", daf) + "." + str(key)
            links_to_post.append({'refs': [talmud_end, ramban_end], 'type': 'commentary', 'auto': 'True', 'generated_by': "ramban"+tractate})
    post_link(links_to_post)
示例#36
0
def getLog(siman, result, dh_dict, comm):
    """Build human-readable log entries for unmatched and ambiguous dh matches.

    Reads title_book and title_comm from module scope.
    """
    log = []
    for key in result:
        line_n = result[key]
        if line_n[0] == 0:
            # No match at all for this dh.
            entry = "did not find dh:\n"+str(dh_dict[siman][key-1])+"\n in "+title_book+", Daf "+AddressTalmud.toStr("en", siman)+":"
            entry += "\nwww.sefaria.org/"+title_book.replace(" ", "_")+"."+AddressTalmud.toStr("en", siman)
            entry += "\ntext:<b>"+str(dh_dict[siman][key-1])+".</b> "+str(comm[siman][key-1])+"\n\n"
            log.append(entry)
        elif len(line_n) > 1:
            # Several candidate lines: record the best guess plus alternatives.
            bestGuess = line_n[0]
            entry = "looked for dh:\n"+str(dh_dict[siman][key-1])+"\n in "+title_book+", Daf "+AddressTalmud.toStr("en", siman)
            entry += " and guessed the dh matches to line "+str(bestGuess)+":"
            entry += "\nwww.sefaria.org/"+title_comm.replace(" ", "_")+"."+AddressTalmud.toStr("en", siman)+"."+str(bestGuess)
            entry += "\nbut other options include:\n"
            for guess in line_n:
                if guess != line_n[0]:
                    entry += "line " +str(guess)+": www.sefaria.org/"+title_book.replace(" ", "_")+"."+AddressTalmud.toStr("en", siman)+"."+str(guess)+" ,\n"
            # Drop the trailing newline of the last alternative.
            entry = entry[0:-1]
            log.append(entry+"\n\n")
    return log
示例#37
0
 def postLinks(self):
     """Match every daf's non-empty dhs against the gemara text and post
     Rashba links to the server."""
     def base_tokenizer(str):
         # Drop parenthesized asides, then split on whitespace.
         str = re.sub(ur"\([^\(\)]+\)", u"", str)
         word_list = re.split(ur"\s+", str)
         word_list = [w for w in word_list if w]  # remove empty strings
         return word_list
     def dh_extract_method(str):
         # Strip the quotation markers before matching.
         str = str.replace(u'בד"ה', u'').replace(u'וכו', u'')
         return str
     '''
     1. strip out "" from dhs with list comprehension
     2. make dictionary where each dh str is key and the value is its index in the array
     '''
     links = []
     for daf in self.text:
         dhs_arr = [dh for dh in self.dhs[daf] if len(dh) > 0]
         gemara_text = Ref("{} {}".format(self.tractate, AddressTalmud.toStr("en", daf))).text('he')
         results = match_ref(gemara_text, dhs_arr, base_tokenizer, dh_extract_method=dh_extract_method, verbose=False)['matches']
         # Refresh self.dh_dict for this daf before building refs.
         self.makeDicts(daf)
         rashba_refs = []
         for dh in dhs_arr:
             # dh_dict values are 0-based indices; refs are 1-based.
             rashba_refs.append("Rashba on {} {}.{}".format(self.tractate, AddressTalmud.toStr("en", daf), self.dh_dict[daf][dh]+1))
         link_pairs = zip(rashba_refs, results)
         for link_pair in link_pairs:
             # Only matched dhs (truthy result Refs) become links.
             if link_pair[1]:
                 links.append(
                 {
                 "refs": [
                              link_pair[0],
                             link_pair[1].normal()
                         ],
                 "type": "commentary",
                 "auto": True,
                 "generated_by": "rashba{}".format(self.tractate)
                 }
                 )
     post_link(links, server=self.server)
示例#38
0
def match_and_link(dhs, masechet):
    """Match Ritva dibur hamatchil headers per daf and post commentary links."""
    matcher = Match(in_order=True, min_ratio=80, guess=False, range=True,
                    can_expand=False)
    links = []
    for daf in dhs:
        daf_str = AddressTalmud.toStr("en", daf)
        talmud_text = get_text_plus(masechet + "." + daf_str)['he']
        matches = matcher.match_list(dhs[daf], talmud_text)
        for line in matches:
            # Strip the "0:" no-match prefix from the matched line/range.
            talmud_range = matches[line].replace("0:", "")
            ritva_ref = "Ritva on " + masechet + "." + str(daf_str) + "." + str(line)
            talmud_ref = masechet + "." + daf_str + "." + talmud_range
            links.append({
                'refs': [ritva_ref, talmud_ref],
                'type': 'commentary',
                'auto': 'True',
                'generated_by': masechet + "Ritva"
            })
    post_link(links)
示例#39
0
def match_and_link(text, masechet):
    """Split each daf's lines into (dh, comment) pairs and match the dhs
    against the gemara text.

    A line is split at the abbreviation marker, else at the first period,
    else after the first 10 words (via splitText). Dapim start at folio 2a,
    hence the daf_count+3 offset for AddressTalmud.

    Fixed: removed a leftover pdb.set_trace() debugging breakpoint.
    """
    match = Match(in_order=True, min_ratio=80, guess=False, range=True, can_expand=False)
    for daf_count, daf in enumerate(text):
        dhs = []
        comments = []
        for each_line in daf:
            if each_line.find("כו'") >= 0:
                dh, comment = each_line.split("כו'", 1)
            elif each_line.find(".") >= 0:
                dh, comment = each_line.split(".", 1)
            else:
                # No explicit delimiter: take the first 10 words as the dh.
                dh, comment = splitText(each_line, 10)
            dhs.append(dh)
            comments.append(comment)
        talmud_text = get_text_plus(masechet+"."+AddressTalmud.toStr("en", daf_count+3))['he']
        result = match.match_list(dhs, talmud_text)
示例#40
0
 def getTC(self, category, daf, masechet):
     """Return the Hebrew TextChunk for *category* on the given daf.

     Falls back from Rashi to Rashbam when the Rashi text is empty.
     Returns None implicitly for any unrecognized category.

     NOTE(review): mixes the *masechet* parameter with self.masechet in the
     rashi/rashbam branches -- confirm they always agree.
     """
     if category in ["tosafot", "ran", "rosh"]:
         title = "{} on {}".format(category.title(), masechet)
         return Ref(title+"."+AddressTalmud.toStr("en", daf)).text('he')
     elif category == "gemara":
         return Ref(masechet+" "+AddressTalmud.toStr("en", daf)).text('he')
     elif category == "rashi":
         rashi = Ref("Rashi on "+self.masechet+"."+AddressTalmud.toStr("en", daf)).text('he')
         # Empty Rashi text means this daf carries Rashbam instead.
         if len(rashi.text) == 0:
             print "rashbam by default {} {}".format(masechet, AddressTalmud.toStr("en", daf))
             return Ref("Rashbam on "+self.masechet+"."+AddressTalmud.toStr("en", daf)).text('he')
         else:
             return rashi
     elif category == "rashbam":
         print "rashbam {} {}".format(masechet, AddressTalmud.toStr("en", daf))
         return Ref("Rashbam on "+self.masechet+"."+AddressTalmud.toStr("en", daf)).text('he')
示例#41
0
 def getTC(self, category, daf, masechet):
     """Fetch the Hebrew TextChunk for a commentary category on one daf.

     For "rashi", substitutes Rashbam when the Rashi text is empty; an
     unknown category falls through and returns None.

     NOTE(review): uses self.masechet rather than the *masechet* parameter
     in the rashi/rashbam branches -- verify both always refer to the same
     tractate.
     """
     if category in ["tosafot", "ran", "rosh"]:
         title = "{} on {}".format(category.title(), masechet)
         return Ref(title+"."+AddressTalmud.toStr("en", daf)).text('he')
     elif category == "gemara":
         return Ref(masechet+" "+AddressTalmud.toStr("en", daf)).text('he')
     elif category == "rashi":
         rashi = Ref("Rashi on "+self.masechet+"."+AddressTalmud.toStr("en", daf)).text('he')
         if len(rashi.text) == 0:
             # No Rashi here: this daf carries Rashbam instead.
             print "rashbam by default {} {}".format(masechet, AddressTalmud.toStr("en", daf))
             return Ref("Rashbam on "+self.masechet+"."+AddressTalmud.toStr("en", daf)).text('he')
         else:
             return rashi
     elif category == "rashbam":
         print "rashbam {} {}".format(masechet, AddressTalmud.toStr("en", daf))
         return Ref("Rashbam on "+self.masechet+"."+AddressTalmud.toStr("en", daf)).text('he')
示例#42
0
 def Mishnah(self, daf, mishnah_in_order):
     """Link the current Maharam line to the matching Mishnah reference.

     NOTE(review): this method appears broken as written:
     - `mishnah_line += 1` increments a local that was never initialized
       (raises at runtime); presumably self.mishnah_line was intended.
     - `in_order`/`out_order` are parsed and converted to int but never
       used; the refs below index [0] of the match string instead.
     - `mishnah_out_order` on the end-ref line is an undefined name;
       probably `out_order` (or the same dict) was intended.
     Confirm intent before relying on this method.
     """
     self.maharam_line += 1
     mishnah_line += 1
     pos = 0
     # Walk all perakim in order, counting matches until reaching the
     # current mishnah line.
     for perek in self.mishnah1_dict:
         for key in mishnah_in_order[perek]:
             pos += 1
             if pos == mishnah_line:
                 mishnah_in_order[perek][key] = mishnah_in_order[perek][
                     key].replace('0:', '')
                 if mishnah_in_order[perek][key].find('-') >= 0:
                     in_order, out_order = mishnah_in_order[perek][
                         key].split('-')
                 else:
                     in_order = mishnah_in_order[perek][key]
                     out_order = in_order
                 in_order = int(in_order)
                 out_order = int(out_order)
                 masechet_daf_line_start = "Mishnah " + masechet + "." + str(
                     perek) + "." + str(mishnah_in_order[perek][key][0])
                 masechet_daf_line_end = "Mishnah " + masechet + "." + str(
                     perek) + "." + str(mishnah_out_order[perek][key][0])
                 # Prefer a ranged ref; fall back to the start ref if the
                 # range cannot be constructed.
                 try:
                     masechet_daf_line = Ref(masechet_daf_line_start).to(
                         Ref(masechet_daf_line_end)).normal()
                 except:
                     masechet_daf_line = masechet_daf_line_start
                 self.links_to_post.append({
                     "refs": [
                         masechet_daf_line, "Maharam " + masechet + "." +
                         AddressTalmud.toStr("en", daf) + "." +
                         str(self.maharam_line)
                     ],
                     "type":
                     "commentary",
                     "auto":
                     True,
                     "generated_by":
                     "Maharam on " + masechet + " linker",
                 })
示例#43
0
def find_matches(gemara, tosafot, rashi):
    # what needs to be done is to go through each dict and try to match everything, but check each segment that if it is בא"ד
    # ignore if it has a match and match it to previous segment's match
    # and if no match: link with previous segment (as a range) as if this comment really has no DH which is why it has no match

    """Match each comment dict against its Ketubot base text in place.

    Mutates the three input dicts: each daf's comment list is replaced by
    its list of match results (then by ranges via create_ranges), and the
    mutated dicts are returned.
    """
    nones = total = 0
    for pairs in [(tosafot, "Tosafot on Ketubot"), (gemara, "Ketubot"),
                  (rashi, "Rashi on Ketubot")]:
        # Keep a shallow copy of the original comments; which_dict is
        # overwritten with match results below.
        orig_dict = dict(pairs[0])
        which_dict = pairs[0]
        which_text = pairs[1]
        for daf in which_dict.keys():
            actual_daf = AddressTalmud.toStr("en", daf)
            base_text = TextChunk(Ref("{} {}".format(which_text, actual_daf)),
                                  lang='he')
            if not base_text.text:
                continue
            comments = which_dict[daf]
            results = match_ref(base_text,
                                comments,
                                lambda x: x.split(),
                                dh_extract_method=dh_extract)
            for i, result_comment in enumerate(
                    zip(results["matches"], comments)):
                result, comment = result_comment
                comment_wout_bold = comment.replace("<b>",
                                                    "").replace("</b>", "")
                # A comment opening with בא"ד continues the previous dh, so
                # reuse the previous segment's match.
                if u"""בא"ד""" in u" ".join(comment_wout_bold.split()[0:3]) \
                        or u"""שם בא"ד""" in u" ".join(comment_wout_bold.split()[0:3]):
                    results["matches"][i] = results["matches"][i - 1]
            which_dict[daf] = results["matches"]

        for daf in which_dict.keys():
            if which_dict[daf] and orig_dict[daf]:
                which_dict[daf] = create_ranges(orig_dict, which_dict,
                                                which_text, daf)
    return gemara, tosafot, rashi
示例#44
0
def create_links(sanhedrin_ja, yad_ramah_ja):
    """Walk the two parallel jagged arrays amud-by-amud, match the opening
    words of each Yad Ramah paragraph against the Sanhedrin text, and
    return the list of link records."""
    list_of_links = []
    amud_number = 1
    match_object = Match(in_order=True,
                         min_ratio=80,
                         guess=False,
                         range=False,
                         can_expand=True)
    for amud_of_sanhedrin, amud_yad_ramah in zip(sanhedrin_ja, yad_ramah_ja):
        ref = 'Sanhedrin {}'.format(AddressTalmud.toStr('en', amud_number))
        the_first_few_words = take_the_first_few_words_of_each_paragraph(
            amud_yad_ramah)
        matches_dict = match_object.match_list(the_first_few_words,
                                               amud_of_sanhedrin, ref)
        for key in matches_dict:
            for match in matches_dict[key]:
                # 0 marks "no match" for this candidate line.
                if match != 0:
                    # print'Amud: {} comment: {} corresponds to {}'.format(AddressTalmud.toStr('en', amud_number), key, match)
                    print create_link_text(amud_number, match, key)
                    list_of_links.append(
                        create_link_text(amud_number, match, key))
        amud_number += 1

    return list_of_links
示例#45
0
            pdb.set_trace()
        before_dh = ""
        just_added_dh = False
        prev_line = line
        temp_text = ""

for daf in comm_dict.keys():
    if daf not in dh_dict.keys():
        pdb.set_trace()
        send_text = {
            "versionTitle": "Rashba on Bava Batra",
            "versionSource": "http://www.sefaria.org",
            "language": "en",
            "text": comm_dict[daf],
        }
        post_text("Rashba on Bava Batra." + AddressTalmud.toStr("en", daf),
                  send_text)
result = {}
guess = 0
no_guess = 0
for daf in dh_dict.keys():
    text = get_text("Bava Batra." + AddressTalmud.toStr("en", daf))
    try:
        match_obj = Match(in_order=True,
                          min_ratio=70,
                          guess=False,
                          range=True,
                          maxLine=len(text) - 1)
    except:
        pdb.set_trace()
    dh_arr = []
示例#46
0
         text_dict[perek] = convertDictToArray(text_dict[perek],
                                               empty="")
 links = []
 send_text = {
     "text": convertDictToArray(text_dict),
     "versionTitle":
     "Senlake edition 2019 based on Ben Yehoyada, Jerusalem, 1897",
     "versionSource":
     "http://beta.nli.org.il/he/books/NNL_ALEPH001933802/NLIl",
     "language": "he"
 }
 post_text("Ben Yehoyada on {}".format(title),
           send_text,
           index_count="on")
 for daf, text in text_dict.items():
     daf = AddressTalmud.toStr("en", daf) if title != "Eduyot" else daf
     try:
         base = TextChunk(Ref("{} {}".format(title, daf)), lang='he')
     except InputError as e:
         print(e)
         continue
     try:
         results = match_ref(base,
                             text,
                             lambda x: x.split(),
                             dh_extract_method=dher)
         for i, ref in enumerate(results["matches"]):
             if ref:
                 berakhot = "Ben Yehoyada on {} {}:{}".format(
                     title, daf, i + 1)
                 links.append({
示例#47
0
# Accumulators for the matching pass and the parsed Tosafot text.
book = {}
total = 0
non_match = 0
guess = 0
matched = 0
log = []
dh_dict = {}
tosafot_comments = {}
prev_line = 0
# Read 78 per-daf Tosafot files for Gittin, starting at index 100
# (dict keys are offset by +3 to get the actual daf index).
for j in range(78):
    i = j + 100
    count = 0
    tosafot_comments[i + 3] = []
    dh_dict[i + 3] = []
    # File names are the Hebrew daf string with gershayim/geresh and
    # spaces normalized away.
    he_daf = u"גיטין_"
    he_daf += AddressTalmud.toStr("he", i + 3)
    he_daf = he_daf.replace(u"\u05f4", u"")
    he_daf = he_daf.replace(u"׳", u"")
    he_daf = he_daf.replace(" ", "_")
    he_daf = he_daf + ".txt"
    f = open("../Noah-Santacruz-rashiPosting/Tosafot/" + he_daf, 'r')
    for line in f:
        line = line.replace("\n", "")
        something = line.replace(" ", "")
        if len(something) > 0:
            # Every other non-blank line is a Tosafot comment; its dh is
            # the text before the first period.
            if count % 2 == 1:
                tosafot_comments[i + 3].append(line)
                dh = line.split(".")[0]
                dh_dict[i + 3].append(dh)
            count += 1
    f.close()
comm = {}
book = {}
total = 0
non_match = 0
guess = 0
matched = 0
log = []
dh_dict = {}
rashi_comments = {}
prev_line = 0
for i in range(150): #152 
	count = 0
	rashi_comments[i+3] = []
	dh_dict[i+3] = []
	he_daf = u"עבודה זרה_"
	he_daf += AddressTalmud.toStr("he", i+3)
	he_daf = he_daf.replace(u"\u05f4", u"")
	he_daf = he_daf.replace(u"׳", u"")
	he_daf = he_daf.replace(" ", "_")
	he_daf = he_daf + ".txt"
	f = open("../Noah-Santacruz-rashiPosting/Tosafot/"+he_daf, 'r')
	for line in f:
		line = line.replace("\n", "")
		something = line.replace(" ", "")
		if len(something) > 0:
			if count % 2 == 0:
				dh_dict[i+3].append(line)
			else:
				if line.find(" - ")==-1:
					line = line.replace(".", " - ", 1)
				rashi_comments[i+3].append(line)
			comm_dict[daf].append(comm)
			if hasTags(comm) or hasTags(dh) or hasTags(before_dh):
				pdb.set_trace()
			if just_added_dh == False:
				dh_dict[daf].append("")
			just_added_dh = False
		before_dh =""
		temp_text = ""
result = {}
guess=0
no_guess=0
for daf in dh_dict.keys():
	if len(dh_dict[daf]) != len(comm_dict[daf]):
		pdb.set_trace()
for daf in dh_dict.keys():
	text = get_text("Gittin."+AddressTalmud.toStr("en", daf))
	try:
		match_obj=Match(in_order=True, min_ratio=70, guess=False, range=True, maxLine=len(text)-1)
	except:
		pdb.set_trace()
	dh_arr = []
	for i in range(len(dh_dict[daf])):
		if len(dh_dict[daf][i]) > 0:
			dh_arr.append(dh_dict[daf][i])
	result[daf] = match_obj.match_list(dh_arr, text)
	dh_count = 1
	'''
	if len(dh_dict[daf][i]) == 0, then comm_dict[daf][i] gets added to comm_dict[daf][i-1]+"<br>"
	'''
	for i in range(len(comm_dict[daf])):
		 if (daf, i) in before_dh_dict:
示例#50
0
guess = 0
matched = 0
log = []
dh_dict = {}
rashi_comments = {}
prev_line = 0

title_book = "Keritot"
title_comm = "Tosafot on Keritot"

for i in range(54):
    count = 0
    rashi_comments[i + 3] = []
    dh_dict[i + 3] = []
    he_daf = u"כריתות_"
    he_daf += AddressTalmud.toStr("he", i + 3)
    he_daf = he_daf.replace(u"\u05f4", u"")
    he_daf = he_daf.replace(u"׳", u"")
    he_daf = he_daf.replace(" ", "_")
    he_daf = he_daf + ".txt"
    f = open("../Noah-Santacruz-rashiPosting/Tosafot/" + he_daf, 'r')
    for line in f:
        line = line.replace("\n", "")
        something = line.replace(" ", "")
        if len(something) > 0:
            if count % 2 == 0:
                dh_dict[i + 3].append(line)
            else:
                if line.find(" - ") == -1:
                    line = line.replace(".", " - ", 1)
                rashi_comments[i + 3].append(line)
示例#51
0
			"versionSource": "http://www.sefaria.org/",
			"language": "he",
			"text": [comm],
			}
		post_text("Yad Ramah on Bava Batra, Perek "+str(current_perek)+", Comment "+str(comment_key), text)

match_obj=Match(in_order=True, min_ratio=80, guess=False, range=True)
skipped_arr = []
result = {}
for current_perek in range(10):
	current_perek+=1
	print current_perek
	search_for = 0
	for daf in sorted(daf_dict[current_perek].keys()):			
		print daf
		text = get_text("Bava Batra."+AddressTalmud.toStr("en", daf))
		dh_list = daf_dict[current_perek][daf]
		result[daf] = match_obj.match_list(dh_list, text, "Bava Batra "+AddressTalmud.toStr("en", daf))
		print result[daf]
		for key in result[daf]:
			if result[daf][key].find("0:") >= 0:
				result[daf][key] = result[daf][key].replace("0:","")
			search_for += 1
			line_n = result[daf][key]
			count = 0
			for comment_key in comments_order[current_perek]:
				count+=1
				if comment_key not in comm_dict[current_perek]:
					if comment_key not in skipped_arr:
						search_for+=1
						skipped_arr.append(comment_key)
# Accumulators for the matching pass and the parsed Tosafot text.
book = {}
total = 0
non_match = 0
guess = 0
matched = 0
log = []
dh_dict = {}
tosafot_comments = {}
prev_line = 0
# Read 24 per-daf Tosafot files for Bava Metzia, starting at index 210
# (dict keys are offset by +3 for the actual daf index).
for j in range(24):  # 234
    i = j + 210
    count = 0
    tosafot_comments[i + 3] = []
    dh_dict[i + 3] = []
    # File names use the Hebrew daf string with gershayim/geresh and
    # spaces normalized away.
    he_daf = u"בבא מציעא_"
    he_daf += AddressTalmud.toStr("he", i + 3)
    he_daf = he_daf.replace(u"\u05f4", u"")
    he_daf = he_daf.replace(u"׳", u"")
    he_daf = he_daf.replace(" ", "_")
    he_daf = he_daf + ".txt"
    f = open("../Noah-Santacruz-rashiPosting/Tosafot/" + he_daf, "r")
    for line in f:
        line = line.replace("\n", "")
        something = line.replace(" ", "")
        if len(something) > 0:
            # Every other non-blank line is a comment; its dh is the text
            # before the first period.
            if count % 2 == 1:
                tosafot_comments[i + 3].append(line)
                dh = line.split(".")[0]
                dh_dict[i + 3].append(dh)
            count += 1
    f.close()
# Accumulators for the matching pass and the parsed Rashi text.
guess = 0
matched = 0
log = []
dh_dict = {}
rashi_comments = {}
prev_line = 0

title_book = "Keritot"
title_comm = "Rashi on Keritot"

# Read 54 per-daf Rashi files for Keritot (dict keys offset by +3).
for i in range(54):
    count = 0
    rashi_comments[i + 3] = []
    dh_dict[i + 3] = []
    # File names use the Hebrew daf string with gershayim/geresh and
    # spaces normalized away.
    he_daf = u"כריתות_"
    he_daf += AddressTalmud.toStr("he", i + 3)
    he_daf = he_daf.replace(u"\u05f4", u"")
    he_daf = he_daf.replace(u"׳", u"")
    he_daf = he_daf.replace(" ", "_")
    he_daf = he_daf + ".txt"
    f = open("../Noah-Santacruz-rashiPosting/Rashi/" + he_daf, "r")
    for line in f:
        line = line.replace("\n", "")
        something = line.replace(" ", "")
        if len(something) > 0:
            # Non-blank lines alternate: dh, then its comment.
            if count % 2 == 0:
                dh_dict[i + 3].append(line)
            else:
                rashi_comments[i + 3].append(line)
            count += 1
    f.close()
matched = 0
log = []
dh_dict = {}
rashi_comments = {}
prev_line = 0

title_book = "Avodah Zarah"
title_comm = "Tosafot on Avodah Zarah" 

for j in range(2): 
	i = j+
	count = 0
	rashi_comments[i+3] = []
	dh_dict[i+3] = []
	he_daf = u"עבודה_זרה_"
	he_daf += AddressTalmud.toStr("he", i+3)
	he_daf = he_daf.replace(u"\u05f4", u"")
	he_daf = he_daf.replace(u"׳", u"")
	he_daf = he_daf.replace(" ", "_")
	he_daf = he_daf + ".txt"
	f = open("../Noah-Santacruz-rashiPosting/Tosafot/"+he_daf, 'r')
	for line in f:
		line = line.replace("\n", "")
		something = line.replace(" ", "")
		if len(something) > 0:
			if count % 2 == 0:
				dh_dict[i+3].append(line)
			else:
				rashi_comments[i+3].append(line)
			count+=1
	f.close()		
		print 'line did not start with 11'

match_obj=Match(in_order=False, min_ratio=80, guess=False, range=False)

last_daf = max(comm_dict.keys())
param = "off"
for daf in comm_dict:
	if daf==last_daf:
		param = "on"
	send_text = {
				"versionTitle": "Maharam Shif on "+masechet,
				"versionSource": "http://www.sefaria.org",
				"language": "he",
				"text": comm_dict[daf],
				}
	post_text("Maharam Shif on "+masechet+"."+AddressTalmud.toStr("en", daf), send_text, param)


for category in categories:
  if category=='paragraph':
  	continue
  elif category=='gemara':
  	title = masechet
  elif category=='rashi':
  	title = "Rashi on "+masechet
  elif category=='tosafot':
  	title = "Tosafot on "+masechet
  	
  for daf in dh_dict[category]:
  	dh_arr = dh_dict[category][daf]
  	text = compileCommentaryIntoPage(title, daf)
		pdb.set_trace()

# Post the full Shita Mekubetzet text, then match each daf's dhs against
# the gemara and queue links.
last_daf = max(comm_dict.keys())
param = "off"
text_to_post = convertDictToArray(comm_dict)
send_text = {
			"versionTitle": "Shita Mekubetzet on "+masechet,
			"versionSource": "http://www.sefaria.org",
			"language": "he",
			"text": text_to_post,
			}
post_text("Shita Mekubetzet on "+masechet, send_text, "on")

links_to_post = []
for daf in dh_dict:
	text = get_text(masechet+"."+AddressTalmud.toStr("en", daf))
	match_obj=Match(in_order=True, min_ratio=85, guess=False, range=True)
	dh_arr = dh_dict[daf]
	result = match_obj.match_list(dh_arr, text, masechet+" "+AddressTalmud.toStr("en", daf))
	for key in result:
		line_n = result[key]
		# Strip the "0:" no-match prefix from the matched line/range.
		line_n = line_n.replace("0:","")
		links_to_post.append({
				"refs": [
						 masechet+"."+AddressTalmud.toStr("en", daf)+"."+line_n, 
						"Shita Mekubetzet on "+masechet+"."+AddressTalmud.toStr("en", daf)+"."+str(key)
					],
				"type": "commentary",
				"auto": True,
				"generated_by": "Shita on "+masechet+" linker",
			 })
示例#57
0
                comm_dict[daf] = []
            comm_dict[daf].append(comm)
            if just_added_dh == False:
                dh_dict[daf].append("")
            if hasTags(comm) or hasTags(dh) or hasTags(before_dh):
                pdb.set_trace()
        just_added_dh = False
        prev_line = line
result = {}
guess = 0
no_guess = 0
for daf in dh_dict.keys():
    if len(dh_dict[daf]) != len(comm_dict[daf]):
        pdb.set_trace()
for daf in dh_dict.keys():
    text = get_text("Niddah." + AddressTalmud.toStr("en", daf))
    try:
        match_obj = Match(in_order=True,
                          min_ratio=70,
                          guess=False,
                          range=True,
                          maxLine=len(text) - 1)
    except:
        pdb.set_trace()
    dh_arr = []
    for i in range(len(dh_dict[daf])):
        if len(dh_dict[daf][i]) > 0:
            dh_arr.append(dh_dict[daf][i])
    result[daf] = match_obj.match_list(dh_arr, text)
    dh_count = 1
    '''
示例#58
0
    def postLinks(self, masechet):
        def base_tokenizer(str):
            str = re.sub(ur"\([^\(\)]+\)", u"", str)
            word_list = re.split(ur"\s+", str)
            word_list = [w for w in word_list if w]  # remove empty strings
            return word_list

        rosh_results = []
        perek_key = {}
        for perek in sorted(self.dh_by_perek.keys()):
            tuples = filter(lambda x: x[0] is 'rosh', self.dh_by_perek[perek])
            if len(tuples) > 0:
                cats, dhs, dappim = zip(*tuples)
                #for each daf and dh pair, that's the key to get the perek
                for daf, dh in zip(list(dappim), list(dhs)):
                    perek_key[(daf, dh)] = perek
                base = Ref("Rosh on {} {}".format(masechet, perek)).text('he')
                assert len(base.text) > 0
                these_results = match_ref(
                    base,
                    list(dhs),
                    base_tokenizer,
                    dh_extract_method=self.dh_extract_method,
                    verbose=False,
                    with_num_abbrevs=False)['matches']
                assert len(tuples) is len(these_results)
                rosh_results.append(these_results)

        results = {}
        comments = {}

        for daf in sorted(self.dh1_dict.keys()):
            comments[daf] = {}
            results[daf] = {}
            for each_cat in self.categories:
                if each_cat == 'rosh':
                    continue
                comments[daf][each_cat] = self.dh_by_cat[each_cat][daf]
            for each_type in comments[daf]:
                if each_type == 'rosh':
                    continue
                results[daf][each_type] = []
                if len(comments[daf][each_type]) > 0:
                    base = self.getTC(each_type, daf, masechet)
                    if len(base.text) == 0:
                        self.comm_wout_base.write("{} {}: {}\n".format(
                            masechet, daf, each_type))
                        base = self.getTC(each_type, daf - 1, masechet)
                        combined_comments = comments[
                            daf - 1][each_type] + comments[daf][each_type]
                        if len(base.text) == 0:
                            print "Problem in {}".format(
                                AddressTalmud.toStr("en", daf))
                        else:
                            results[daf - 1][each_type] = match_ref(
                                base,
                                combined_comments,
                                base_tokenizer,
                                dh_extract_method=self.dh_extract_method,
                                verbose=False,
                                with_num_abbrevs=False)
                            results[daf -
                                    1][each_type] = self.convertToOldFormat(
                                        results[daf - 1][each_type])
                        self.dh1_dict[daf] = [
                            x for x in self.dh1_dict[daf] if x[0] != each_type
                        ]
                    else:
                        results[daf][each_type] = match_ref(
                            base,
                            comments[daf][each_type],
                            base_tokenizer,
                            dh_extract_method=self.dh_extract_method,
                            verbose=False,
                            with_num_abbrevs=False)
                        results[daf][each_type] = self.convertToOldFormat(
                            results[daf][each_type])

        prev_perek = 0
        for daf in sorted(self.dh1_dict.keys()):
            self.maharam_line = 0
            self.which_line = {
                "rashi": -1,
                "tosafot": -1,
                "rosh": -1,
                "ran": -1,
                "gemara": -1,
                "rashbam": -1
            }
            for category, dh in self.dh1_dict[daf]:
                if category == 'gemara':
                    self.Gemara(daf, results[daf])
                elif category == 'rosh':
                    perek = perek_key[(daf, dh)]
                    if perek > prev_perek:
                        self.rosh_line = -1
                    prev_perek = perek
                    self.Rosh(perek, daf, dh, rosh_results)
                else:
                    self.Commentary(daf, category, results[daf])

        post_link(self.links_to_post, server=self.server)
        self.comm_wout_base.close()
				comm_dict[daf] = []
			comm_dict[daf].append(comm)
			if just_added_dh == False:
				dh_dict[daf].append("")
			if hasTags(comm) or hasTags(dh) or hasTags(before_dh):
				pdb.set_trace()
		just_added_dh = False
		prev_line = line
# Per-daf match results, filled in by the matching loop further below
# (result[daf] = match_obj.match_list(...)).
result = {}
# Outcome counters; initialized here, updated later in the script.
guess = 0
no_guess = 0
# Sanity check before matching: every daf's dh list must line up
# one-to-one with its comment list; drop into the debugger on mismatch.
for daf in dh_dict:
	if len(dh_dict[daf]) != len(comm_dict[daf]):
		pdb.set_trace()
for daf in dh_dict.keys():
	text = get_text("Niddah."+AddressTalmud.toStr("en", daf))
	try:
		match_obj=Match(in_order=True, min_ratio=70, guess=False, range=True, maxLine=len(text)-1)
	except:
		pdb.set_trace()
	dh_arr = []
	for i in range(len(dh_dict[daf])):
		if len(dh_dict[daf][i]) > 0:
			dh_arr.append(dh_dict[daf][i])
	result[daf] = match_obj.match_list(dh_arr, text)
	dh_count = 1
	'''
	if len(dh_dict[daf][i]) == 0, then comm_dict[daf][i] gets added to comm_dict[daf][i-1]+"<br>"
	'''
	for i in range(len(comm_dict[daf])):
		 if (daf, i) in before_dh_dict:
示例#60
0
		zohar_struct[vol_num].append([])
	first_line = True
	vol = open(vol_file, 'r')
	for line in vol:
		stray_tag = False
		blank_line = False
		no_spaces = line.replace(" ", "")
		no_return = no_spaces.replace("\n", "")
		if len(no_return)==0:
			blank_line = True
		if len(line.split(' '))==1 and (line.find('<b>')>=0 or line.find('</b>')>=0):
			stray_tag = True			
		if first_line == True:
			first_line = False
			if curr_parsha_file != "":
				curr_parsha_file.write('\n'+str(prev_vol+1)+":"+AddressTalmud.toStr("en", prev_daf+1)+":"+str(prev_para))
				curr_parsha_file.close()	
			if os.path.exists(english_parshiot[curr_parsha]) == True:
				os.remove(english_parshiot[curr_parsha])		
			curr_parsha_file = open(english_parshiot[curr_parsha], 'a')
			curr_parsha_file.write(str(vol_num+1)+":"+AddressTalmud.toStr("en", daf_count+2)+":1")  
			curr_parsha += 1
		elif blank_line==False and stray_tag==False:
			prev_prev_line = prev_line
			prev_line = current_line
			new_daf = line.find('דף')
			new_parsha = line.find('h1') #all parsha titles are surrounded by <h1> tags
			if new_daf >= 0 and len(line.split(' ')) < 6:  
				current_line = "daf"
				daf_count += 1
				zohar_struct[vol_num].append([])