示例#1
0
 def Gemara(self, daf, gemara_in_order):
     """Link the next matched gemara line (or range) to this Maharam comment.

     Advances the running Maharam and gemara counters, parses the matched
     line or "start-end" range for this comment out of ``gemara_in_order``,
     and queues a commentary link between the gemara ref and the
     corresponding "Maharam on <masechet>" comment.  Relies on the
     module-level ``masechet``.
     """
     self.maharam_line += 1
     self.gemara_line += 1
     # "0:" marks an unmatched prefix in the match results; strip it.
     gemara_in_order[self.gemara_line] = gemara_in_order[
         self.gemara_line].replace('0:', '')
     if gemara_in_order[self.gemara_line].find('-') >= 0:
         # Matched a range of lines, e.g. "3-5".
         in_order, out_order = gemara_in_order[self.gemara_line].split('-')
     else:
         in_order = gemara_in_order[self.gemara_line]
         out_order = in_order
     masechet_daf_line_start = masechet + " " + AddressTalmud.toStr(
         "en", daf) + ":" + in_order
     masechet_daf_line_end = masechet + " " + AddressTalmud.toStr(
         "en", daf) + ":" + out_order
     try:
         masechet_daf_line = Ref(masechet_daf_line_start).to(
             Ref(masechet_daf_line_end)).normal()
     except Exception:
         # Was a bare except; narrowed so KeyboardInterrupt/SystemExit
         # still escape.  Fall back to the start ref when the range cannot
         # be normalized.
         masechet_daf_line = masechet_daf_line_start
     self.links_to_post.append({
         "refs": [
             masechet_daf_line, "Maharam on " + masechet + "." +
             AddressTalmud.toStr("en", daf) + "." + str(self.maharam_line)
         ],
         "type":
         "commentary",
         "auto":
         True,
         "generated_by":
         "Maharam on " + masechet + " linker",
     })
示例#2
0
def post(text, dh_dict, tractate):
    """Upload the Ramban text for *tractate* and link each comment to its
    matched gemara line."""
    payload = {
        "text": convertDictToArray(text),
        "versionTitle": "Ramban on Talmud",
        "versionSource": "http://www.sefaria.org",
        "language": "he",
    }
    post_text("Chiddushei Ramban on " + tractate, payload)

    matcher = Match(in_order=True,
                    min_ratio=80,
                    guess=False,
                    range=True,
                    can_expand=False)
    he_dafs = get_text_plus(tractate)['he']
    links = []
    for daf in sorted(dh_dict.keys()):
        daf_str = AddressTalmud.toStr("en", daf)
        matches = matcher.match_list(dh_dict[daf], he_dafs[daf - 1],
                                     tractate + " " + daf_str)
        for comment_n, matched in matches.iteritems():
            # "0:" marks an unmatched prefix; strip it before building refs.
            matched = matched.replace("0:", "")
            links.append({
                'refs': [
                    tractate + "." + daf_str + "." + matched,
                    "Chiddushei_Ramban_on_" + tractate + "." + daf_str +
                    "." + str(comment_n),
                ],
                'type': 'commentary',
                'auto': 'True',
                'generated_by': "ramban" + tractate,
            })
    post_link(links)
示例#3
0
def post(text, dh_dict, tractate):
    """Post the Ramban text for *tractate* and link comments to gemara lines.

    Relies on a module-level ``match`` (a Match instance) being configured.
    """
    text_array = convertDictToArray(text)
    send_text = {
        "text": text_array,
        "versionTitle": "Chiddushei HaRamban, Jerusalem 1928-29",
        "versionSource":
        "http://primo.nli.org.il/primo_library/libweb/action/dlDisplay.do?vid=NLI&docId=NNL_ALEPH001294828",
        "language": "he"
    }
    post_text("Chiddushei Ramban on " + tractate, send_text)
    links_to_post = []

    for daf in sorted(dh_dict.keys()):
        dh_list = dh_dict[daf]
        daf_text = Ref(tractate + " " +
                       AddressTalmud.toStr("en", daf)).text('he').text
        results = match.match_list(
            dh_list, daf_text, tractate + " " + AddressTalmud.toStr("en", daf))
        for key, value in results.iteritems():
            # "0:" marks an unmatched prefix in the match result; strip it.
            value = value.replace("0:", "")
            # A duplicated, identical talmud_end assignment was removed here.
            talmud_end = tractate + "." + AddressTalmud.toStr(
                "en", daf) + "." + value
            ramban_end = "Chiddushei_Ramban_on_" + tractate + "." + AddressTalmud.toStr(
                "en", daf) + "." + str(key)
            links_to_post.append({
                'refs': [talmud_end, ramban_end],
                'type': 'commentary',
                'auto': 'True',
                'generated_by': "ramban" + tractate
            })
    post_link(links_to_post)
示例#4
0
def match_and_link(dhs, masechet):
    def base_tokenizer(str):
        str = re.sub(ur"\([^\(\)]+\)", u"", str)
        word_list = re.split(ur"\s+", str)
        word_list = [w for w in word_list if w]  # remove empty strings
        return word_list

    links = []

    for daf in dhs:
        talmud_text = TextChunk(Ref(masechet + "." +
                                    AddressTalmud.toStr("en", daf)),
                                lang="he")
        result = match_ref(talmud_text,
                           dhs[daf],
                           base_tokenizer=base_tokenizer,
                           create_ranges=True)['matches']
        if result != [None]:
            for count, line in enumerate(result):
                assert line is not None
                Ritva_end = "Ritva on " + masechet + "." + str(
                    AddressTalmud.toStr("en", daf)) + "." + str(count + 1)
                talmud_end = line.normal()
                links.append({
                    'refs': [Ritva_end, talmud_end],
                    'type': 'commentary',
                    'auto': 'True',
                    'generated_by': masechet + "Ritva"
                })
    post_link(links)
def print_out_refs(daf, line, segment, prev_daf, prev_line, prev_segment):
    second = "{} {}:{}:{}".format(title, AddressTalmud.toStr("en", daf + 1),
                                  line + 1, segment + 1)
    first = "{} {}:{}:{}".format(title,
                                 AddressTalmud.toStr("en", prev_daf + 1),
                                 prev_line + 1, prev_segment + 1)
    print "First: {}".format(first)
    print "Second: {}\n".format(second)
示例#6
0
def compileCommentaryIntoPage(title, daf):
    """Collect every Hebrew line of *title* on the given daf into one flat list.

    Walks section refs starting at "<title> <daf>.1" until the normalized
    ref no longer mentions the daf, or there is no further section.
    """
    page = []
    ref = Ref(title + " " + AddressTalmud.toStr("en", daf) + ".1")
    while ref is not None and ref.normal().find(AddressTalmud.toStr("en", daf)) >= 0:
        for line in ref.text('he').text:
            page.append(line)
        # next_section_ref() was previously called twice per iteration; hoist it.
        following = ref.next_section_ref()
        ref = following if following != ref else None
    return page
示例#7
0
def find_misshing_DH(max_length):

    """
    Run through Ritva Makkot, and search for lines with an unreasonable amount of words until the first period.
    :param max_length: largest acceptable number of words before the first period.
    :return: None -- problem lines are written to probs_ritva.txt and counts printed.
    """
    count, lines = 0, 0
    curr_daf = 0
    files = ["chiddushei one.txt", "chiddushei two.txt", "chiddushei three.txt",
             "chiddushei four.txt", "chiddushei five.txt"]
    # "with" blocks close the handles even on error (they were leaked before);
    # the unused "text" local was removed.
    with codecs.open('probs_ritva.txt', 'w', 'utf-8') as probs:
        for file in files:
            with codecs.open(file, 'r', 'utf-8') as open_file:
                for line in open_file:
                    line = line.replace('\n', '')
                    if len(line) == 0:
                        continue
                    if line.find(u"#") >= 0:
                        # Daf headers are delimited "#1 ... #2"; track the current daf.
                        start = line.find(u"#1")
                        end = line.find(u"#2")
                        if start > end or start == -1 or end == -1:
                            print '# error'
                        daf = line[start:end]
                        if daf.find(u'ע"ב') >= 0:
                            # Amud bet: advance one amud from the current daf.
                            curr_daf += 1
                        elif daf.find(u'דף') >= 0:
                            # Explicit daf: gematria value, amud-aleph numbering.
                            daf = daf.split(u" ")[1]
                            poss_daf = 2 * getGematria(daf) - 1
                            if poss_daf < curr_daf:
                                print 'daf error'
                            curr_daf = poss_daf
                        else:
                            print 'no daf'
                    line = line.replace('@1', '').replace('@2', '')
                    words = line.split()

                    for index, word in enumerate(words):

                        # NOTE(review): "lines" counts words examined, not
                        # lines -- kept as-is to preserve the printed totals.
                        lines += 1

                        if word.find(u'.') >= 0:
                            break

                        elif index > max_length:
                            # Period found too late: the dh is unreasonably long.
                            probs.write('file: ' + str(file) + "\n")
                            probs.write('current daf:' + AddressTalmud.toStr('en', curr_daf) + "\n")
                            probs.write('line without DH:\t' + ' '.join(words[:max_length]) + "\n\n\n")
                            count += 1
                            break

                    else:
                        # No period at all in the line: no dh present.
                        probs.write(u'file: ' + str(file) + u"\n")
                        probs.write(u'current daf:' + AddressTalmud.toStr('en', curr_daf) + u"\n")
                        probs.write(u'line without DH:\t' + u' '.join(words) + u"\n\n\n")
                        count += 1
    print count, lines
示例#8
0
def compileCommentaryIntoPage(title, daf):
    """Gather every Hebrew line of *title* on *daf* into a single list.

    Iterates section refs from "<title> <daf>.1" while the normalized ref
    still mentions the daf.
    """
    page = []
    ref = Ref(title + " " + AddressTalmud.toStr("en", daf) + ".1")
    while ref is not None and ref.normal().find(AddressTalmud.toStr("en", daf)) >= 0:
        for line in ref.text('he').text:
            page.append(line)
        # Hoisted: next_section_ref() used to be evaluated twice per loop.
        nxt = ref.next_section_ref()
        ref = nxt if nxt != ref else None
    return page
示例#9
0
def compileCommentaryIntoPage(title, daf):
    """Flatten all Hebrew lines of *title* on the given daf into one list,
    following the 'next' pointers returned by get_text_plus."""
    page = []
    next = title + " " + AddressTalmud.toStr("en", daf) + ".1"
    while next is not None and next.find(AddressTalmud.toStr("en", daf)) >= 0:
        response = get_text_plus(next)
        for he_line in response['he']:
            page.append(he_line)
        next = response['next']
    return page
def compileCommentaryIntoPage(title, daf):
    """Accumulate the Hebrew lines of *title* on *daf* across all of its
    sections into a single flat list."""
    page = []
    next = title + " " + AddressTalmud.toStr("en", daf) + ".1"
    while next is not None and next.find(AddressTalmud.toStr("en", daf)) >= 0:
        data = get_text_plus(next)
        page.extend(data['he'])
        next = data['next']
    return page
示例#11
0
def match_and_link(dhs, masechet):
    """Match the Ritva dhs of each daf against the gemara text and post the
    resulting commentary links."""
    matcher = Match(in_order=True, min_ratio=80, guess=False, range=True, can_expand=False)
    links = []
    for daf in dhs:
        daf_str = AddressTalmud.toStr("en", daf)
        talmud_text = get_text_plus(masechet + "." + daf_str)['he']
        result = matcher.match_list(dhs[daf], talmud_text)
        for line in result:
            # "0:" marks an unmatched prefix; strip it.
            talmud_range = result[line].replace("0:", "")
            links.append({
                'refs': [
                    "Ritva on " + masechet + "." + str(daf_str) + "." + str(line),
                    masechet + "." + daf_str + "." + talmud_range,
                ],
                'type': 'commentary',
                'auto': 'True',
                'generated_by': masechet + "Ritva",
            })
    post_link(links)
示例#12
0
def lookForLineInCommentary(title, daf, line_n):
    """Return "<section ref>.<local line>" for the line_n-th line of *title*
    on *daf*, or "" when the daf has fewer than line_n lines.
    """
    total_count = 0
    next = title + " " + AddressTalmud.toStr("en", daf) + ":1"
    # Guard against next being None: get_text_plus returns {'next': None} at
    # the end of the text, and the unguarded .find() raised AttributeError
    # (the Ref-based sibling of this function already had this check).
    while next is not None and next.find(AddressTalmud.toStr("en", daf)) >= 0:
        text = get_text_plus(next)
        local_count = 0
        for line in text['he']:
            local_count += 1
            total_count += 1
            if total_count == line_n:
                return next + "." + str(local_count)
        next = text['next']
    return ""
示例#13
0
def lookForLineInCommentary(title, daf, line_n):
    """Return the normalized ref of the line_n-th line of *title* on *daf*,
    or "" when the daf runs out of lines first."""
    seen = 0
    ref = Ref(title + " " + AddressTalmud.toStr("en", daf) + ":1")
    while ref is not None and ref.normal().find(AddressTalmud.toStr("en", daf)) >= 0:
        for position, _line in enumerate(ref.text('he').text, 1):
            seen += 1
            if seen == line_n:
                return ref.normal() + "." + str(position)
        ref = ref.next_section_ref() if ref.next_section_ref() != ref else None
    return ""
示例#14
0
def lookForLineInCommentary(title, daf, line_n):
    """Locate line number *line_n* of *title* on *daf* and return its ref
    ("<section>.<line>"), or "" if not found."""
    counted = 0
    ref = Ref(title + " " + AddressTalmud.toStr("en", daf) + ":1")
    while ref is not None and ref.normal().find(AddressTalmud.toStr("en", daf)) >= 0:
        for idx, _ in enumerate(ref.text('he').text):
            counted += 1
            if counted == line_n:
                return ref.normal() + "." + str(idx + 1)
        ref = ref.next_section_ref() if ref.next_section_ref() != ref else None
    return ""
def lookForLineInCommentary(title, daf, line_n):
    """Return "<section ref>.<local line>" for the line_n-th line of *title*
    on *daf*; "" if the daf has fewer than line_n lines.
    """
    total_count = 0
    next = title + " " + AddressTalmud.toStr("en", daf) + ":1"
    # Guard against a None 'next' from get_text_plus at the end of the text;
    # the unguarded .find() previously raised AttributeError there.
    while next is not None and next.find(AddressTalmud.toStr("en", daf)) >= 0:
        text = get_text_plus(next)
        local_count = 0
        for line in text['he']:
            local_count += 1
            total_count += 1
            if total_count == line_n:
                return next + "." + str(local_count)
        next = text['next']
    return ""
示例#16
0
    def postLinks(self):
        """Match Maharam dhs (tosafot/rashi/gemara) per daf and post the links.

        For every daf at or past 179, matches the stored dh lists against the
        corresponding Tosafot / Rashi / gemara texts, then dispatches each dh
        by category to build self.links_to_post, which is posted at the end.
        Relies on the module-level ``masechet``.
        """
        def base_tokenizer(str):
            # Drop parenthesized glosses, then split on whitespace.
            str = re.sub(ur"\([^\(\)]+\)", u"", str)
            word_list = re.split(ur"\s+", str)
            word_list = [w for w in word_list if w]  # remove empty strings
            return word_list

        mishnah_in_order = {}
        mishnah_out_order = {}
        links_to_post = []
        for daf in sorted(self.dh1_dict.keys()):
            # NOTE(review): dafs below 179 are skipped -- presumably already
            # handled in an earlier run; confirm before reusing this script.
            if daf < 179:
                continue
            print daf
            # Per-daf counters consumed by the category handlers below.
            self.maharam_line = 0
            self.rashi_line = -1
            self.tosafot_line = -1
            self.gemara_line = -1
            mishnah_line = 0
            tosafot1_arr = self.tosafot1_dict[daf]
            rashi1_arr = self.rashi1_dict[daf]
            gemara1_arr = self.gemara1_dict[daf]
            print "matching tosafot"+str(len(tosafot1_arr))
            tosafot_text = Ref("Tosafot on "+masechet+"."+AddressTalmud.toStr("en", daf)).text('he')
            tosafot1_arr = [text.decode('utf-8') for text in tosafot1_arr]
            tosafot_in_order = match_ref(tosafot_text, tosafot1_arr, base_tokenizer, self.dh_extract_method, verbose=True)
            tosafot_in_order = self.convertToOldFormat(tosafot_in_order)
            # Bava Batra past daf 57 is skipped for Rashi -- presumably
            # because Rashbam takes over there (cf. the getTC fallback).
            if not (masechet == "Bava Batra" and daf > 57):
                print "matching rashi"+str(len(rashi1_arr))
                rashi_text = Ref("Rashi on "+masechet+"."+AddressTalmud.toStr("en", daf)).text('he')
                rashi1_arr = [text.decode('utf-8') for text in rashi1_arr]
                rashi_in_order = match_ref(rashi_text, rashi1_arr, base_tokenizer, self.dh_extract_method, verbose=True)
                rashi_in_order = self.convertToOldFormat(rashi_in_order)
            print "matching gemara"+str(len(gemara1_arr))
            gemara_text = Ref(masechet+" "+AddressTalmud.toStr("en", daf)).text('he')
            gemara1_arr = [text.decode('utf-8') for text in gemara1_arr]
            gemara_in_order = match_ref(gemara_text, gemara1_arr, base_tokenizer, self.dh_extract_method, verbose=True)
            gemara_in_order = self.convertToOldFormat(gemara_in_order)
            dh1_arr = self.dh1_dict[daf]
            print "done matching"
            for category, dh in self.dh1_dict[daf]:
                print category
                # NOTE(review): if the Rashi branch above was skipped,
                # rashi_in_order here is stale or undefined -- verify the
                # Bava Batra > 57 case.
                if category == 'rashi' or category == 'tosafot':
                    self.RashiOrTosafot(daf, category, rashi_in_order, tosafot_in_order)
                elif category == 'gemara':
                    self.Gemara(daf, gemara_in_order)
                #elif category == "mishnah":
                #    self.Mishnah(daf, mishnah_in_order)
                elif category == 'paragraph' and self.maharam_line == 0:
                    self.maharam_line+=1
        post_link(self.links_to_post)
示例#17
0
 def Gemara(self, daf, results):
     """Queue a link between the next gemara match and this commentary line.

     A match of '0' means the dh was not found; the commentary ref is then
     recorded in self.missing_ones instead.
     """
     self.maharam_line += 1
     self.which_line['gemara'] += 1
     base_ref = results['gemara'][self.which_line['gemara']]
     comm_ref = self.title + " on " + self.masechet + "." + AddressTalmud.toStr("en", daf) + "." + str(self.maharam_line)
     if base_ref == '0':
         self.missing_ones.append(comm_ref)
     else:
         self.links_to_post.append({
             "refs": [base_ref, comm_ref],
             "type": "commentary",
             "auto": True,
             "generated_by": self.title + self.masechet + " linker",
         })
示例#18
0
 def Gemara(self, daf, gemara_in_order):
     """Link the next matched gemara line to this Maharam comment, or log the
     comment as missing when the match is '0'.  Uses the module-level
     ``masechet``."""
     self.maharam_line += 1
     self.gemara_line += 1
     matched = gemara_in_order[self.gemara_line]
     maharam_ref = "Maharam on " + masechet + "." + AddressTalmud.toStr("en", daf) + "." + str(self.maharam_line)
     if matched == '0':
         self.missing_ones.append(maharam_ref)
     else:
         self.links_to_post.append({
             "refs": [matched, maharam_ref],
             "type": "commentary",
             "auto": True,
             "generated_by": "Maharam on " + masechet + " linker",
         })
示例#19
0
 def getTC(self, category, daf, masechet):
     """Return the Hebrew TextChunk for *category* on the given daf.

     For 'rashi', falls back to Rashbam when the Rashi text is empty.
     Returns None for an unrecognized category.
     """
     daf_str = AddressTalmud.toStr("en", daf)
     if category == "tosafot":
         return Ref("Tosafot on " + masechet + "." + daf_str).text('he')
     if category == "gemara":
         return Ref(masechet + " " + daf_str).text('he')
     if category == "rashi":
         rashi = Ref("Rashi on " + masechet + "." + daf_str).text('he')
         if len(rashi.text) == 0:
             return Ref("Rashbam on " + masechet + "." + daf_str).text('he')
         return rashi
示例#20
0
 def Gemara(self, daf, results):
     """Record a gemara->commentary link for the current line, or mark the
     commentary ref as missing when the match is '0'."""
     self.maharam_line += 1
     self.which_line['gemara'] += 1
     source = results['gemara'][self.which_line['gemara']]
     target = self.title + " on " + self.masechet + "." + AddressTalmud.toStr("en", daf) + "." + str(self.maharam_line)
     if source == '0':
         self.missing_ones.append(target)
         return
     self.links_to_post.append({
         "refs": [source, target],
         "type": "commentary",
         "auto": True,
         "generated_by": self.title + self.masechet + " linker",
     })
def getLog(siman, result, dh_dict, comm):
    """Build human-readable log entries for unmatched and ambiguous dh
    matches on one siman/daf.  Uses the module-level ``title_book`` and
    ``title_comm`` for the sefaria.org URLs."""
    log = []
    for key in result:
        line_n = result[key]
        if line_n[0] == 0:
            # No match at all for this dh.
            entry = ("did not find dh:\n" + str(dh_dict[siman][key - 1]) +
                     "\n in " + title_book + ", Daf " +
                     AddressTalmud.toStr("en", siman) + ":")
            entry += ("\nwww.sefaria.org/" + title_book.replace(" ", "_") +
                      "." + AddressTalmud.toStr("en", siman))
            entry += ("\ntext:<b>" + str(dh_dict[siman][key - 1]) + ".</b> " +
                      str(comm[siman][key - 1]) + "\n\n")
            log.append(entry)
        elif len(line_n) > 1:
            # Ambiguous: several candidate lines, first one is the best guess.
            best = line_n[0]
            entry = ("looked for dh:\n" + str(dh_dict[siman][key - 1]) +
                     "\n in " + title_book + ", Daf " +
                     AddressTalmud.toStr("en", siman))
            entry += " and guessed the dh matches to line " + str(best) + ":"
            entry += ("\nwww.sefaria.org/" + title_comm.replace(" ", "_") +
                      "." + AddressTalmud.toStr("en", siman) + "." + str(best))
            entry += "\nbut other options include:\n"
            for guess in line_n:
                if guess != line_n[0]:
                    entry += ("line " + str(guess) + ": www.sefaria.org/" +
                              title_book.replace(" ", "_") + "." +
                              AddressTalmud.toStr("en", siman) + "." +
                              str(guess) + " ,\n")
            entry = entry[0:-1]  # drop the trailing newline of the last option
            log.append(entry + "\n\n")
    return log
示例#22
0
 def Commentary(self, daf, category, results):
     """Link the next *category* match to this commentary line and to the
     underlying gemara.

     A match of '0' means the dh was not found; the commentary ref is then
     recorded in self.missing_ones instead of being linked.
     """
     self.maharam_line += 1
     self.which_line[category] += 1
     # Removed an unused local ("title") that was computed but never read.
     base_ref = results[category][self.which_line[category]]
     comm_ref = self.title + " on " + self.masechet + "." + AddressTalmud.toStr(
         "en", daf) + "." + str(self.maharam_line)
     if base_ref == '0':
         self.missing_ones.append(comm_ref)
     else:
         self.links_to_post.append({
             "refs": [base_ref, comm_ref],
             "type":
             "commentary",
             "auto":
             True,
             "generated_by":
             self.title + self.masechet + " linker"
         })
         # Also link the commentary straight to the underlying gemara ref.
         gemara_ref = self.getGemaraRef(base_ref)
         self.links_to_post.append({
             "refs": [comm_ref, gemara_ref],
             "type":
             "commentary",
             "auto":
             True,
             "generated_by":
             self.title + self.masechet + " linker"
         })
示例#23
0
 def Commentary(self, daf, category, results):
     """Link the next *category* match to this commentary line and to the
     underlying gemara; a '0' match is recorded in self.missing_ones.
     """
     self.maharam_line += 1
     self.which_line[category] += 1
     # Removed an unused local ("title") that was computed but never read.
     base_ref = results[category][self.which_line[category]]
     comm_ref = self.title+" on "+self.masechet+"."+AddressTalmud.toStr("en", daf)+"."+str(self.maharam_line)
     if base_ref == '0':
         self.missing_ones.append(comm_ref)
     else:
         self.links_to_post.append({
             "refs": [base_ref, comm_ref],
             "type": "commentary",
             "auto": True,
             "generated_by": self.title+self.masechet+" linker"
         })
         # Second link: commentary directly to the gemara behind the base ref.
         gemara_ref = self.getGemaraRef(base_ref)
         self.links_to_post.append({
             "refs": [comm_ref, gemara_ref],
             "type": "commentary",
             "auto": True,
             "generated_by": self.title+self.masechet+" linker"
         })
示例#24
0
def find_matches(gemara, tosafot, rashi):
    """Match the dh dicts for gemara/rashi/tosafot on Ketubot and build ranges.

    For each daf, runs match_ref over the base text; segments opening with
    בא"ד ("in the same dibur") inherit the previous segment's match, as they
    genuinely have no dh of their own.  The matched results then replace the
    dh lists in place, and per-daf ranges are created from them.

    :return: the three (mutated) dicts: gemara, tosafot, rashi.
    """
    # Removed unused counters ("nones = total = 0") and normalized the stray
    # indentation in the second daf loop.
    for pairs in [(tosafot, "Tosafot on Ketubot"), (gemara, "Ketubot"),
                  (rashi, "Rashi on Ketubot")]:
        orig_dict = dict(pairs[0])
        which_dict = pairs[0]
        which_text = pairs[1]
        for daf in which_dict.keys():
            actual_daf = AddressTalmud.toStr("en", daf)
            base_text = TextChunk(Ref("{} {}".format(which_text, actual_daf)), lang='he')
            if not base_text.text:
                continue
            comments = which_dict[daf]
            results = match_ref(base_text, comments, lambda x: x.split(),
                                dh_extract_method=dh_extract)
            for i, result_comment in enumerate(zip(results["matches"], comments)):
                result, comment = result_comment
                comment_wout_bold = comment.replace("<b>", "").replace("</b>", "")
                # A comment opening with בא"ד continues the previous dibur,
                # so it takes the previous segment's match.
                if u"""בא"ד""" in u" ".join(comment_wout_bold.split()[0:3]) \
                        or u"""שם בא"ד""" in u" ".join(comment_wout_bold.split()[0:3]):
                    results["matches"][i] = results["matches"][i - 1]
            which_dict[daf] = results["matches"]

        for daf in which_dict.keys():
            if which_dict[daf] and orig_dict[daf]:
                which_dict[daf] = create_ranges(orig_dict, which_dict, which_text, daf)
    return gemara, tosafot, rashi
示例#25
0
    def getDaf(self, line, current_daf, len_masechet, prev_line):
        """Parse *line* for a daf header and update self.current_daf.

        Lines whose first word contains דף set the daf explicitly (gematria
        value, doubled for amud bet, minus one for amud aleph); any other
        line just advances the daf by one amud.  The header is stripped from
        the line into self.actual_text, per-daf dicts are seeded, and dafs
        that go backwards flag self.dont_post.

        NOTE(review): the *current_daf* and *prev_line* parameters are never
        read -- the state lives entirely on self.
        """
        prev_num = self.current_daf  # remembered to detect out-of-order dafs
        orig_line = line
        line = line.replace("@11 ", "@11")
        if line.split(" ")[0].find('דף') >= 0:
            # Explicit daf header: second word is the daf number in gematria.
            daf_value = getGematria(
                line.split(" ")[1].replace('"', '').replace("'", ''))
            if line.split(" ")[2].find(self.amud_bet) >= 0:
                self.current_daf = 2 * daf_value
            else:
                self.current_daf = 2 * daf_value - 1
            actual_text = ""
            start_at = 3
            # If the third word is not an amud marker, it is already body text.
            if line.split(" ")[2] not in ['ע"ב', 'ע"א']:
                start_at = 2
            for count, word in enumerate(line.split(" ")):
                if count >= start_at:
                    actual_text += word + " "
        else:
            # No header: advance one amud and drop the 3-character marker.
            self.current_daf += 1
            actual_text = line[3:]

        if self.current_daf <= prev_num:
            # Daf went backwards (or repeated): report it and block posting.
            he_current = AddressTalmud.toStr("he", self.current_daf)
            he_prev = AddressTalmud.toStr("he", prev_num)
            #prev_line = " ".join(prev_line.split(" ")[0:5])
            #orig_line = " ".join(orig_line.split(" ")[0:5])
            print u"{} before {}\n".format(he_prev, he_current)
            self.dont_post = True
            #print u"The line starting: {} is {}\n".format(prev_line, he_prev)
            #print u"It came before the line starting {}, which is {}\n\n".format(orig_line, he_current)

        if not self.current_daf in self.dh1_dict:
            # First time this daf is seen: seed the per-category dh lists.
            self.dh1_dict[self.current_daf] = []
            for each_cat in self.categories:
                self.dh_by_cat[each_cat][self.current_daf] = []
        self.actual_text = actual_text
        if self.current_daf > len_masechet:
            # Daf exceeds the tractate length: report only, keep going.
            print "DAF EXTRA {} > {} in {} {}".format(self.current_daf,
                                                      len_masechet, self.title,
                                                      self.masechet)
            pass
        self.list_of_dafs.append(self.current_daf)

        return self.current_daf
示例#26
0
    def postLinks(self):
        def base_tokenizer(str):
            str = re.sub(ur"\([^\(\)]+\)", u"", str)
            word_list = re.split(ur"\s+", str)
            word_list = [w for w in word_list if w]  # remove empty strings
            return word_list

        def dh_extract_method(str):
            str = str.replace(u'בד"ה', u'').replace(u'וכו', u'')
            return str

        '''
        1. strip out "" from dhs with list comprehension
        2. make dictionary where each dh str is key and the value is its index in the array
        '''
        links = []
        for daf in self.text:
            dhs_arr = [dh for dh in self.dhs[daf] if len(dh) > 0]
            gemara_text = Ref("{} {}".format(self.tractate,
                                             AddressTalmud.toStr(
                                                 "en", daf))).text('he')
            results = match_ref(gemara_text,
                                dhs_arr,
                                base_tokenizer,
                                dh_extract_method=dh_extract_method,
                                verbose=False)['matches']
            self.makeDicts(daf)
            rashba_refs = []
            for dh in dhs_arr:
                rashba_refs.append("Rashba on {} {}.{}".format(
                    self.tractate, AddressTalmud.toStr("en", daf),
                    self.dh_dict[daf][dh] + 1))
            link_pairs = zip(rashba_refs, results)
            for link_pair in link_pairs:
                if link_pair[1]:
                    links.append({
                        "refs": [link_pair[0], link_pair[1].normal()],
                        "type":
                        "commentary",
                        "auto":
                        True,
                        "generated_by":
                        "rashba{}".format(self.tractate)
                    })
        post_link(links, server=self.server)
示例#27
0
    def getDaf(self, line, current_daf, len_masechet, prev_line):
        """Parse a daf header out of *line* and advance self.current_daf.

        A leading word containing דף sets the daf explicitly (gematria value,
        doubled for amud bet, minus one for amud aleph); otherwise the daf
        advances by one amud.  The remaining text goes to self.actual_text,
        new dafs seed the per-category dicts, and a backwards daf flags
        self.dont_post.

        NOTE(review): *current_daf* and *prev_line* are unused parameters --
        all state lives on self.
        """
        prev_num = self.current_daf  # remembered to detect out-of-order dafs
        orig_line = line
        line = line.replace("@11 ", "@11")
        if line.split(" ")[0].find('דף')>=0:
            # Explicit header: second word is the daf number in gematria.
            daf_value = getGematria(line.split(" ")[1].replace('"', '').replace("'", ''))
            if line.split(" ")[2].find(self.amud_bet)>=0:
                self.current_daf = 2*daf_value
            else:
                self.current_daf = 2*daf_value - 1
            actual_text = ""
            start_at = 3
            # If the third word is not an amud marker, it is already body text.
            if line.split(" ")[2] not in ['ע"ב', 'ע"א']:
                start_at = 2
            for count, word in enumerate(line.split(" ")):
                if count >= start_at:
                    actual_text += word + " "
        else:
            # No header: advance one amud and drop the 3-character marker.
            self.current_daf += 1
            actual_text = line[3:]

        if self.current_daf <= prev_num:
            # Daf went backwards (or repeated): report it and block posting.
            he_current = AddressTalmud.toStr("he", self.current_daf)
            he_prev = AddressTalmud.toStr("he", prev_num)
            #prev_line = " ".join(prev_line.split(" ")[0:5])
            #orig_line = " ".join(orig_line.split(" ")[0:5])
            print u"{} before {}\n".format(he_prev, he_current)
            self.dont_post = True
            #print u"The line starting: {} is {}\n".format(prev_line, he_prev)
            #print u"It came before the line starting {}, which is {}\n\n".format(orig_line, he_current)


        if not self.current_daf in self.dh1_dict:
            # First time this daf is seen: seed the per-category dh lists.
            self.dh1_dict[self.current_daf] = []
            for each_cat in self.categories:
                self.dh_by_cat[each_cat][self.current_daf] = []
        self.actual_text = actual_text
        if self.current_daf > len_masechet:
            # Daf exceeds the tractate length: report only, keep going.
            print "DAF EXTRA {} > {} in {} {}".format(self.current_daf, len_masechet, self.title, self.masechet)
            pass
        self.list_of_dafs.append(self.current_daf)

        return self.current_daf
示例#28
0
    def RashiOrTosafot(self, daf, category, results):
        """Link the next rashi/tosafot match to this Maharshal comment.

        Unmatched ('0') results are recorded in self.missing_ones; otherwise
        two links are queued: base text <-> Maharshal, and Maharshal <-> the
        underlying gemara ref.  Uses the module-level ``masechet``.
        """
        self.maharam_line += 1
        # Removed unused "title" locals that were computed but never read.
        if category == 'rashi':
            self.rashi_line += 1
            ref = results[category][self.rashi_line]
        elif category == 'tosafot':
            self.tosafot_line += 1
            ref = results[category][self.tosafot_line]

        maharshal_ref = ("Maharshal on " + masechet + "." +
                         AddressTalmud.toStr("en", daf) + "." +
                         str(self.maharam_line))
        if ref == '0':
            self.missing_ones.append(maharshal_ref)
        else:
            self.links_to_post.append({
                "refs": [ref, maharshal_ref],
                "type":
                "commentary",
                "auto":
                True,
                "generated_by":
                "Maharshal on " + masechet + " linker"
            })
            # Second link: Maharshal straight to the gemara behind the ref.
            gemara_ref = self.getGemaraRef(ref)
            self.links_to_post.append({
                "refs": [maharshal_ref, gemara_ref],
                "type":
                "commentary",
                "auto":
                True,
                "generated_by":
                "Maharshal on " + masechet + " linker"
            })
示例#29
0
def create_link_text(source_index, line_number, comment_number):
    """Build a link dict between a Sanhedrin line and its Yad Ramah comment."""
    amud = AddressTalmud.toStr('en', source_index)
    talmud_ref = "Sanhedrin {}.{}".format(amud, line_number)
    commentary_ref = "Yad Ramah on Sanhedrin {}.{}".format(amud, comment_number)
    return {
        "refs": [talmud_ref, commentary_ref],
        "type": "commentary",
    }
示例#30
0
def post(text, dh_dict, tractate):
    """Upload the Ramban text and post links between each comment and its
    matched gemara line."""
    post_text("Chiddushei Ramban on " + tractate, {
        "text": convertDictToArray(text),
        "versionTitle": "Ramban on Talmud",
        "versionSource": "http://www.sefaria.org",
        "language": "he",
    })
    links_to_post = []
    daf_array = get_text_plus(tractate)['he']
    matcher = Match(in_order=True, min_ratio=80, guess=False, range=True,
                    can_expand=False)
    for daf in sorted(dh_dict.keys()):
        daf_str = AddressTalmud.toStr("en", daf)
        results = matcher.match_list(dh_dict[daf], daf_array[daf - 1],
                                     tractate + " " + daf_str)
        for key, value in results.iteritems():
            # "0:" marks an unmatched prefix; strip it.
            cleaned = value.replace("0:", "")
            links_to_post.append({
                'refs': [
                    tractate + "." + daf_str + "." + cleaned,
                    "Chiddushei_Ramban_on_" + tractate + "." + daf_str +
                    "." + str(key),
                ],
                'type': 'commentary',
                'auto': 'True',
                'generated_by': "ramban" + tractate,
            })
    post_link(links_to_post)
示例#31
0
def get_matches_for_dict_and_link(dh_dict, base_text_title, commentary_title, talmud=True, lang='he', word_threshold=0.27, server="", rashi_filter=None, dh_extract_method=lambda x: x):
    def base_tokenizer(str):
        str_list = str.split(" ")
        return [str for str in str_list if len(str) > 0]


    assert len(server) > 0, "Please specify a server"
    results = {}
    links = []
    matched = 0
    total = 0
    for daf in dh_dict:
        print daf
        dhs = dh_dict[daf]
        if talmud:
            base_text_ref = "{} {}".format(base_text_title, AddressTalmud.toStr("en", daf))
            comm_ref = "{} on {} {}".format(commentary_title, base_text_title, AddressTalmud.toStr("en", daf))
        else:
            base_text_ref = "{} {}".format(base_text_title, daf)
            comm_ref = "{} on {} {}".format(commentary_title, base_text_title, daf)
        base_text = TextChunk(Ref(base_text_ref), lang=lang)
        comm_text = TextChunk(Ref(comm_ref), lang=lang)
        results[daf] = match_ref(base_text, comm_text, base_tokenizer=base_tokenizer, word_threshold=word_threshold, rashi_filter=rashi_filter, dh_extract_method=dh_extract_method)["matches"]
        for count, link in enumerate(results[daf]):
            if link:
                base_end = link.normal()
                comm_end = "{} on {} {}:{}".format(commentary_title, base_text_title, AddressTalmud.toStr("en", daf), count+1)
                links.append({
                    "refs": [base_end, comm_end],
                    "auto": True,
                    "type": "commentary",
                    "generated_by": commentary_title+base_text_title
                })
                matched += 1
            total += 1
    print "Matched: {}".format(matched)
    print "Total {}".format(total)
    post_link(links, server=server)

    return results
示例#32
0
 def Rosh(self, perek, daf, dh, results):
     """Advance the line counters and, if this Rosh comment matched, link it."""
     self.maharam_line += 1
     self.rosh_line += 1
     match = results[perek-1][self.rosh_line]
     if not match:
         return
     comm_ref = self.title+" on "+self.masechet+"."+AddressTalmud.toStr("en", daf)+"."+str(self.maharam_line)
     self.links_to_post.append({
         "refs": [match.normal(), comm_ref],
         "type": "commentary",
         "auto": True,
         "generated_by": self.title+self.masechet+" linker",
     })
示例#33
0
 def RashiOrTosafot(self, daf, category, rashi_in_order, tosafot_in_order):
     """Link the current Maharam line to its Rashi or Tosafot counterpart,
     recording unmatched lines in self.missing_ones.

     NOTE(review): if *category* is neither 'rashi' nor 'tosafot', `in_order`
     is never bound and the comparison below raises a NameError -- confirm
     callers only pass these two values. `masechet` is read from module
     scope, and `title` is assigned but never used.
     """
     if category == 'rashi':
         self.maharam_line+=1
         self.rashi_line+=1
         title = 'Rashi on '+masechet
         in_order = rashi_in_order[self.rashi_line]
     elif category == 'tosafot':
         self.maharam_line+=1
         self.tosafot_line+=1
         title = 'Tosafot on '+masechet
         in_order = tosafot_in_order[self.tosafot_line]
     # '0' marks "no match was found" for this line.
     if in_order == '0':
         self.missing_ones.append("Maharam on "+masechet+"."+AddressTalmud.toStr("en", daf)+"."+str(self.maharam_line))
     else:
         self.links_to_post.append({
             "refs": [
                          in_order,
                         "Maharam on "+masechet+"."+AddressTalmud.toStr("en", daf)+"."+str(self.maharam_line)
                     ],
             "type": "commentary",
             "auto": True,
             "generated_by": "Maharam on "+masechet+" linker"})
示例#34
0
 def Rosh(self, perek, daf, dh, results):
     """Append a link for the current Rosh comment when a match exists."""
     self.maharam_line += 1
     self.rosh_line += 1
     result = results[perek-1][self.rosh_line]
     if result:
         maharam_ref = (self.title+" on "+self.masechet+"."
                        + AddressTalmud.toStr("en", daf)+"."+str(self.maharam_line))
         link = {
             "refs": [result.normal(), maharam_ref],
             "type": "commentary",
             "auto": True,
             "generated_by": self.title+self.masechet+" linker",
         }
         self.links_to_post.append(link)
示例#35
0
def post(text, dh_dict, tractate):
    """Upload Chiddushei Ramban on *tractate* and link each matched comment
    to its gemara line.

    Relies on a module-level ``match`` (a Match instance) being in scope.
    Fixed: the original assigned ``talmud_end`` twice with identical
    right-hand sides; the duplicate line is removed.
    """
    text_array = convertDictToArray(text)
    send_text = {
        "text": text_array,
        "versionTitle": "Chiddushei HaRamban, Jerusalem 1928-29",
        "versionSource": "http://primo.nli.org.il/primo_library/libweb/action/dlDisplay.do?vid=NLI&docId=NNL_ALEPH001294828",
        "language": "he"
    }
    post_text("Chiddushei Ramban on "+tractate, send_text)
    links_to_post = []

    for daf in sorted(dh_dict.keys()):
        dh_list = dh_dict[daf]
        daf_text = Ref(tractate+" "+AddressTalmud.toStr("en", daf)).text('he').text
        results = match.match_list(dh_list, daf_text, tractate+" "+AddressTalmud.toStr("en", daf))
        for key, value in results.iteritems():
            # Strip the "0:" no-match prefix; value is a line number or range.
            value = value.replace("0:", "")
            talmud_end = tractate + "." + AddressTalmud.toStr("en", daf) + "." + value
            ramban_end = "Chiddushei_Ramban_on_" + tractate + "." + AddressTalmud.toStr("en", daf) + "." + str(key)
            links_to_post.append({'refs': [talmud_end, ramban_end], 'type': 'commentary', 'auto': 'True', 'generated_by': "ramban"+tractate})
    post_link(links_to_post)
示例#36
0
def getLog(siman, result, dh_dict, comm):
    """Build human-readable log entries for unmatched and ambiguous dh matches.

    Reads title_book and title_comm from module scope.
    """
    log = []
    for key in result:
        line_n = result[key]
        if line_n[0] == 0:
            # No match at all for this dh.
            entry = "did not find dh:\n"+str(dh_dict[siman][key-1])+"\n in "+title_book+", Daf "+AddressTalmud.toStr("en", siman)+":"
            entry += "\nwww.sefaria.org/"+title_book.replace(" ", "_")+"."+AddressTalmud.toStr("en", siman)
            entry += "\ntext:<b>"+str(dh_dict[siman][key-1])+".</b> "+str(comm[siman][key-1])+"\n\n"
            log.append(entry)
        elif len(line_n) > 1:
            # Several candidate lines: record the best guess plus alternatives.
            bestGuess = line_n[0]
            entry = "looked for dh:\n"+str(dh_dict[siman][key-1])+"\n in "+title_book+", Daf "+AddressTalmud.toStr("en", siman)
            entry += " and guessed the dh matches to line "+str(bestGuess)+":"
            entry += "\nwww.sefaria.org/"+title_comm.replace(" ", "_")+"."+AddressTalmud.toStr("en", siman)+"."+str(bestGuess)
            entry += "\nbut other options include:\n"
            for guess in line_n:
                if guess != line_n[0]:
                    entry += "line " +str(guess)+": www.sefaria.org/"+title_book.replace(" ", "_")+"."+AddressTalmud.toStr("en", siman)+"."+str(guess)+" ,\n"
            # Drop the trailing newline of the last alternative.
            entry = entry[0:-1]
            log.append(entry+"\n\n")
    return log
示例#37
0
 def postLinks(self):
     """Match every daf's non-empty dhs against the gemara text and post
     Rashba links to the server."""
     def base_tokenizer(str):
         # Drop parenthesized asides, then split on whitespace.
         str = re.sub(ur"\([^\(\)]+\)", u"", str)
         word_list = re.split(ur"\s+", str)
         word_list = [w for w in word_list if w]  # remove empty strings
         return word_list
     def dh_extract_method(str):
         # Strip the quotation markers before matching.
         str = str.replace(u'בד"ה', u'').replace(u'וכו', u'')
         return str
     '''
     1. strip out "" from dhs with list comprehension
     2. make dictionary where each dh str is key and the value is its index in the array
     '''
     links = []
     for daf in self.text:
         dhs_arr = [dh for dh in self.dhs[daf] if len(dh) > 0]
         gemara_text = Ref("{} {}".format(self.tractate, AddressTalmud.toStr("en", daf))).text('he')
         results = match_ref(gemara_text, dhs_arr, base_tokenizer, dh_extract_method=dh_extract_method, verbose=False)['matches']
         # Refresh self.dh_dict for this daf before building refs.
         self.makeDicts(daf)
         rashba_refs = []
         for dh in dhs_arr:
             # dh_dict values are 0-based indices; refs are 1-based.
             rashba_refs.append("Rashba on {} {}.{}".format(self.tractate, AddressTalmud.toStr("en", daf), self.dh_dict[daf][dh]+1))
         link_pairs = zip(rashba_refs, results)
         for link_pair in link_pairs:
             # Only matched dhs (truthy result Refs) become links.
             if link_pair[1]:
                 links.append(
                 {
                 "refs": [
                              link_pair[0],
                             link_pair[1].normal()
                         ],
                 "type": "commentary",
                 "auto": True,
                 "generated_by": "rashba{}".format(self.tractate)
                 }
                 )
     post_link(links, server=self.server)
示例#38
0
def match_and_link(dhs, masechet):
    """Match Ritva dibur hamatchil headers per daf and post commentary links."""
    matcher = Match(in_order=True, min_ratio=80, guess=False, range=True,
                    can_expand=False)
    links = []
    for daf in dhs:
        daf_str = AddressTalmud.toStr("en", daf)
        talmud_text = get_text_plus(masechet + "." + daf_str)['he']
        matches = matcher.match_list(dhs[daf], talmud_text)
        for line in matches:
            # Strip the "0:" no-match prefix from the matched line/range.
            talmud_range = matches[line].replace("0:", "")
            ritva_ref = "Ritva on " + masechet + "." + str(daf_str) + "." + str(line)
            talmud_ref = masechet + "." + daf_str + "." + talmud_range
            links.append({
                'refs': [ritva_ref, talmud_ref],
                'type': 'commentary',
                'auto': 'True',
                'generated_by': masechet + "Ritva"
            })
    post_link(links)
示例#39
0
def match_and_link(text, masechet):
    """Split each daf's lines into (dh, comment) pairs and match the dhs
    against the gemara text.

    A line is split at the abbreviation marker, else at the first period,
    else after the first 10 words (via splitText). Dapim start at folio 2a,
    hence the daf_count+3 offset for AddressTalmud.

    Fixed: removed a leftover pdb.set_trace() debugging breakpoint.
    """
    match = Match(in_order=True, min_ratio=80, guess=False, range=True, can_expand=False)
    for daf_count, daf in enumerate(text):
        dhs = []
        comments = []
        for each_line in daf:
            if each_line.find("כו'") >= 0:
                dh, comment = each_line.split("כו'", 1)
            elif each_line.find(".") >= 0:
                dh, comment = each_line.split(".", 1)
            else:
                # No explicit delimiter: take the first 10 words as the dh.
                dh, comment = splitText(each_line, 10)
            dhs.append(dh)
            comments.append(comment)
        talmud_text = get_text_plus(masechet+"."+AddressTalmud.toStr("en", daf_count+3))['he']
        result = match.match_list(dhs, talmud_text)
示例#40
0
 def getTC(self, category, daf, masechet):
     """Return the Hebrew TextChunk for *category* on the given daf.

     Falls back from Rashi to Rashbam when the Rashi text is empty.
     Returns None implicitly for any unrecognized category.

     NOTE(review): mixes the *masechet* parameter with self.masechet in the
     rashi/rashbam branches -- confirm they always agree.
     """
     if category in ["tosafot", "ran", "rosh"]:
         title = "{} on {}".format(category.title(), masechet)
         return Ref(title+"."+AddressTalmud.toStr("en", daf)).text('he')
     elif category == "gemara":
         return Ref(masechet+" "+AddressTalmud.toStr("en", daf)).text('he')
     elif category == "rashi":
         rashi = Ref("Rashi on "+self.masechet+"."+AddressTalmud.toStr("en", daf)).text('he')
         # Empty Rashi text means this daf carries Rashbam instead.
         if len(rashi.text) == 0:
             print "rashbam by default {} {}".format(masechet, AddressTalmud.toStr("en", daf))
             return Ref("Rashbam on "+self.masechet+"."+AddressTalmud.toStr("en", daf)).text('he')
         else:
             return rashi
     elif category == "rashbam":
         print "rashbam {} {}".format(masechet, AddressTalmud.toStr("en", daf))
         return Ref("Rashbam on "+self.masechet+"."+AddressTalmud.toStr("en", daf)).text('he')
示例#41
0
 def getTC(self, category, daf, masechet):
     """Fetch the Hebrew TextChunk for a commentary category on one daf.

     For "rashi", substitutes Rashbam when the Rashi text is empty; an
     unknown category falls through and returns None.

     NOTE(review): uses self.masechet rather than the *masechet* parameter
     in the rashi/rashbam branches -- verify both always refer to the same
     tractate.
     """
     if category in ["tosafot", "ran", "rosh"]:
         title = "{} on {}".format(category.title(), masechet)
         return Ref(title+"."+AddressTalmud.toStr("en", daf)).text('he')
     elif category == "gemara":
         return Ref(masechet+" "+AddressTalmud.toStr("en", daf)).text('he')
     elif category == "rashi":
         rashi = Ref("Rashi on "+self.masechet+"."+AddressTalmud.toStr("en", daf)).text('he')
         if len(rashi.text) == 0:
             # No Rashi here: this daf carries Rashbam instead.
             print "rashbam by default {} {}".format(masechet, AddressTalmud.toStr("en", daf))
             return Ref("Rashbam on "+self.masechet+"."+AddressTalmud.toStr("en", daf)).text('he')
         else:
             return rashi
     elif category == "rashbam":
         print "rashbam {} {}".format(masechet, AddressTalmud.toStr("en", daf))
         return Ref("Rashbam on "+self.masechet+"."+AddressTalmud.toStr("en", daf)).text('he')
示例#42
0
 def Mishnah(self, daf, mishnah_in_order):
     """Link the current Maharam line to the matching Mishnah reference.

     NOTE(review): this method appears broken as written:
     - `mishnah_line += 1` increments a local that was never initialized
       (raises at runtime); presumably self.mishnah_line was intended.
     - `in_order`/`out_order` are parsed and converted to int but never
       used; the refs below index [0] of the match string instead.
     - `mishnah_out_order` on the end-ref line is an undefined name;
       probably `out_order` (or the same dict) was intended.
     Confirm intent before relying on this method.
     """
     self.maharam_line += 1
     mishnah_line += 1
     pos = 0
     # Walk all perakim in order, counting matches until reaching the
     # current mishnah line.
     for perek in self.mishnah1_dict:
         for key in mishnah_in_order[perek]:
             pos += 1
             if pos == mishnah_line:
                 mishnah_in_order[perek][key] = mishnah_in_order[perek][
                     key].replace('0:', '')
                 if mishnah_in_order[perek][key].find('-') >= 0:
                     in_order, out_order = mishnah_in_order[perek][
                         key].split('-')
                 else:
                     in_order = mishnah_in_order[perek][key]
                     out_order = in_order
                 in_order = int(in_order)
                 out_order = int(out_order)
                 masechet_daf_line_start = "Mishnah " + masechet + "." + str(
                     perek) + "." + str(mishnah_in_order[perek][key][0])
                 masechet_daf_line_end = "Mishnah " + masechet + "." + str(
                     perek) + "." + str(mishnah_out_order[perek][key][0])
                 # Prefer a ranged ref; fall back to the start ref if the
                 # range cannot be constructed.
                 try:
                     masechet_daf_line = Ref(masechet_daf_line_start).to(
                         Ref(masechet_daf_line_end)).normal()
                 except:
                     masechet_daf_line = masechet_daf_line_start
                 self.links_to_post.append({
                     "refs": [
                         masechet_daf_line, "Maharam " + masechet + "." +
                         AddressTalmud.toStr("en", daf) + "." +
                         str(self.maharam_line)
                     ],
                     "type":
                     "commentary",
                     "auto":
                     True,
                     "generated_by":
                     "Maharam on " + masechet + " linker",
                 })
示例#43
0
def find_matches(gemara, tosafot, rashi):
    # what needs to be done is to go through each dict and try to match everything, but check each segment that if it is בא"ד
    # ignore if it has a match and match it to previous segment's match
    # and if no match: link with previous segment (as a range) as if this comment really has no DH which is why it has no match

    """Match each comment dict against its Ketubot base text in place.

    Mutates the three input dicts: each daf's comment list is replaced by
    its list of match results (then by ranges via create_ranges), and the
    mutated dicts are returned.
    """
    nones = total = 0
    for pairs in [(tosafot, "Tosafot on Ketubot"), (gemara, "Ketubot"),
                  (rashi, "Rashi on Ketubot")]:
        # Keep a shallow copy of the original comments; which_dict is
        # overwritten with match results below.
        orig_dict = dict(pairs[0])
        which_dict = pairs[0]
        which_text = pairs[1]
        for daf in which_dict.keys():
            actual_daf = AddressTalmud.toStr("en", daf)
            base_text = TextChunk(Ref("{} {}".format(which_text, actual_daf)),
                                  lang='he')
            if not base_text.text:
                continue
            comments = which_dict[daf]
            results = match_ref(base_text,
                                comments,
                                lambda x: x.split(),
                                dh_extract_method=dh_extract)
            for i, result_comment in enumerate(
                    zip(results["matches"], comments)):
                result, comment = result_comment
                comment_wout_bold = comment.replace("<b>",
                                                    "").replace("</b>", "")
                # A comment opening with בא"ד continues the previous dh, so
                # reuse the previous segment's match.
                if u"""בא"ד""" in u" ".join(comment_wout_bold.split()[0:3]) \
                        or u"""שם בא"ד""" in u" ".join(comment_wout_bold.split()[0:3]):
                    results["matches"][i] = results["matches"][i - 1]
            which_dict[daf] = results["matches"]

        for daf in which_dict.keys():
            if which_dict[daf] and orig_dict[daf]:
                which_dict[daf] = create_ranges(orig_dict, which_dict,
                                                which_text, daf)
    return gemara, tosafot, rashi
示例#44
0
def create_links(sanhedrin_ja, yad_ramah_ja):
    """Walk the two parallel jagged arrays amud-by-amud, match the opening
    words of each Yad Ramah paragraph against the Sanhedrin text, and
    return the list of link records."""
    list_of_links = []
    amud_number = 1
    match_object = Match(in_order=True,
                         min_ratio=80,
                         guess=False,
                         range=False,
                         can_expand=True)
    for amud_of_sanhedrin, amud_yad_ramah in zip(sanhedrin_ja, yad_ramah_ja):
        ref = 'Sanhedrin {}'.format(AddressTalmud.toStr('en', amud_number))
        the_first_few_words = take_the_first_few_words_of_each_paragraph(
            amud_yad_ramah)
        matches_dict = match_object.match_list(the_first_few_words,
                                               amud_of_sanhedrin, ref)
        for key in matches_dict:
            for match in matches_dict[key]:
                # 0 marks "no match" for this candidate line.
                if match != 0:
                    # print'Amud: {} comment: {} corresponds to {}'.format(AddressTalmud.toStr('en', amud_number), key, match)
                    print create_link_text(amud_number, match, key)
                    list_of_links.append(
                        create_link_text(amud_number, match, key))
        amud_number += 1

    return list_of_links
示例#45
0
            pdb.set_trace()
        before_dh = ""
        just_added_dh = False
        prev_line = line
        temp_text = ""

for daf in comm_dict.keys():
    if daf not in dh_dict.keys():
        pdb.set_trace()
        send_text = {
            "versionTitle": "Rashba on Bava Batra",
            "versionSource": "http://www.sefaria.org",
            "language": "en",
            "text": comm_dict[daf],
        }
        post_text("Rashba on Bava Batra." + AddressTalmud.toStr("en", daf),
                  send_text)
result = {}
guess = 0
no_guess = 0
for daf in dh_dict.keys():
    text = get_text("Bava Batra." + AddressTalmud.toStr("en", daf))
    try:
        match_obj = Match(in_order=True,
                          min_ratio=70,
                          guess=False,
                          range=True,
                          maxLine=len(text) - 1)
    except:
        pdb.set_trace()
    dh_arr = []
示例#46
0
         text_dict[perek] = convertDictToArray(text_dict[perek],
                                               empty="")
 links = []
 send_text = {
     "text": convertDictToArray(text_dict),
     "versionTitle":
     "Senlake edition 2019 based on Ben Yehoyada, Jerusalem, 1897",
     "versionSource":
     "http://beta.nli.org.il/he/books/NNL_ALEPH001933802/NLIl",
     "language": "he"
 }
 post_text("Ben Yehoyada on {}".format(title),
           send_text,
           index_count="on")
 for daf, text in text_dict.items():
     daf = AddressTalmud.toStr("en", daf) if title != "Eduyot" else daf
     try:
         base = TextChunk(Ref("{} {}".format(title, daf)), lang='he')
     except InputError as e:
         print(e)
         continue
     try:
         results = match_ref(base,
                             text,
                             lambda x: x.split(),
                             dh_extract_method=dher)
         for i, ref in enumerate(results["matches"]):
             if ref:
                 berakhot = "Ben Yehoyada on {} {}:{}".format(
                     title, daf, i + 1)
                 links.append({
示例#47
0
# Accumulators for the matching pass and the parsed Tosafot text.
book = {}
total = 0
non_match = 0
guess = 0
matched = 0
log = []
dh_dict = {}
tosafot_comments = {}
prev_line = 0
# Read 78 per-daf Tosafot files for Gittin, starting at index 100
# (dict keys are offset by +3 to get the actual daf index).
for j in range(78):
    i = j + 100
    count = 0
    tosafot_comments[i + 3] = []
    dh_dict[i + 3] = []
    # File names are the Hebrew daf string with gershayim/geresh and
    # spaces normalized away.
    he_daf = u"גיטין_"
    he_daf += AddressTalmud.toStr("he", i + 3)
    he_daf = he_daf.replace(u"\u05f4", u"")
    he_daf = he_daf.replace(u"׳", u"")
    he_daf = he_daf.replace(" ", "_")
    he_daf = he_daf + ".txt"
    f = open("../Noah-Santacruz-rashiPosting/Tosafot/" + he_daf, 'r')
    for line in f:
        line = line.replace("\n", "")
        something = line.replace(" ", "")
        if len(something) > 0:
            # Every other non-blank line is a Tosafot comment; its dh is
            # the text before the first period.
            if count % 2 == 1:
                tosafot_comments[i + 3].append(line)
                dh = line.split(".")[0]
                dh_dict[i + 3].append(dh)
            count += 1
    f.close()
comm = {}
book = {}
total = 0
non_match = 0
guess = 0
matched = 0
log = []
dh_dict = {}
rashi_comments = {}
prev_line = 0
for i in range(150): #152 
	count = 0
	rashi_comments[i+3] = []
	dh_dict[i+3] = []
	he_daf = u"עבודה זרה_"
	he_daf += AddressTalmud.toStr("he", i+3)
	he_daf = he_daf.replace(u"\u05f4", u"")
	he_daf = he_daf.replace(u"׳", u"")
	he_daf = he_daf.replace(" ", "_")
	he_daf = he_daf + ".txt"
	f = open("../Noah-Santacruz-rashiPosting/Tosafot/"+he_daf, 'r')
	for line in f:
		line = line.replace("\n", "")
		something = line.replace(" ", "")
		if len(something) > 0:
			if count % 2 == 0:
				dh_dict[i+3].append(line)
			else:
				if line.find(" - ")==-1:
					line = line.replace(".", " - ", 1)
				rashi_comments[i+3].append(line)
			comm_dict[daf].append(comm)
			if hasTags(comm) or hasTags(dh) or hasTags(before_dh):
				pdb.set_trace()
			if just_added_dh == False:
				dh_dict[daf].append("")
			just_added_dh = False
		before_dh =""
		temp_text = ""
result = {}
guess=0
no_guess=0
for daf in dh_dict.keys():
	if len(dh_dict[daf]) != len(comm_dict[daf]):
		pdb.set_trace()
for daf in dh_dict.keys():
	text = get_text("Gittin."+AddressTalmud.toStr("en", daf))
	try:
		match_obj=Match(in_order=True, min_ratio=70, guess=False, range=True, maxLine=len(text)-1)
	except:
		pdb.set_trace()
	dh_arr = []
	for i in range(len(dh_dict[daf])):
		if len(dh_dict[daf][i]) > 0:
			dh_arr.append(dh_dict[daf][i])
	result[daf] = match_obj.match_list(dh_arr, text)
	dh_count = 1
	'''
	if len(dh_dict[daf][i]) == 0, then comm_dict[daf][i] gets added to comm_dict[daf][i-1]+"<br>"
	'''
	for i in range(len(comm_dict[daf])):
		 if (daf, i) in before_dh_dict:
示例#50
0
guess = 0
matched = 0
log = []
dh_dict = {}
rashi_comments = {}
prev_line = 0

title_book = "Keritot"
title_comm = "Tosafot on Keritot"

for i in range(54):
    count = 0
    rashi_comments[i + 3] = []
    dh_dict[i + 3] = []
    he_daf = u"כריתות_"
    he_daf += AddressTalmud.toStr("he", i + 3)
    he_daf = he_daf.replace(u"\u05f4", u"")
    he_daf = he_daf.replace(u"׳", u"")
    he_daf = he_daf.replace(" ", "_")
    he_daf = he_daf + ".txt"
    f = open("../Noah-Santacruz-rashiPosting/Tosafot/" + he_daf, 'r')
    for line in f:
        line = line.replace("\n", "")
        something = line.replace(" ", "")
        if len(something) > 0:
            if count % 2 == 0:
                dh_dict[i + 3].append(line)
            else:
                if line.find(" - ") == -1:
                    line = line.replace(".", " - ", 1)
                rashi_comments[i + 3].append(line)
示例#51
0
			"versionSource": "http://www.sefaria.org/",
			"language": "he",
			"text": [comm],
			}
		post_text("Yad Ramah on Bava Batra, Perek "+str(current_perek)+", Comment "+str(comment_key), text)

match_obj=Match(in_order=True, min_ratio=80, guess=False, range=True)
skipped_arr = []
result = {}
for current_perek in range(10):
	current_perek+=1
	print current_perek
	search_for = 0
	for daf in sorted(daf_dict[current_perek].keys()):			
		print daf
		text = get_text("Bava Batra."+AddressTalmud.toStr("en", daf))
		dh_list = daf_dict[current_perek][daf]
		result[daf] = match_obj.match_list(dh_list, text, "Bava Batra "+AddressTalmud.toStr("en", daf))
		print result[daf]
		for key in result[daf]:
			if result[daf][key].find("0:") >= 0:
				result[daf][key] = result[daf][key].replace("0:","")
			search_for += 1
			line_n = result[daf][key]
			count = 0
			for comment_key in comments_order[current_perek]:
				count+=1
				if comment_key not in comm_dict[current_perek]:
					if comment_key not in skipped_arr:
						search_for+=1
						skipped_arr.append(comment_key)
# Accumulators for the matching pass and the parsed Tosafot text.
book = {}
total = 0
non_match = 0
guess = 0
matched = 0
log = []
dh_dict = {}
tosafot_comments = {}
prev_line = 0
# Read 24 per-daf Tosafot files for Bava Metzia, starting at index 210
# (dict keys are offset by +3 for the actual daf index).
for j in range(24):  # 234
    i = j + 210
    count = 0
    tosafot_comments[i + 3] = []
    dh_dict[i + 3] = []
    # File names use the Hebrew daf string with gershayim/geresh and
    # spaces normalized away.
    he_daf = u"בבא מציעא_"
    he_daf += AddressTalmud.toStr("he", i + 3)
    he_daf = he_daf.replace(u"\u05f4", u"")
    he_daf = he_daf.replace(u"׳", u"")
    he_daf = he_daf.replace(" ", "_")
    he_daf = he_daf + ".txt"
    f = open("../Noah-Santacruz-rashiPosting/Tosafot/" + he_daf, "r")
    for line in f:
        line = line.replace("\n", "")
        something = line.replace(" ", "")
        if len(something) > 0:
            # Every other non-blank line is a comment; its dh is the text
            # before the first period.
            if count % 2 == 1:
                tosafot_comments[i + 3].append(line)
                dh = line.split(".")[0]
                dh_dict[i + 3].append(dh)
            count += 1
    f.close()
# Accumulators for the matching pass and the parsed Rashi text.
guess = 0
matched = 0
log = []
dh_dict = {}
rashi_comments = {}
prev_line = 0

title_book = "Keritot"
title_comm = "Rashi on Keritot"

# Read 54 per-daf Rashi files for Keritot (dict keys offset by +3).
for i in range(54):
    count = 0
    rashi_comments[i + 3] = []
    dh_dict[i + 3] = []
    # File names use the Hebrew daf string with gershayim/geresh and
    # spaces normalized away.
    he_daf = u"כריתות_"
    he_daf += AddressTalmud.toStr("he", i + 3)
    he_daf = he_daf.replace(u"\u05f4", u"")
    he_daf = he_daf.replace(u"׳", u"")
    he_daf = he_daf.replace(" ", "_")
    he_daf = he_daf + ".txt"
    f = open("../Noah-Santacruz-rashiPosting/Rashi/" + he_daf, "r")
    for line in f:
        line = line.replace("\n", "")
        something = line.replace(" ", "")
        if len(something) > 0:
            # Non-blank lines alternate: dh, then its comment.
            if count % 2 == 0:
                dh_dict[i + 3].append(line)
            else:
                rashi_comments[i + 3].append(line)
            count += 1
    f.close()
matched = 0
log = []
dh_dict = {}
rashi_comments = {}
prev_line = 0

title_book = "Avodah Zarah"
title_comm = "Tosafot on Avodah Zarah" 

for j in range(2): 
	i = j+
	count = 0
	rashi_comments[i+3] = []
	dh_dict[i+3] = []
	he_daf = u"עבודה_זרה_"
	he_daf += AddressTalmud.toStr("he", i+3)
	he_daf = he_daf.replace(u"\u05f4", u"")
	he_daf = he_daf.replace(u"׳", u"")
	he_daf = he_daf.replace(" ", "_")
	he_daf = he_daf + ".txt"
	f = open("../Noah-Santacruz-rashiPosting/Tosafot/"+he_daf, 'r')
	for line in f:
		line = line.replace("\n", "")
		something = line.replace(" ", "")
		if len(something) > 0:
			if count % 2 == 0:
				dh_dict[i+3].append(line)
			else:
				rashi_comments[i+3].append(line)
			count+=1
	f.close()		
		print 'line did not start with 11'

match_obj=Match(in_order=False, min_ratio=80, guess=False, range=False)

last_daf = max(comm_dict.keys())
param = "off"
for daf in comm_dict:
	if daf==last_daf:
		param = "on"
	send_text = {
				"versionTitle": "Maharam Shif on "+masechet,
				"versionSource": "http://www.sefaria.org",
				"language": "he",
				"text": comm_dict[daf],
				}
	post_text("Maharam Shif on "+masechet+"."+AddressTalmud.toStr("en", daf), send_text, param)


for category in categories:
  if category=='paragraph':
  	continue
  elif category=='gemara':
  	title = masechet
  elif category=='rashi':
  	title = "Rashi on "+masechet
  elif category=='tosafot':
  	title = "Tosafot on "+masechet
  	
  for daf in dh_dict[category]:
  	dh_arr = dh_dict[category][daf]
  	text = compileCommentaryIntoPage(title, daf)
		pdb.set_trace()

# Post the full Shita Mekubetzet text, then match each daf's dhs against
# the gemara and queue links.
last_daf = max(comm_dict.keys())
param = "off"
text_to_post = convertDictToArray(comm_dict)
send_text = {
			"versionTitle": "Shita Mekubetzet on "+masechet,
			"versionSource": "http://www.sefaria.org",
			"language": "he",
			"text": text_to_post,
			}
post_text("Shita Mekubetzet on "+masechet, send_text, "on")

links_to_post = []
for daf in dh_dict:
	text = get_text(masechet+"."+AddressTalmud.toStr("en", daf))
	match_obj=Match(in_order=True, min_ratio=85, guess=False, range=True)
	dh_arr = dh_dict[daf]
	result = match_obj.match_list(dh_arr, text, masechet+" "+AddressTalmud.toStr("en", daf))
	for key in result:
		line_n = result[key]
		# Strip the "0:" no-match prefix from the matched line/range.
		line_n = line_n.replace("0:","")
		links_to_post.append({
				"refs": [
						 masechet+"."+AddressTalmud.toStr("en", daf)+"."+line_n, 
						"Shita Mekubetzet on "+masechet+"."+AddressTalmud.toStr("en", daf)+"."+str(key)
					],
				"type": "commentary",
				"auto": True,
				"generated_by": "Shita on "+masechet+" linker",
			 })
示例#57
0
                comm_dict[daf] = []
            comm_dict[daf].append(comm)
            if just_added_dh == False:
                dh_dict[daf].append("")
            if hasTags(comm) or hasTags(dh) or hasTags(before_dh):
                pdb.set_trace()
        just_added_dh = False
        prev_line = line
result = {}
guess = 0
no_guess = 0
for daf in dh_dict.keys():
    if len(dh_dict[daf]) != len(comm_dict[daf]):
        pdb.set_trace()
for daf in dh_dict.keys():
    text = get_text("Niddah." + AddressTalmud.toStr("en", daf))
    try:
        match_obj = Match(in_order=True,
                          min_ratio=70,
                          guess=False,
                          range=True,
                          maxLine=len(text) - 1)
    except:
        pdb.set_trace()
    dh_arr = []
    for i in range(len(dh_dict[daf])):
        if len(dh_dict[daf][i]) > 0:
            dh_arr.append(dh_dict[daf][i])
    result[daf] = match_obj.match_list(dh_arr, text)
    dh_count = 1
    '''
示例#58
0
    def postLinks(self, masechet):
        def base_tokenizer(str):
            str = re.sub(ur"\([^\(\)]+\)", u"", str)
            word_list = re.split(ur"\s+", str)
            word_list = [w for w in word_list if w]  # remove empty strings
            return word_list

        rosh_results = []
        perek_key = {}
        for perek in sorted(self.dh_by_perek.keys()):
            tuples = filter(lambda x: x[0] is 'rosh', self.dh_by_perek[perek])
            if len(tuples) > 0:
                cats, dhs, dappim = zip(*tuples)
                #for each daf and dh pair, that's the key to get the perek
                for daf, dh in zip(list(dappim), list(dhs)):
                    perek_key[(daf, dh)] = perek
                base = Ref("Rosh on {} {}".format(masechet, perek)).text('he')
                assert len(base.text) > 0
                these_results = match_ref(
                    base,
                    list(dhs),
                    base_tokenizer,
                    dh_extract_method=self.dh_extract_method,
                    verbose=False,
                    with_num_abbrevs=False)['matches']
                assert len(tuples) is len(these_results)
                rosh_results.append(these_results)

        results = {}
        comments = {}

        for daf in sorted(self.dh1_dict.keys()):
            comments[daf] = {}
            results[daf] = {}
            for each_cat in self.categories:
                if each_cat == 'rosh':
                    continue
                comments[daf][each_cat] = self.dh_by_cat[each_cat][daf]
            for each_type in comments[daf]:
                if each_type == 'rosh':
                    continue
                results[daf][each_type] = []
                if len(comments[daf][each_type]) > 0:
                    base = self.getTC(each_type, daf, masechet)
                    if len(base.text) == 0:
                        self.comm_wout_base.write("{} {}: {}\n".format(
                            masechet, daf, each_type))
                        base = self.getTC(each_type, daf - 1, masechet)
                        combined_comments = comments[
                            daf - 1][each_type] + comments[daf][each_type]
                        if len(base.text) == 0:
                            print "Problem in {}".format(
                                AddressTalmud.toStr("en", daf))
                        else:
                            results[daf - 1][each_type] = match_ref(
                                base,
                                combined_comments,
                                base_tokenizer,
                                dh_extract_method=self.dh_extract_method,
                                verbose=False,
                                with_num_abbrevs=False)
                            results[daf -
                                    1][each_type] = self.convertToOldFormat(
                                        results[daf - 1][each_type])
                        self.dh1_dict[daf] = [
                            x for x in self.dh1_dict[daf] if x[0] != each_type
                        ]
                    else:
                        results[daf][each_type] = match_ref(
                            base,
                            comments[daf][each_type],
                            base_tokenizer,
                            dh_extract_method=self.dh_extract_method,
                            verbose=False,
                            with_num_abbrevs=False)
                        results[daf][each_type] = self.convertToOldFormat(
                            results[daf][each_type])

        prev_perek = 0
        for daf in sorted(self.dh1_dict.keys()):
            self.maharam_line = 0
            self.which_line = {
                "rashi": -1,
                "tosafot": -1,
                "rosh": -1,
                "ran": -1,
                "gemara": -1,
                "rashbam": -1
            }
            for category, dh in self.dh1_dict[daf]:
                if category == 'gemara':
                    self.Gemara(daf, results[daf])
                elif category == 'rosh':
                    perek = perek_key[(daf, dh)]
                    if perek > prev_perek:
                        self.rosh_line = -1
                    prev_perek = perek
                    self.Rosh(perek, daf, dh, rosh_results)
                else:
                    self.Commentary(daf, category, results[daf])

        post_link(self.links_to_post, server=self.server)
        self.comm_wout_base.close()
				comm_dict[daf] = []
			comm_dict[daf].append(comm)
			if just_added_dh == False:
				dh_dict[daf].append("")
			if hasTags(comm) or hasTags(dh) or hasTags(before_dh):
				pdb.set_trace()
		just_added_dh = False
		prev_line = line
# Per-daf match results, filled in by the matching loop further below
# (result[daf] = match_obj.match_list(...)).
result = {}
# Outcome counters; initialized here, updated later in the script.
guess = 0
no_guess = 0
# Sanity check before matching: every daf's dh list must line up
# one-to-one with its comment list; drop into the debugger on mismatch.
for daf in dh_dict:
	if len(dh_dict[daf]) != len(comm_dict[daf]):
		pdb.set_trace()
for daf in dh_dict.keys():
	text = get_text("Niddah."+AddressTalmud.toStr("en", daf))
	try:
		match_obj=Match(in_order=True, min_ratio=70, guess=False, range=True, maxLine=len(text)-1)
	except:
		pdb.set_trace()
	dh_arr = []
	for i in range(len(dh_dict[daf])):
		if len(dh_dict[daf][i]) > 0:
			dh_arr.append(dh_dict[daf][i])
	result[daf] = match_obj.match_list(dh_arr, text)
	dh_count = 1
	'''
	if len(dh_dict[daf][i]) == 0, then comm_dict[daf][i] gets added to comm_dict[daf][i-1]+"<br>"
	'''
	for i in range(len(comm_dict[daf])):
		 if (daf, i) in before_dh_dict:
示例#60
0
		zohar_struct[vol_num].append([])
	first_line = True
	vol = open(vol_file, 'r')
	for line in vol:
		stray_tag = False
		blank_line = False
		no_spaces = line.replace(" ", "")
		no_return = no_spaces.replace("\n", "")
		if len(no_return)==0:
			blank_line = True
		if len(line.split(' '))==1 and (line.find('<b>')>=0 or line.find('</b>')>=0):
			stray_tag = True			
		if first_line == True:
			first_line = False
			if curr_parsha_file != "":
				curr_parsha_file.write('\n'+str(prev_vol+1)+":"+AddressTalmud.toStr("en", prev_daf+1)+":"+str(prev_para))
				curr_parsha_file.close()	
			if os.path.exists(english_parshiot[curr_parsha]) == True:
				os.remove(english_parshiot[curr_parsha])		
			curr_parsha_file = open(english_parshiot[curr_parsha], 'a')
			curr_parsha_file.write(str(vol_num+1)+":"+AddressTalmud.toStr("en", daf_count+2)+":1")  
			curr_parsha += 1
		elif blank_line==False and stray_tag==False:
			prev_prev_line = prev_line
			prev_line = current_line
			new_daf = line.find('דף')
			new_parsha = line.find('h1') #all parsha titles are surrounded by <h1> tags
			if new_daf >= 0 and len(line.split(' ')) < 6:  
				current_line = "daf"
				daf_count += 1
				zohar_struct[vol_num].append([])