예제 #1
0
def get_perakim(type, tag, tag_reg):
    """
    Parse the local text file of every Mishnah tractate into sections.

    For each index in the 'Mishnah' category the Hebrew book name is taken,
    the word for "Mishnah" in it is replaced by `type`, and the file
    '<name>.txt' is handed to a TagTester. Tractates whose file does not
    exist are skipped.

    :param type: identifies if this is Mishnah, yachin or boaz (it replaces
        the word "Mishnah" in the Hebrew tractate name to build the file name)
    :param tag: the tag to identify the start of a new perek
    :param tag_reg: regular expression for the tag
    :return: a dictionary, keys are the derived Hebrew names, values are the
        result of TagTester.grab_by_section() for that file
    """
    results = {}

    # get a list of all tractates
    for tractate in library.get_indexes_in_category('Mishnah'):
        name = Ref(tractate).he_book().replace(u'משנה', type)
        file_name = u'{}.txt'.format(name)

        # if file doesn't exist, skip
        if not os.path.isfile(file_name):
            continue

        # the context manager closes the file even when parsing raises
        with codecs.open(file_name, 'r', 'utf-8') as text_file:
            data_tag = TagTester(tag, text_file, tag_reg, name)
            results[name] = data_tag.grab_by_section()

    # the documented return value was previously never returned
    return results
예제 #2
0
def get_perakim(type, tag, tag_reg):
    """
    Parse the local text file of every Mishnah tractate into sections.

    Each index of the 'Mishnah' category is turned into a file name by taking
    its Hebrew book name, replacing the word for "Mishnah" with `type` and
    appending '.txt'. Existing files are read through a TagTester; missing
    files are silently skipped.

    :param type: identifies if this is Mishnah, yachin or boaz (it replaces
        the word "Mishnah" in the Hebrew tractate name to build the file name)
    :param tag: the tag to identify the start of a new perek
    :param tag_reg: regular expression for the tag
    :return: a dictionary, keys are the derived Hebrew names, values are the
        result of TagTester.grab_by_section() for that file
    """
    results = {}

    # get a list of all tractates
    for tractate in library.get_indexes_in_category('Mishnah'):
        name = Ref(tractate).he_book().replace(u'משנה', type)
        file_name = u'{}.txt'.format(name)

        # if file doesn't exist, skip
        if not os.path.isfile(file_name):
            continue

        # `with` guarantees the handle is released on every exit path
        with codecs.open(file_name, 'r', 'utf-8') as text_file:
            data_tag = TagTester(tag, text_file, tag_reg, name)
            results[name] = data_tag.grab_by_section()

    # the documented return value was previously never returned
    return results
예제 #3
0
def checkDappim(files):
    """
    Write a report of suspicious daf tags to 'daf_issues.txt'.

    For every entry of `files` the file '<entry>.txt' is scanned with a
    TagTester on the '@22' tag. A tag is flagged when its number is not
    strictly greater than the previous number (the comparison starts from 2).
    For each file the report lists the flagged tags and then every tag found.

    :param files: iterable of file names without the '.txt' extension
    """
    reg = u'@22\[?[\u05d0-\u05ea\s"]+\]?'

    def _strip_tags(tags, separator):
        # one report line: tags without newlines and '@22', each followed
        # by `separator` (the trailing separator is kept on purpose)
        return "".join(
            tag.replace("\n", "").replace("@22", "") + separator
            for tag in tags)

    # both files are now closed even when processing raises
    with open('daf_issues.txt', 'w') as errors:
        for file_name in files:
            print(file_name)
            errors.write("\n" + file_name + "\n")

            with open(file_name + ".txt") as open_file:
                tt = TagTester("@22", open_file, reg=reg)
                num_array, string_array = tt.daf_processor()

            flagged = []
            prev_value = 2
            for count, this_value in enumerate(num_array):
                if this_value - prev_value <= 0:
                    flagged.append(string_array[count])
                prev_value = this_value

            errors.write("Flagged mistakes: " + "\n")
            errors.write(_strip_tags(flagged, ",  ").encode('utf-8') + "\n")
            errors.write("All Dappim in this Masechet: " + "\n")
            errors.write(
                _strip_tags(string_array, ",   ").encode('utf-8') + "\n")
예제 #4
0
def compare_mishna_to_yachin(tractate_list):
    """
    Compare the '@44' tags of each Mishnah file with the '@11' tags of the
    matching Yachin file and append the findings to 'tag_match_up.txt'.

    File names are built from the Hebrew book name of each tractate. When
    either file cannot be opened a 'missing file' line is written and the
    tractate is skipped.

    :param tractate_list: iterable of tractate names accepted by Ref()
    """
    seg_tag = u'@00(?:פרק |פ)([א-ת,"]{1,3})'

    for tractate in tractate_list:
        name = Ref(tractate).he_book()
        m_name = name.replace(u'משנה', u'משניות')
        y_name = name.replace(u'משנה', u'יכין')

        # the report is (re)opened in append mode per tractate, as before,
        # but is now closed on the 'missing file' path as well
        with codecs.open('tag_match_up.txt', 'a', 'utf-8') as output:
            m_file = None
            try:
                m_file = codecs.open(u'{}.txt'.format(m_name), 'r', 'utf-8')
                y_file = codecs.open(u'{}.txt'.format(y_name), 'r', 'utf-8')
            except IOError:
                # do not leak the first file when only the second is missing
                if m_file is not None:
                    m_file.close()
                output.write(u'missing file {}\n'.format(name))
                continue

            with m_file, y_file:
                m_tag = TagTester(u'@44', m_file, name=m_name)
                y_tag = TagTester(u'@11', y_file, name=y_name)
                m_tag.segment_tag = seg_tag
                y_tag.segment_tag = seg_tag

                compare_tags_to_comments(m_tag, y_tag, output)
예제 #5
0
def check_segments():
    """
    Print the places where the '@44' numbering inside a section is broken.

    The file named by `filename` (taken from the enclosing scope) is read
    twice through TagTester objects: once for all '@30' headers and once,
    section by section, for the '@44' tags. Within each section the gematria
    value of every tag is expected to count up from 1; at each break the
    section header and the expected Hebrew numeral are printed.
    """
    header_reg = u'@30מצוה ([\u05d0-\u05ea"]{1,5})'
    segments = []

    # the file is closed even when one of the TagTester calls raises
    with codecs.open(filename, 'r', 'utf-8') as infile:
        headers = TagTester(u'@30', infile, header_reg).grab_each_header()
        tester = TagTester(u'@44', infile, u'@44\(([\u05d0-\u05ea]{1,2})\)')

        while not tester.eof:
            segments.append(tester.grab_each_header(header_reg, 1))

    for sec_number, section in enumerate(segments):
        index = 1
        for title in section:
            count = util.getGematria(title.replace(u'"', u''))

            if count != index:
                # NOTE(review): section i is reported under header i - 1, so
                # section 0 wraps around to the last header - presumably
                # section 0 is the text before the first '@30'; confirm.
                print(headers[sec_number - 1])
                print(util.numToHeb(index))
                # resynchronise on the value actually found
                index = count
            index += 1
예제 #6
0
def checkDappim(files):
    """
    Write a report of suspicious daf tags to 'daf_issues.txt'.

    For every entry of `files` the file '<entry>.txt' is scanned with a
    TagTester on the '@22' tag. A tag is flagged when its number is not
    strictly greater than the previous number (the comparison starts from 2).
    For each file the report lists the flagged tags and then every tag found.

    :param files: iterable of file names without the '.txt' extension
    """
    reg = u'@22\[?[\u05d0-\u05ea\s"]+\]?'

    def _report_line(tags, separator):
        # tags without newlines and '@22', each followed by `separator`
        # (the trailing separator is kept on purpose)
        return "".join(
            tag.replace("\n", "").replace("@22", "") + separator
            for tag in tags)

    # both files are now closed even when processing raises
    with open('daf_issues.txt', 'w') as errors:
        for file_name in files:
            print(file_name)
            errors.write("\n" + file_name + "\n")

            with open(file_name + ".txt") as open_file:
                tt = TagTester("@22", open_file, reg=reg)
                num_array, string_array = tt.daf_processor()

            flagged = []
            prev_value = 2
            for count, this_value in enumerate(num_array):
                if this_value - prev_value <= 0:
                    flagged.append(string_array[count])
                prev_value = this_value

            errors.write("Flagged mistakes: " + "\n")
            errors.write(_report_line(flagged, ",  ").encode('utf-8') + "\n")
            errors.write("All Dappim in this Masechet: " + "\n")
            errors.write(
                _report_line(string_array, ",   ").encode('utf-8') + "\n")
예제 #7
0
def test_insert_chapters(filename, expected):
    """
    Tell whether `filename` holds exactly `expected` matching '@22' headers.

    :param filename: path of the utf-8 text file to scan
    :param expected: number of headers grab_each_header() should return
    :return: True when the number of headers equals `expected`, else False
    """
    with codecs.open(filename, 'r', 'utf-8') as infile:
        found = TagTester(u'@22', infile, u'^@22\u05d0( |$)').grab_each_header()
        return len(found) == expected
예제 #8
0
def test_insert_chapters(filename, expected):
    """
    Compare the number of matching '@22' headers in a file with `expected`.

    :param filename: path of the utf-8 text file to scan
    :param expected: number of headers grab_each_header() should return
    :return: True on an exact match of the counts, False otherwise
    """
    with codecs.open(filename, 'r', 'utf-8') as infile:
        tester = TagTester(u'@22', infile, u'^@22\u05d0( |$)')
        header_count = len(tester.grab_each_header())
        return header_count == expected
예제 #9
0
def check_segments():
    """
    Print the places where the '@44' numbering inside a section is broken.

    The file named by `filename` (taken from the enclosing scope) is read
    twice through TagTester objects: once for all '@30' headers and once,
    section by section, for the '@44' tags. Within each section the gematria
    value of every tag is expected to count up from 1; at each break the
    section header and the expected Hebrew numeral are printed.
    """
    header_reg = u'@30מצוה ([\u05d0-\u05ea"]{1,5})'
    segments = []

    # the file is closed even when one of the TagTester calls raises
    with codecs.open(filename, 'r', 'utf-8') as infile:
        headers = TagTester(u'@30', infile, header_reg).grab_each_header()
        tester = TagTester(u'@44', infile, u'@44\(([\u05d0-\u05ea]{1,2})\)')

        while not tester.eof:
            segments.append(tester.grab_each_header(header_reg, 1))

    for sec_number, section in enumerate(segments):
        index = 1
        for title in section:
            count = util.getGematria(title.replace(u'"', u''))

            if count != index:
                # NOTE(review): section i is reported under header i - 1, so
                # section 0 wraps around to the last header - presumably
                # section 0 is the text before the first '@30'; confirm.
                print(headers[sec_number - 1])
                print(util.numToHeb(index))
                # resynchronise on the value actually found
                index = count
            index += 1
예제 #10
0
def checkPerakim():
    """
    Run the perek order check on the hard-coded file "Yoma".

    The file name is printed, then a TagTester for the '@00' tag runs
    in_order_one_section(1, perek_checker) on it. The result of that call is
    discarded.
    """
    reg = u'(?:@00\u05e4\u05e8\u05e7 |@00\u05e4")([\u05d0-\u05ea]+)'
    for file_name in ["Yoma"]:
        print(file_name)
        # the file used to stay open; `with` closes it after the check
        with open(file_name) as open_file:
            tt = TagTester("@00", open_file, reg=reg)
            tt.in_order_one_section(1, perek_checker)
예제 #11
0
def checkPerakim():
    """
    Run the perek order check on every '*.txt' file in the current directory
    whose name does not contain "intro".

    Each file name is printed before a TagTester for the '@00' tag runs
    in_order_one_section(1) on it. The result of that call is discarded.
    """
    reg = u'(?:@00\u05e4\u05e8\u05e7 |@00\u05e4")([\u05d0-\u05ea]+)'
    for file_name in glob.glob(u"*.txt"):
        if "intro" in file_name:
            continue
        print(file_name)
        # the file used to stay open; `with` closes it after the check
        with open(file_name) as open_file:
            # NOTE(review): perek_checker is passed as the third positional
            # argument here; confirm this matches the TagTester signature
            # in use.
            tt = TagTester("@00", open_file, perek_checker, reg=reg)
            tt.in_order_one_section(1)
예제 #12
0
def checkPerakim():
    """
    Run the perek order check on the hard-coded file "Yoma".

    The file name is printed, then a TagTester for the '@00' tag runs
    in_order_one_section(1, perek_checker) on it. The result of that call is
    discarded.
    """
    reg = u'(?:@00\u05e4\u05e8\u05e7 |@00\u05e4")([\u05d0-\u05ea]+)'
    for file_name in ["Yoma"]:
        print(file_name)
        # release the handle as soon as the check is done
        with open(file_name) as open_file:
            tt = TagTester("@00", open_file, reg=reg)
            tt.in_order_one_section(1, perek_checker)
예제 #13
0
def checkMishnayot():
    """
    Run the '@22' order check on every '*.txt' file in the current directory
    whose name does not contain "intro".

    Each file name is printed before a TagTester for the '@22' tag runs
    in_order_many_sections(end_tag="@00") on it. The result of that call is
    discarded.
    """
    reg = u'@22.*?[\u05d0-\u05ea]+.*?'
    for file_name in glob.glob(u"*.txt"):
        if "intro" in file_name:
            continue
        print(file_name)
        # the file used to stay open; `with` closes it after the check
        with open(file_name) as open_file:
            # NOTE(review): perek_checker is passed as the third positional
            # argument here; confirm this matches the TagTester signature
            # in use.
            tt = TagTester("@22", open_file, perek_checker, reg=reg)
            tt.in_order_many_sections(end_tag="@00")
예제 #14
0
def checkPerakim(files):
    """
    Run the perek order check on '<name>.txt' for every name in `files`
    except "Berakhot", printing each name and the result of the check.

    :param files: iterable of file names without the '.txt' extension
    """
    reg = u'(?:@00\u05e4\u05e8\u05e7 |@00\u05e4")([\u05d0-\u05ea]+)'
    for file_name in files:
        if file_name == "Berakhot":
            continue
        print(file_name)
        # the file used to stay open; `with` closes it after the check
        with open(file_name + ".txt") as open_file:
            tt = TagTester("@00", open_file, reg=reg)
            result = tt.in_order_one_section(1, perek_checker)
        print(result)
예제 #15
0
def checkPerakim():
    """
    Run the perek order check on every '*.txt' file in the current directory
    whose name does not contain "intro".

    Each file name is printed. When the first element of the check result is
    not "SUCCESS" the debugger is entered via pdb.set_trace().
    """
    reg = u'(?:@00\u05e4\u05e8\u05e7 |@00\u05e4")([\u05d0-\u05ea]+)'
    for file_name in glob.glob(u"*.txt"):
        if "intro" in file_name:
            continue
        print(file_name)
        # `with` closes the file; the debugger hook stays inside the block
        # so the open file can still be inspected from the pdb session
        with open(file_name) as open_file:
            tt = TagTester("@00", open_file, reg=reg)
            result = tt.in_order_one_section(1)
            if result[0] != "SUCCESS":
                pdb.set_trace()
예제 #16
0
def checkPerakim(files):
    """
    Run the perek order check on '<name>.txt' for every name in `files`
    except "Berakhot", printing each name and the result of the check.

    :param files: iterable of file names without the '.txt' extension
    """
    reg = u'(?:@00\u05e4\u05e8\u05e7 |@00\u05e4")([\u05d0-\u05ea]+)'
    for file_name in files:
        if file_name == "Berakhot":
            continue
        print(file_name)
        # release the handle as soon as the check is done
        with open(file_name + ".txt") as open_file:
            tt = TagTester("@00", open_file, reg=reg)
            result = tt.in_order_one_section(1, perek_checker)
        print(result)
예제 #17
0
def checkPerakim():
    """
    Run the perek order check on every '*.txt' file in the current directory
    whose name does not contain "intro".

    Each file name is printed. When the first element of the check result is
    not "SUCCESS" the debugger is entered via pdb.set_trace().
    """
    reg = u'(?:@00\u05e4\u05e8\u05e7 |@00\u05e4")([\u05d0-\u05ea]+)'
    for file_name in glob.glob(u"*.txt"):
        if "intro" in file_name:
            continue
        print(file_name)
        # `with` closes the file; the debugger hook stays inside the block
        # so the open file can still be inspected from the pdb session
        with open(file_name) as open_file:
            tt = TagTester("@00", open_file, reg=reg)
            result = tt.in_order_one_section(1)
            if result[0] != "SUCCESS":
                pdb.set_trace()
예제 #18
0
def checkMishnayot():
    """
    Run the '@22' order check on every '*.txt' file in the current directory
    whose name does not contain "intro".

    Each file name is printed. When the first element of the result of
    in_order_many_sections(end_tag="@00") is not "SUCCESS" the debugger is
    entered via pdb.set_trace().
    """
    reg = u'@22.*?[\u05d0-\u05ea]+.*?'
    for file_name in glob.glob(u"*.txt"):
        if "intro" in file_name:
            continue
        print(file_name)
        # `with` closes the file; the debugger hook stays inside the block
        # so the open file can still be inspected from the pdb session
        with open(file_name) as open_file:
            tt = TagTester("@22", open_file, reg=reg)
            result = tt.in_order_many_sections(end_tag="@00")
            if result[0] != "SUCCESS":
                pdb.set_trace()
예제 #19
0
def checkMishnayot():
    """
    Run the '@22' order check on every '*.txt' file in the current directory
    whose name does not contain "intro".

    Each file name is printed. When the first element of the result of
    in_order_many_sections(end_tag="@00") is not "SUCCESS" the debugger is
    entered via pdb.set_trace().
    """
    reg = u'@22.*?[\u05d0-\u05ea]+.*?'
    for file_name in glob.glob(u"*.txt"):
        if "intro" in file_name:
            continue
        print(file_name)
        # `with` closes the file; the debugger hook stays inside the block
        # so the open file can still be inspected from the pdb session
        with open(file_name) as open_file:
            tt = TagTester("@22", open_file, reg=reg)
            result = tt.in_order_many_sections(end_tag="@00")
            if result[0] != "SUCCESS":
                pdb.set_trace()
예제 #20
0
def check_chapters():
    """
    Sort the cards from get_cards() by whether their chapter count is right.

    A card is 'good' when the number of '@00' chapter headers found in
    '<card>.txt' equals the number of subrefs of the Ref obtained by dropping
    'Rambam ' from the card name, or when the card is 'Rambam Pirkei Avot'.

    :return: dict with the keys 'good' and 'bad', each holding a list of cards
    """
    report = {'good': [], 'bad': []}
    for card in get_cards():
        m_ref = Ref(card.replace('Rambam ', ''))
        with codecs.open('{}.txt'.format(card), 'r', 'utf-8') as infile:
            tags = TagTester(
                u'@00', infile, u'@00\u05e4\u05e8\u05e7').grab_each_header()

        chapters_match = (len(tags) == len(m_ref.all_subrefs())
                          or card == 'Rambam Pirkei Avot')
        report['good' if chapters_match else 'bad'].append(card)
    return report
예제 #21
0
def check_chapters():
    """
    Print the expected Hebrew numeral wherever the '@30' headers of
    'Minchat_Chinuch.txt' do not count up consecutively from 1.

    After every header, matching or not, the next expected value is one more
    than the value just read.
    """
    with codecs.open('Minchat_Chinuch.txt', 'r', 'utf-8') as chinuch:
        test = TagTester(u'@30', chinuch, u'@30מצוה ([\u05d0-\u05ea"]{1,5})')

        expected = 1
        for header in test.grab_each_header(capture_group=1):
            value = util.getGematria(header.replace(u'"', u''))
            if value != expected:
                print(util.numToHeb(expected))
            # continue counting from the value actually found
            expected = value + 1
예제 #22
0
def check_chapters():
    """
    Report gaps in the '@30' header numbering of 'Minchat_Chinuch.txt'.

    The gematria value of each captured header is compared with a running
    counter that starts at 1; on a mismatch the counter is printed as a
    Hebrew numeral. The counter then continues from the value just read.
    """
    with codecs.open('Minchat_Chinuch.txt', 'r', 'utf-8') as chinuch:
        test = TagTester(u'@30', chinuch, u'@30מצוה ([\u05d0-\u05ea"]{1,5})')
        captured = test.grab_each_header(capture_group=1)

        next_number = 1
        for raw_header in captured:
            number = util.getGematria(raw_header.replace(u'"', u''))
            if number != next_number:
                print(util.numToHeb(next_number))
            next_number = number + 1
예제 #23
0
def check_chapters():
    """
    Split the cards from get_cards() into those with the expected number of
    chapters and those without.

    The '@00' chapter headers of '<card>.txt' are counted and compared with
    the number of subrefs of Ref(<card without 'Rambam '>). The card
    'Rambam Pirkei Avot' is accepted regardless of the counts.

    :return: dict with the keys 'good' and 'bad', each holding a list of cards
    """
    good_files = []
    bad_files = []
    for card in get_cards():
        m_ref = Ref(card.replace('Rambam ', ''))
        with codecs.open('{}.txt'.format(card), 'r', 'utf-8') as infile:
            tags = TagTester(
                u'@00', infile, u'@00\u05e4\u05e8\u05e7').grab_each_header()

        is_good = (len(tags) == len(m_ref.all_subrefs())
                   or card == 'Rambam Pirkei Avot')
        target = good_files if is_good else bad_files
        target.append(card)
    return {'good': good_files, 'bad': bad_files}
예제 #24
0
def check_tags_on_category(category, tag, tag_regex, check_function):
    """
    Check that all the tags in category run in order

    Results are written to '<category>_tags.txt'. For each entry of the
    `tractates` sequence from the enclosing scope, the matching text file is
    split by perek and every perek is passed to `check_function`. A file that
    cannot be opened is reported and skipped; a file without any failing
    perek gets a single "no problems" line.

    :param category: משניות, יכין or whatever is needed to identify the files
    :param tag: the tag handed to TagTester
    :param tag_regex: regular expression handed to TagTester for the tag
    :param check_function: called as check_function(perek, message, output);
        a falsy return value marks the book as not perfect
    """
    seg_reg = u'@00(?:פרק |פ)([א-ת,"]{1,3})'

    # `with` closes the report even when a check raises
    with codecs.open(u'{}_tags.txt'.format(category), 'w', 'utf-8') as output:
        for tractate in tractates:
            name = Ref(tractate).he_book().replace(u'משנה', category)
            try:
                in_file = codecs.open(u'{}.txt'.format(name), 'r', 'utf-8')
            except IOError:
                output.write(u'{}.txt does not exist\n'.format(name))
                continue

            # the input file was previously never closed
            with in_file:
                # create TagTester object for each file
                tag_object = TagTester(tag, in_file, tag_regex, name)

                # get tags in array
                whole_book = get_tags_by_perek(tag_object, seg_reg, 1)

                # every perek is checked (no short-circuit) so that each
                # one gets the chance to write its own messages
                perfect = True
                for index, perek in enumerate(whole_book):
                    message = u'{} פרק {}'.format(name, index + 1)
                    if not check_function(perek, message, output):
                        perfect = False

            if perfect:
                output.write(u'{}-אין בעיות\n'.format(name))
예제 #25
0
def tag_matches_regex(exact_tag, expression, output_file_name):
    """
    Boaz tags are all over the place. Given a tag, make sure all appearances
    of a tag can be grabbed by regular expression.

    For every index in the 'Mishnah' category the matching Boaz file is read
    with a TagTester. The number of issues written for a file is
    `tester.appearances` minus the sum of the counts in `tester.types`.
    Missing files are reported and skipped.

    :param exact_tag: Exact string defining a tag (e.g. @00)
    :param expression: Regular expression with which to grab the tag
    :param output_file_name: file to write results
    """
    # both files are now closed even when processing raises
    with codecs.open(output_file_name, 'w', 'utf-8') as results:
        for book in library.get_indexes_in_category('Mishnah'):
            name = Ref(book).he_book().replace(u'משנה', u'בועז')
            file_name = u'{}.txt'.format(name)

            if not os.path.isfile(file_name):
                results.write(u'missing boaz {}\n'.format(book))
                continue

            with codecs.open(file_name, 'r', 'utf-8') as input_file:
                tester = TagTester(exact_tag, input_file, expression)
                matched = sum(tester.types.values())
                results.write(u'{} found {} issues\n'.format(
                    name, tester.appearances - matched))
예제 #26
0
def get_TYT_perek_lengths():
    """
    Count the perakim found in every '*.txt' file of the current directory
    whose name does not contain "intro".

    The key for a file is "Pirkei Avot" when its name contains "avot",
    otherwise "Mishnah " plus the title-cased file name without '.txt';
    underscores in the key are replaced by spaces.

    Files whose check does not report "SUCCESS" are left out of the result.
    Previously such a file either raised NameError (first file) or silently
    reused the length computed for the preceding file.

    :return: dict mapping the derived name to the number of perakim found
    """
    TYT_lengths = {}
    reg = u'(?:@00\u05e4\u05e8\u05e7 |@00\u05e4")([\u05d0-\u05ea]+)'

    for file_name in glob.glob(u"*.txt"):
        if "intro" in file_name:
            continue

        # the file used to stay open; `with` closes it after the check
        with open(file_name) as open_file:
            tt = TagTester("@00", open_file, reg=reg)
            TYT_perakim = tt.in_order_one_section(1)

        if TYT_perakim[0] != "SUCCESS":
            # no trustworthy length for this file - skip it
            continue

        if "avot" in file_name:
            mishnah_name = "Pirkei Avot"
        else:
            mishnah_name = "Mishnah " + file_name.replace(".txt", "").title()
        mishnah_name = mishnah_name.replace("_", " ")
        TYT_lengths[mishnah_name] = len(TYT_perakim[1])

    return TYT_lengths
예제 #27
0
def get_TYT_perek_lengths():
    """
    Count the perakim found in every '*.txt' file of the current directory
    whose name does not contain "intro".

    The key for a file is "Pirkei Avot" when its name contains "avot",
    otherwise "Mishnah " plus the title-cased file name without '.txt';
    underscores in the key are replaced by spaces.

    Files whose check does not report "SUCCESS" are left out of the result.
    Previously such a file either raised NameError (first file) or silently
    reused the length computed for the preceding file.

    Indentation is normalised to spaces; the original mixed tabs and spaces
    inside the if/else, which Python 3 rejects.

    :return: dict mapping the derived name to the number of perakim found
    """
    TYT_lengths = {}
    reg = u'(?:@00\u05e4\u05e8\u05e7 |@00\u05e4")([\u05d0-\u05ea]+)'

    for file_name in glob.glob(u"*.txt"):
        if "intro" in file_name:
            continue

        # the file used to stay open; `with` closes it after the check
        with open(file_name) as open_file:
            tt = TagTester("@00", open_file, reg=reg)
            TYT_perakim = tt.in_order_one_section(1)

        if TYT_perakim[0] != "SUCCESS":
            # no trustworthy length for this file - skip it
            continue

        if "avot" in file_name:
            mishnah_name = "Pirkei Avot"
        else:
            mishnah_name = "Mishnah " + file_name.replace(".txt", "").title()
        mishnah_name = mishnah_name.replace("_", " ")
        TYT_lengths[mishnah_name] = len(TYT_perakim[1])

    return TYT_lengths
예제 #28
0
def check_mishnayot():
    """
    Run the '@22' order check on '<card>.txt' for every card from
    get_cards() and print a summary.

    A failing card is printed immediately, together with the length of the
    second element of its result. At the end the numbers of successes,
    failures and cards are printed, followed by the failing cards.
    """
    cards = get_cards()
    success, failure = [], []
    for card in cards:
        with codecs.open('{}.txt'.format(card), 'r', 'utf-8') as infile:
            tester = TagTester(u'@22', infile, u'@22([\u05d0-\u05ea]{1,2})')
            result = tester.in_order_many_sections(end_tag=u'@00',
                                                   capture_group=1)
        if result[0] == 'SUCCESS':
            success.append(card)
        else:
            # failures were never recorded before, so the summary always
            # claimed zero failures and the final listing was empty
            failure.append(card)
            print('failure: {}'.format(card))
            print(len(result[1]))

    print('successes: {}'.format(len(success)))
    print('failures: {}'.format(len(failure)))
    print('total: {}'.format(len(cards)))
    for item in failure:
        print(item)
예제 #29
0
def check_mishnayot():
    """
    Run the '@22' order check on '<card>.txt' for every card from
    get_cards() and print a summary.

    A failing card is printed immediately, together with the length of the
    second element of its result. At the end the numbers of successes,
    failures and cards are printed, followed by the failing cards.
    """
    cards = get_cards()
    success, failure = [], []
    for card in cards:
        with codecs.open('{}.txt'.format(card), 'r', 'utf-8') as infile:
            tester = TagTester(u'@22', infile, u'@22([\u05d0-\u05ea]{1,2})')
            result = tester.in_order_many_sections(end_tag=u'@00',
                                                   capture_group=1)
        if result[0] == 'SUCCESS':
            success.append(card)
        else:
            # failures were never recorded before, so the summary always
            # claimed zero failures and the final listing was empty
            failure.append(card)
            print('failure: {}'.format(card))
            print(len(result[1]))

    print('successes: {}'.format(len(success)))
    print('failures: {}'.format(len(failure)))
    print('total: {}'.format(len(cards)))
    for item in failure:
        print(item)
예제 #30
0
def compare_mishna_to_yachin(tractate_list):
    """
    Compare the '@44' tags of each Mishnah file with the '@11' tags of the
    matching Yachin file and append the findings to 'tag_match_up.txt'.

    File names are built from the Hebrew book name of each tractate. When
    either file cannot be opened a 'missing file' line is written and the
    tractate is skipped.

    :param tractate_list: iterable of tractate names accepted by Ref()
    """
    seg_tag = u'@00(?:פרק |פ)([א-ת,"]{1,3})'

    for tractate in tractate_list:
        name = Ref(tractate).he_book()
        m_name = name.replace(u'משנה', u'משניות')
        y_name = name.replace(u'משנה', u'יכין')

        # the report is (re)opened in append mode per tractate, as before,
        # but is now closed on the 'missing file' path as well
        with codecs.open('tag_match_up.txt', 'a', 'utf-8') as output:
            m_file = None
            try:
                m_file = codecs.open(u'{}.txt'.format(m_name), 'r', 'utf-8')
                y_file = codecs.open(u'{}.txt'.format(y_name), 'r', 'utf-8')
            except IOError:
                # do not leak the first file when only the second is missing
                if m_file is not None:
                    m_file.close()
                output.write(u'missing file {}\n'.format(name))
                continue

            with m_file, y_file:
                m_tag = TagTester(u'@44', m_file, name=m_name)
                y_tag = TagTester(u'@11', y_file, name=y_name)
                m_tag.segment_tag = seg_tag
                y_tag.segment_tag = seg_tag

                compare_tags_to_comments(m_tag, y_tag, output)
예제 #31
0
def get_num_TYTs_per_perek():
    """
    Collect the '@22' headers per perek for every '*.txt' file of the current
    directory whose name does not contain "intro".

    The key for a file is "Pirkei Avot" when its name contains "avot",
    otherwise "Mishnah " plus the title-cased file name without '.txt' and
    with underscores replaced by spaces. When the check does not report
    "SUCCESS" the debugger is entered via pdb.set_trace().

    Indentation is normalised to spaces; the original mixed tabs and spaces,
    which Python 3 rejects.

    :return: tuple (num_TYTs, actual_TYTs): the first maps each key to the
        list of per-perek header counts, the second to the headers themselves
    """
    num_TYTs = {}
    actual_TYTs = {}
    reg = u'@22.*?[\u05d0-\u05ea]+.*?'

    for file_name in glob.glob(u"*.txt"):
        if "intro" in file_name:
            continue

        # the file used to stay open; `with` closes it afterwards
        with open(file_name) as open_file:
            tt = TagTester("@22", open_file, reg=reg)
            headers = tt.in_order_many_sections(end_tag="@00")
            if headers[0] == "SUCCESS":
                headers = headers[1]
            else:
                # NOTE(review): after leaving the debugger `headers` is still
                # the whole result, not just its second element.
                pdb.set_trace()

        if "avot" in file_name:
            masechet = "Pirkei Avot"
        else:
            masechet = "Mishnah " + file_name.replace(
                ".txt", "").replace("_", " ").title()

        actual_TYTs[masechet] = headers
        num_TYTs[masechet] = [len(perek) for perek in headers]

    return num_TYTs, actual_TYTs
예제 #32
0
def tag_starts_line(tag, category):
    """
    Make sure a tag always begins a new line

    Every entry of the `tractates` sequence from the enclosing scope is
    mapped to a text file; the verdict of TagTester.does_start_line() is
    printed per file. Files that cannot be opened are reported and skipped.

    :param tag: regular expression with which to find tag
    :param category: Identifier for the files (i.e משניות, יכין etc.)
    """
    for tractate in tractates:
        name = Ref(tractate).he_book().replace(u'משנה', category)
        try:
            in_file = codecs.open(u'{}.txt'.format(name), 'r', 'utf-8')
        except IOError:
            print(u'cannot find {}'.format(name))
            continue

        # the input file was previously never closed
        with in_file:
            # instantiate TagTester
            tester = TagTester(tag, in_file)
            starts_line = tester.does_start_line()

        if starts_line:
            print(u'{} is okay!'.format(name))
        else:
            print(u'problem with {}'.format(name))
예제 #33
0
def tag_starts_line(tag, category):
    """
    Make sure a tag always begins a new line

    Every entry of the `tractates` sequence from the enclosing scope is
    mapped to a text file; the verdict of TagTester.does_start_line() is
    printed per file. Files that cannot be opened are reported and skipped.

    :param tag: regular expression with which to find tag
    :param category: Identifier for the files (i.e משניות, יכין etc.)
    """
    for tractate in tractates:
        name = Ref(tractate).he_book().replace(u'משנה', category)
        try:
            in_file = codecs.open(u'{}.txt'.format(name), 'r', 'utf-8')
        except IOError:
            print(u'cannot find {}'.format(name))
            continue

        # the input file was previously never closed
        with in_file:
            # instantiate TagTester
            tester = TagTester(tag, in_file)
            starts_line = tester.does_start_line()

        if starts_line:
            print(u'{} is okay!'.format(name))
        else:
            print(u'problem with {}'.format(name))
예제 #34
0
def get_num_TYTs_per_perek():
    """
    Collect the '@22' headers per perek for every '*.txt' file of the current
    directory whose name does not contain "intro".

    The key for a file is "Pirkei Avot" when its name contains "avot",
    otherwise "Mishnah " plus the title-cased file name without '.txt' and
    with underscores replaced by spaces. When the check does not report
    "SUCCESS" the debugger is entered via pdb.set_trace().

    :return: tuple (num_TYTs, actual_TYTs): the first maps each key to the
        list of per-perek header counts, the second to the headers themselves
    """
    num_TYTs = {}
    actual_TYTs = {}
    reg = u'@22.*?[\u05d0-\u05ea]+.*?'

    for file_name in glob.glob(u"*.txt"):
        if "intro" in file_name:
            continue

        # the file used to stay open; `with` closes it afterwards
        with open(file_name) as open_file:
            tt = TagTester("@22", open_file, reg=reg)
            headers = tt.in_order_many_sections(end_tag="@00")
            if headers[0] == "SUCCESS":
                headers = headers[1]
            else:
                # NOTE(review): after leaving the debugger `headers` is still
                # the whole result, not just its second element.
                pdb.set_trace()

        if "avot" in file_name:
            masechet = "Pirkei Avot"
        else:
            masechet = "Mishnah " + file_name.replace(
                ".txt", "").replace("_", " ").title()

        actual_TYTs[masechet] = headers
        num_TYTs[masechet] = [len(perek) for perek in headers]

    return num_TYTs, actual_TYTs
예제 #35
0
def check_chapters(category, chap_reg):
    """
    Write to 'chapters.txt' every tractate whose text file does not contain
    as many chapters as the library expects.

    Every entry of the `tractates` sequence from the enclosing scope is
    mapped to a text file. The number of items returned by
    get_tags_by_perek() is compared with the number of subrefs of the
    tractate's Ref. Files that cannot be opened are reported and skipped.

    :param category: replaces the word "Mishnah" in the Hebrew tractate name
        to build the file name
    :param chap_reg: regular expression handed to TagTester for the '@00' tag
    """
    # `with` closes the report even when processing raises
    with codecs.open('chapters.txt', 'w', 'utf-8') as output:
        for tractate in tractates:
            ref = Ref(tractate)
            name = ref.he_book().replace(u'משנה', category)
            try:
                in_file = codecs.open(u'{}.txt'.format(name), 'r', 'utf-8')
            except IOError:
                output.write(u'{}.txt does not exist\n'.format(name))
                continue

            # the input file was previously never closed
            with in_file:
                chap_tag = TagTester(u'@00', in_file, chap_reg, name)
                chapters = get_tags_by_perek(
                    chap_tag, chap_tag.reg, capture_group=1)
                chapter_count = len(chapters)

            if chapter_count != len(ref.all_subrefs()):
                output.write(u'Chapter mismatch {}\n'.format(tractate))
예제 #36
0
def test_accercy(tag, filename):
    """
    Print the `appearances` attribute of a TagTester built for `tag` on the
    utf-8 file `filename`.

    :param tag: tag handed to TagTester
    :param filename: path of the file to read
    """
    with codecs.open(filename, 'r', 'utf-8') as fp:
        print(TagTester(tag, fp).appearances)