Exemplo n.º 1
0
def test_file_to_ja():
    """Two @22-delimited sections should parse into a 2-level jagged array."""
    raw = StringIO('@22\nfoo\nbar\n@22\nhello\nworld')
    strip_trailing = lambda lines: [line.rstrip() for line in lines]
    ja = util.file_to_ja(2, raw, ['@22'], strip_trailing)
    expected = [['foo', 'bar'], ['hello', 'world']]
    assert ja.array() == expected
Exemplo n.º 2
0
def parse_raph(filename, smk_ja):
    '''

    :param filename: raph source txt file
    :param smk_ja: JA obj smk parsed [siman,segment]
    :return: JA obj parsed [siman, letter] some simanim will be empty
    '''

    def cleaner(my_text):
        replace_dict = {u'@(?:11|77)[\u05d0-\u05ea]{0,3}': u'',
                        u'@(33|22)': u''}
        new = []
        for line in my_text:
            line = multiple_replace(line, replace_dict, using_regex=True)
            new.append(line)
        return new

    regs = [ur'@11(?P<gim>[\u05d0-\u05ea]{1,3})']
    with codecs.open(filename, 'r', 'utf-8') as fp:
        lines = fp.readlines()
    starting = None
    # check if we got to the end of the legend and change to started
    # clean all lines of days start with @00
    cleaned = []
    for line_num, line in enumerate(lines):
        if line == u'\n':
            starting = line_num + 1
            break
    for line_num, line in enumerate(lines[starting:]):
        if not re.search(u'@00', line) and not line.isspace():
            line = re.split(u'(@11[\u05d0-\u05ea]{0,3})', line)
            if isinstance(line, basestring):
                cleaned.append(line)
            else:
                [cleaned.append(st.strip()) for st in line if st]
    try:
        ja = file_to_ja(2, cleaned, regs, cleaner, grab_all=False).array()
    except AttributeError:
        print 'there are more regs then levels...'

    ja_to_xml(ja, ['letter', 'segments'], 'raph_letters.xml')

    d1 = 0
    aligned = []
    siman = []
    segment = []
    for letter in smk_ja.array():
        for seg in letter:
            for ff in re.finditer(u'@55[\u05d0-\u05ea]{0,3}', seg):
                # segment.append(ja[d1])
                siman.append(ja[d1])
                d1 += 1
            if segment != []:
                siman.extend(segment) #rather then append
                # segment = []
        aligned.append(siman)
        siman = []
    ja_to_xml(aligned, ['siman', 'letter', 'segment'], 'raph_simanim_24.xml')
    return JaggedArray(aligned)
Exemplo n.º 3
0
def parse_yitzira():
    """Parse yitzira_mishna.txt into a 2-level jagged array (chapter, mishna)."""

    def cleaner(my_text):
        # keep only @11 lines, strip every @NN control code, drop empties
        stripped = [re.sub(u'@[0-9]{2}', u'', line)
                    for line in my_text if re.search(u'@11', line)]
        return filter(None, stripped)

    with codecs.open('yitzira_mishna.txt', 'r', 'utf-8') as infile:
        return file_to_ja(2, infile, [u'@00\u05e4\u05e8\u05e7 [\u05d0-\u05ea]{1,2}'], cleaner).array()
Exemplo n.º 4
0
def parse_body():
    """Parse the first part of the text (up to line 1795) into a 3-level JA."""

    def cleaner(section):
        # strip all html tags from each segment, then discard empty segments
        stripped = [bleach.clean(segment, tags=[], strip=True) for segment in section]
        return filter(lambda seg: None if len(seg) == 0 else seg, stripped)

    my_text = get_text().splitlines()[:1795]  # A new part begins at this line
    expressions = [u'<b>\u05d4?\u05de\u05e2\u05d9\u05d9?\u05df.*:</b>',
                   u'\u05de\u05e2\u05d9\u05df ([\u05d0-\u05ea]{1,2}) - \u05e0\u05d4\u05e8 (- )?([\u05d0-\u05ea]{1,2})']
    return file_to_ja(3, my_text, expressions, cleaner).array()
Exemplo n.º 5
0
def parse_yitzira():
    """Parse yitzira_mishna.txt: chapters split on the @00 header pattern."""

    def cleaner(my_text):
        result = []
        for line in my_text:
            if not re.search(u'@11', line):
                continue
            bare = re.sub(u'@[0-9]{2}', u'', line)
            if bare:
                result.append(bare)
        return result

    chapter_reg = u'@00\u05e4\u05e8\u05e7 [\u05d0-\u05ea]{1,2}'
    with codecs.open('yitzira_mishna.txt', 'r', 'utf-8') as infile:
        return file_to_ja(2, infile, [chapter_reg], cleaner).array()
Exemplo n.º 6
0
def parse_general(filename):
    """Generic 3-level parse: chapters split on @00, sections on @22 letters."""

    def cleaner(my_text):
        # @31/@32 become bold tags; every remaining @NN control code is dropped
        bold_tags = {u'@31': u'<b>', u'@32': u'</b>'}
        out = []
        for raw in my_text:
            converted = multiple_replace(raw, bold_tags)
            out.append(re.sub(u'@[0-9]{2}', u'', converted))
        return out

    regs = [u'@00\u05e4\u05e8\u05e7 [\u05d0-\u05ea]{1,2}', u'@22[\u05d0-\u05ea]{1,2}']
    with codecs.open(filename, 'r', 'utf-8') as infile:
        return file_to_ja(3, infile, regs, cleaner).array()
Exemplo n.º 7
0
def produce_parsed_data(filename):
    """Parse *filename* into a structure re-keyed by the siman gematria values."""

    with codecs.open(filename, 'r', 'utf-8') as datafile:
        parsed = util.file_to_ja(3, datafile, (m_pattern, comment_pattern), nothing)
        # rewind so the section names can be scanned from the same handle
        datafile.seek(0)
        raw_names = util.grab_section_names(m_pattern, datafile, 1)
        numbers = [int(util.getGematria(name)) for name in raw_names]

    keyed = util.simple_to_complex(numbers, parsed.array())
    return util.convert_dict_to_array(keyed)
Exemplo n.º 8
0
def parse():
    """Parse every Torah book file into a dict of nested structures by parasha."""
    book_names = library.get_indexes_in_category('Torah')
    names = node_names()
    parsed = {}
    strip_patterns = [u'@[0-9]{2}', u'\?']
    for book_num, path in enumerate(filenames()):
        title = book_names[book_num]
        with codecs.open(path, 'r', 'utf-8') as infile:
            raw_ja = util.file_to_ja([[[]]], infile, [u'@88', u'@44'], sefat_parse_helper)
            parsed[title] = util.clean_jagged_array(raw_ja.array(), strip_patterns)
    # nest each book first by parasha, then by the per-parasha mapping
    for book in book_names:
        parashot = names[book].keys()
        parsed[book] = util.simple_to_complex(parashot, parsed[book])
        for parsha in parashot:
            parsed[book][parsha] = util.simple_to_complex(names[book][parsha], parsed[book][parsha])

    return parsed
Exemplo n.º 9
0
def parse():
    """Parse each Torah book (segments split on @88) into a dict keyed by book."""
    book_names = library.get_indexes_in_category('Torah')
    names = node_names()
    parsed = {}
    for book_name, filename in zip(book_names, filenames()):
        with codecs.open(filename, 'r', 'utf-8') as infile:
            raw = util.file_to_ja(2, infile, [u'@88'], sefat_parse_helper).array()
            parsed[book_name] = util.clean_jagged_array(raw, [u'@[0-9]{2}', u'\?'])
    # re-key each book's flat list by its parasha names
    for book in book_names:
        parsed[book] = util.simple_to_complex(names[book].keys(), parsed[book])

    return parsed
Exemplo n.º 10
0
def produce_parsed_data(filename):
    """Read *filename*, parse it, and re-key the result by siman gematria."""

    with codecs.open(filename, 'r', 'utf-8') as datafile:
        parsed = util.file_to_ja([[[]]], datafile,
                                 (m_pattern, comment_pattern), nothing)
        datafile.seek(0)  # re-scan names from the start of the same file
        names = [
            int(util.getGematria(name))
            for name in util.grab_section_names(m_pattern, datafile, 1)
        ]

    comp_text = util.simple_to_complex(names, parsed.array())
    return util.convert_dict_to_array(comp_text)
Exemplo n.º 11
0
def parse_and_post(filename, index_key):
    """Parse the Mishnah source file *filename* and post it under *index_key*.

    :param filename: path to the source txt file
    :param index_key: index title that the text version is posted to
    """
    with codecs.open(filename, 'r', 'utf-8') as source_file:
        data = util.file_to_ja([[]], source_file, [u'@00'], structure_boaz)
        data = util.clean_jagged_array(data.array(), strip_list)
        source_file.seek(0)  # re-read the same file to align the chapters
        data = align_boaz_chapters(source_file, data)

    text_version = {
        'versionTitle': u'Mishnah, ed. Romm, Vilna 1913',
        # fixed: the scheme was doubled ('http://http://...'), making the URL invalid
        'versionSource':
        'http://primo.nli.org.il/primo_library/libweb/action/dlDisplay.do?vid=NLI&docId=NNL_ALEPH001741739',
        'language': 'he',
        'text': data
    }
    functions.post_text(index_key, text_version)
Exemplo n.º 12
0
def parse_general(filename):
    """Parse a generic source file into [chapter, section, segment]."""

    def cleaner(my_text):
        # turn @31/@32 into bold tags, then drop every other @NN control code
        return [
            re.sub(u'@[0-9]{2}', u'',
                   multiple_replace(line, {u'@31': u'<b>', u'@32': u'</b>'}))
            for line in my_text
        ]

    chapter_and_section = [
        u'@00\u05e4\u05e8\u05e7 [\u05d0-\u05ea]{1,2}',
        u'@22[\u05d0-\u05ea]{1,2}'
    ]
    with codecs.open(filename, 'r', 'utf-8') as infile:
        return file_to_ja(3, infile, chapter_and_section, cleaner).array()
Exemplo n.º 13
0
def parse_Raph_by_letter(filename):
    '''parsing according to the letters, is the main ja, to post for the raph'''
    def cleaner(my_text):
        replace_dict = {
            u'@(?:11|77)[\u05d0-\u05ea]{0,3}': u'',
            u'@33': u''
        }  #{u'@11(.*?)@12': ur'<b>\1</b>', u'@33(.*?)@34': ur'<b>\1</b>', u'@33(.*?)@34': ur'<b>\1</b>', u'@66(.*?)@67': ur'\1'}
        new = []
        for line in my_text:
            line = multiple_replace(line, replace_dict, using_regex=True)
            new.append(line)
        return new

    regs = [ur'@11(?P<gim>[\u05d0-\u05ea]{1,3})']
    with codecs.open(filename, 'r', 'utf-8') as fp:
        lines = fp.readlines()
    starting = None
    # check if we got to the end of the legend and change to started
    # clean all lines of days start with @00
    cleaned = []
    for line_num, line in enumerate(lines):
        if line == u'\n':
            starting = line_num + 1
            break
    for line_num, line in enumerate(lines[starting:]):
        if not re.search(u'@00', line) and not line.isspace():
            line = re.split(u'(@11[\u05d0-\u05ea]{0,3})', line)
            if isinstance(line, basestring):
                cleaned.append(line)
            else:
                [cleaned.append(st.strip()) for st in line if st]
    new_ja = regs_devide(cleaned, regs)
    try:
        # ja = file_to_ja_g(2, cleaned, regs, cleaner, gimatria=True,  grab_all=[True, True], group_name='gim').array()
        ja = file_to_ja(2, cleaned, regs, cleaner, grab_all=False).array()
    except AttributeError:
        print 'there are more regs then levels...'

    # ja_to_xml(new_ja, ['Alef', 'letter', 'segments'], 'raph_letters.xml')
    ja_to_xml(ja, ['letter', 'segments'], 'raph_letters.xml')

    return ja
Exemplo n.º 14
0
def parse_Raph_by_letter(filename):
    '''parsing according to the letters, is the main ja, to post for the raph'''
    def cleaner(my_text):
        replace_dict = {u'@(?:11|77)[\u05d0-\u05ea]{0,3}': u'', u'@33': u''}#{u'@11(.*?)@12': ur'<b>\1</b>', u'@33(.*?)@34': ur'<b>\1</b>', u'@33(.*?)@34': ur'<b>\1</b>', u'@66(.*?)@67': ur'\1'}
        new = []
        for line in my_text:
            line = multiple_replace(line, replace_dict, using_regex=True)
            new.append(line)
        return new

    regs = [ur'@11(?P<gim>[\u05d0-\u05ea]{1,3})']
    with codecs.open(filename, 'r', 'utf-8') as fp:
        lines = fp.readlines()
    starting = None
    # check if we got to the end of the legend and change to started
    # clean all lines of days start with @00
    cleaned = []
    for line_num, line in enumerate(lines):
        if line == u'\n':
            starting = line_num + 1
            break
    for line_num, line in enumerate(lines[starting:]):
        if not re.search(u'@00', line) and not line.isspace():
            line = re.split(u'(@11[\u05d0-\u05ea]{0,3})', line)
            if isinstance(line, basestring):
                cleaned.append(line)
            else:
                [cleaned.append(st.strip()) for st in line if st]
    new_ja = regs_devide(cleaned, regs)
    try:
        # ja = file_to_ja_g(2, cleaned, regs, cleaner, gimatria=True,  grab_all=[True, True], group_name='gim').array()
        ja = file_to_ja(2, cleaned, regs, cleaner, grab_all=False).array()
    except AttributeError:
        print 'there are more regs then levels...'

    # ja_to_xml(new_ja, ['Alef', 'letter', 'segments'], 'raph_letters.xml')
    ja_to_xml(ja, ['letter', 'segments'], 'raph_letters.xml')

    return ja
Exemplo n.º 15
0
    return index


def post_text_and_index(text_struct, section_names):
    """Build and post the index, then post one text version per section."""
    functions.post_index(build_index(section_names))

    for section_num, section in enumerate(section_names):
        version = {
            "versionTitle": 'Noda BeYehuda Warsaw 1880',
            "versionSource": 'http://primo.nli.org.il/primo_library/libweb/action/dlDisplay.do?vid=NLI&docId=NNL_ALEPH001983501',
            "language": 'he',
            "text": text_struct[section_num]
        }
        functions.post_text('Noda BeYehuda, {}'.format(section), version)

# Split markers for the jagged array: presumably @00 opens a chelek/siman
# level and @22 the next level down -- TODO confirm against the source file.
patterns = [u'@00', u'@22']
# Hebrew level names used only for the human-readable dump below.
names = [u'חלק', u'סימן', u'טקסט']
section_names = ['Orach Chaim', 'Yoreh Deah', 'Even HaEzer', 'Choshen Mishpat']
# `noda_file` is an already-open handle defined elsewhere in this module.
parsed = util.file_to_ja([[[]]], noda_file, patterns, clean_and_align)
# Write a reviewable copy of the parse before posting anything.
with codecs.open('testfile.txt', 'w', 'utf-8') as check_parse:
    util.jagged_array_to_file(check_parse, parsed.array(), names)

post_text_and_index(parsed.array(), section_names)

noda_file.close()
# presumably 'errors.html' was produced by an earlier step -- TODO confirm
os.remove('errors.html')
Exemplo n.º 16
0

def post_text_and_index(text_struct, section_names):
    """Post the index, then a text version for each entry in *section_names*."""
    index = build_index(section_names)
    functions.post_index(index)

    source_url = ('http://primo.nli.org.il/primo_library/libweb/action/'
                  'dlDisplay.do?vid=NLI&docId=NNL_ALEPH001983501')
    for position, name in enumerate(section_names):
        functions.post_text(
            'Noda BeYehuda, {}'.format(name),
            {
                "versionTitle": 'Noda BeYehuda Warsaw 1880',
                "versionSource": source_url,
                "language": 'he',
                "text": text_struct[position]
            }
        )


# Split markers for the jagged array: presumably @00 opens a chelek/siman
# level and @22 the next level down -- TODO confirm against the source file.
patterns = [u'@00', u'@22']
# Hebrew level names used only for the human-readable dump below.
names = [u'חלק', u'סימן', u'טקסט']
section_names = ['Orach Chaim', 'Yoreh Deah', 'Even HaEzer', 'Choshen Mishpat']
# `noda_file` is an already-open handle defined elsewhere in this module.
parsed = util.file_to_ja([[[]]], noda_file, patterns, clean_and_align)
# Write a reviewable copy of the parse before posting anything.
with codecs.open('testfile.txt', 'w', 'utf-8') as check_parse:
    util.jagged_array_to_file(check_parse, parsed.array(), names)

post_text_and_index(parsed.array(), section_names)

noda_file.close()
# presumably 'errors.html' was produced by an earlier step -- TODO confirm
os.remove('errors.html')