def parse_the_text(file_name_teshuvot, file_name_footnotes, dictionary):
    """Parse a teshuvot file and its footnotes file, then post both.

    Builds indices, links, and cleaned texts for one part of Teshuvot
    haRashba and uploads everything via the ``functions`` helpers.
    """
    responsa_ja = function.parse(file_name_teshuvot)
    notes_ja = function.parse(file_name_footnotes)
    links = function.create_links(responsa_ja, dictionary)
    index_teshuvot = function.create_index(dictionary)
    # NOTE(review): `footnotes` and `footnotes_hebrew` are not defined in this
    # scope -- presumably module-level globals elsewhere in the file; confirm.
    index_footnotes = function.create_index(dictionary, footnotes, footnotes_hebrew)
    # Strip paragraph numbers and plus-sign markers left over from the parse.
    responsa_ja = util.clean_jagged_array(responsa_ja, ['\d+', '\+'])
    notes_ja = util.clean_jagged_array(notes_ja, ['\d+', '\+'])
    text_teshuvot = function.create_text(dictionary, responsa_ja)
    text_footnotes = function.create_text(dictionary, notes_ja)
    functions.post_index(index_teshuvot)
    functions.post_index(index_footnotes)
    functions.post_text_weak_connection(
        'Teshuvot haRashba part {}'.format(dictionary['roman numeral']),
        text_teshuvot)
    functions.post_text_weak_connection(
        'Footnotes to Teshuvot haRashba part {}'.format(dictionary['roman numeral']),
        text_footnotes)
    functions.post_link_weak_connection(links)
def post():
    """Parse Minchat Chinuch, link it to Sefer HaChinukh, and upload all of it.

    Side effects: writes 'links.txt' and 'parsed.txt' locally, then posts the
    index, the full text, and the links to the server.
    """
    minchat = {'name': 'Minchat Chinuch', 'text': produce_parsed_data(filename)}
    sefer = {'name': 'Sefer HaChinukh', 'text': Ref('Sefer HaChinukh').text('he').text}
    chinukh_links = find_links(minchat, sefer, grab_dh, u'<b>', u'</b>')
    # Dump the discovered refs for manual inspection.
    with codecs.open('links.txt', 'w', 'utf-8') as link_file:
        for link in chinukh_links:
            link_file.write(u'{}\n'.format(link['refs']))
    alt = construct_alt_struct('Chinukh_by_Parsha.csv', 'Chinukh Mitzva names.csv')
    cleaned = util.clean_jagged_array(
        minchat['text'],
        [m_pattern, comment_pattern, u'@[0-9]{2}', u'\n', u'\r'])
    # Dump the cleaned jagged array for manual inspection.
    with codecs.open('parsed.txt', 'w', 'utf-8') as parsed_file:
        util.jagged_array_to_file(parsed_file, cleaned, [u'Mitzva', u'Seif', u'Paragraph'])
    full_text = {
        'versionTitle': 'Minchat Chinuch, Piotrków, 1902',
        'versionSource': 'http://primo.nli.org.il/primo_library/libweb/action/dlDisplay.do?vid=NLI&docId=NNL_ALEPH001175092',
        'language': 'he',
        'text': cleaned,
    }
    index = construct_index(alt)
    functions.post_index(index)
    functions.post_text('Minchat Chinuch', full_text)
    functions.post_link(chinukh_links)
def parse_the_text(file_name_teshuvot, file_name_footnotes, dictionary):
    """Parse and post one part of Teshuvot haRashba together with its footnotes."""
    teshuvot_ja = function.parse(file_name_teshuvot)
    footnotes_ja = function.parse(file_name_footnotes)

    links = function.create_links(teshuvot_ja, dictionary)
    index_teshuvot = function.create_index(dictionary)
    # NOTE(review): `footnotes` and `footnotes_hebrew` are undefined here --
    # they look like globals defined elsewhere in the file; verify before use.
    index_footnotes = function.create_index(dictionary, footnotes, footnotes_hebrew)

    # Remove residual numbering and '+' markers from both jagged arrays.
    cleanup_patterns = ['\d+', '\+']
    teshuvot_ja = util.clean_jagged_array(teshuvot_ja, cleanup_patterns)
    footnotes_ja = util.clean_jagged_array(footnotes_ja, cleanup_patterns)

    text_teshuvot = function.create_text(dictionary, teshuvot_ja)
    text_footnotes = function.create_text(dictionary, footnotes_ja)

    functions.post_index(index_teshuvot)
    functions.post_index(index_footnotes)
    functions.post_text_weak_connection(
        'Teshuvot haRashba part {}'.format(dictionary['roman numeral']),
        text_teshuvot)
    functions.post_text_weak_connection(
        'Footnotes to Teshuvot haRashba part {}'.format(dictionary['roman numeral']),
        text_footnotes)
    functions.post_link_weak_connection(links)
def parse():
    """Parse each Torah book's source file into a nested (complex) structure.

    Returns a dict mapping book name -> jagged array keyed by parsha, then
    by the section names supplied by node_names().
    """
    book_names = library.get_indexes_in_category('Torah')
    names = node_names()
    parsed = {}
    for book_num, filename in enumerate(filenames()):
        with codecs.open(filename, 'r', 'utf-8') as infile:
            raw = util.file_to_ja(
                [[[]]], infile, [u'@88', u'@44'], sefat_parse_helper).array()
        # Strip remaining @NN control codes and stray question marks.
        parsed[book_names[book_num]] = util.clean_jagged_array(
            raw, [u'@[0-9]{2}', u'\?'])
    # Convert each flat array into the complex parsha/section structure.
    for book in book_names:
        parashot = names[book].keys()
        parsed[book] = util.simple_to_complex(parashot, parsed[book])
        for parsha in parashot:
            parsed[book][parsha] = util.simple_to_complex(
                names[book][parsha], parsed[book][parsha])
    return parsed
def parse():
    """Parse every Torah book file and group each book by its parashot.

    Returns a dict mapping book name -> complex jagged array keyed by parsha.
    """
    book_names = library.get_indexes_in_category('Torah')
    names = node_names()
    parsed = {}
    # Pair each book name with its source file in order.
    for book_name, filename in zip(book_names, filenames()):
        with codecs.open(filename, 'r', 'utf-8') as source:
            depth_two = util.file_to_ja(
                2, source, [u'@88'], sefat_parse_helper).array()
        # Drop leftover @NN control codes and stray question marks.
        parsed[book_name] = util.clean_jagged_array(
            depth_two, [u'@[0-9]{2}', u'\?'])
    for book in book_names:
        parsed[book] = util.simple_to_complex(names[book].keys(), parsed[book])
    return parsed
def parse_and_post(filename, index_key):
    """Parse *filename* into a jagged array, align Boaz chapters, and post it.

    :param filename: path to the @00-delimited UTF-8 source file.
    :param index_key: index title under which the text version is posted.
    """
    with codecs.open(filename, 'r', 'utf-8') as source_file:
        data = util.file_to_ja([[]], source_file, [u'@00'], structure_boaz)
        data = util.clean_jagged_array(data.array(), strip_list)
        # Second pass over the same file to align the Boaz chapter divisions.
        source_file.seek(0)
        data = align_boaz_chapters(source_file, data)
    text_version = {
        'versionTitle': u'Mishnah, ed. Romm, Vilna 1913',
        # Bug fix: the scheme was duplicated ("http://http://..."), which
        # produced an invalid versionSource URL.
        'versionSource': 'http://primo.nli.org.il/primo_library/libweb/action/dlDisplay.do?vid=NLI&docId=NNL_ALEPH001741739',
        'language': 'he',
        'text': data,
    }
    functions.post_text(index_key, text_version)
def post():
    """Upload Minchat Chinuch: index, full Hebrew text, and Chinukh links.

    Also writes two local debug files ('links.txt' and 'parsed.txt').
    """
    minchat = {
        'name': 'Minchat Chinuch',
        'text': produce_parsed_data(filename),
    }
    sefer = {
        'name': 'Sefer HaChinukh',
        'text': Ref('Sefer HaChinukh').text('he').text,
    }
    chinukh_links = find_links(minchat, sefer, grab_dh, u'<b>', u'</b>')

    # Record every discovered link ref for review.
    with codecs.open('links.txt', 'w', 'utf-8') as refs_out:
        for each_link in chinukh_links:
            refs_out.write(u'{}\n'.format(each_link['refs']))

    alt = construct_alt_struct('Chinukh_by_Parsha.csv', 'Chinukh Mitzva names.csv')
    cleaned = util.clean_jagged_array(
        minchat['text'],
        [m_pattern, comment_pattern, u'@[0-9]{2}', u'\n', u'\r'])

    # Record the cleaned parse for review.
    with codecs.open('parsed.txt', 'w', 'utf-8') as parse_out:
        util.jagged_array_to_file(
            parse_out, cleaned, [u'Mitzva', u'Seif', u'Paragraph'])

    full_text = {
        'versionTitle': 'Minchat Chinuch, Piotrków, 1902',
        'versionSource': 'http://primo.nli.org.il/primo_library/libweb/action/dlDisplay.do?vid=NLI&docId=NNL_ALEPH001175092',
        'language': 'he',
        'text': cleaned,
    }
    functions.post_index(construct_index(alt))
    functions.post_text('Minchat Chinuch', full_text)
    functions.post_link(chinukh_links)
""" tractate = { "versionTitle": "Vilna Mishna", "versionSource": "http://www.daat.ac.il/encyclopedia/value.asp?id1=836", "language": "he", "text": text, } print 'uploading {}'.format(text_name) functions.post_text(text_name, tractate) trello = open('trello_board.json') tracs = get_cards_from_trello('Parse Mishnah', trello) trello.close() for trac in tracs: name = re.search(u'[\u05d0-\u05ea].+', trac) name = name.group().replace(u'משנה', u'משניות') infile = codecs.open(u'{}.txt'.format(name), 'r', 'utf-8') jagged = jaggedarray_from_file(infile, u'@00(?:\u05e4\u05e8\u05e7 |\u05e4)([\u05d0-\u05ea"]{1,3})', u'@22[\u05d0-\u05ea]{1,2}', u'@99') parsed = util.clean_jagged_array(jagged, clean_list()) infile.close() en_name = re.search(u'[a-zA-Z ]+', trac).group().rstrip() if en_name not in tractates: print '{} not a valid ref'.format(en_name) sys.exit(1) upload(parsed, en_name)
def test_clean_jagged_array():
    """clean_jagged_array strips '&' and returns a new array (no mutation)."""
    dirty = [['foo&', 'bar&'], ['hello&', 'world&']]
    expected = [['foo', 'bar'], ['hello', 'world']]

    cleaned = util.clean_jagged_array(dirty, ['&'])

    assert cleaned == expected
    # The input must not have been cleaned in place.
    assert cleaned != dirty
""" parse the text create links create index clean the parse create text post """ import codecs import regex from sefaria.model import * from data_utilities import util from sources.Yad_Ramah import function from sources import functions sanhedrin_ja = TextChunk(Ref('Sanhedrin'), 'he').text yad_ramah = function.parse('yad_ramah.txt') yad_ramah = util.clean_jagged_array(yad_ramah, ['(@22)', '(@100)']) index = function.create_index() text = function.create_text(yad_ramah) links = function.create_links(sanhedrin_ja, yad_ramah) functions.post_index(index) functions.post_text('Yad Ramah on Sanhedrin', text) functions.post_link(links) # hello = codecs.open("hello.txt", 'w', 'utf-8') # util.jagged_array_to_file(hello, yad_ramah,['Page', 'Comment']) # hello.close()
tractate = { "versionTitle": "Vilna Mishna", "versionSource": "http://www.daat.ac.il/encyclopedia/value.asp?id1=836", "language": "he", "text": text, } print 'uploading {}'.format(text_name) functions.post_text(text_name, tractate) trello = open('trello_board.json') tracs = get_cards_from_trello('Parse Mishnah', trello) trello.close() for trac in tracs: name = re.search(u'[\u05d0-\u05ea].+', trac) name = name.group().replace(u'משנה', u'משניות') infile = codecs.open(u'{}.txt'.format(name), 'r', 'utf-8') jagged = jaggedarray_from_file( infile, u'@00(?:\u05e4\u05e8\u05e7 |\u05e4)([\u05d0-\u05ea"]{1,3})', u'@22[\u05d0-\u05ea]{1,2}', u'@99') parsed = util.clean_jagged_array(jagged, clean_list()) infile.close() en_name = re.search(u'[a-zA-Z ]+', trac).group().rstrip() if en_name not in tractates: print '{} not a valid ref'.format(en_name) sys.exit(1) upload(parsed, en_name)