def file_to_ja(structure, infile, expressions, cleaner, grab_all=False):
    """
    First stage of a reusable parsing tool: walk through lines of text and file them
    into a JaggedArray under the section (chapter, verse, etc.) they belong to.

    :param structure: A nested list one level shallower than the final result.
        Example: for a depth 2 text, structure should be [[]].
    :param infile: Iterable of text lines, e.g. an open text file.
    :param expressions: One regular expression per section level, ordered from the
        outermost level inward. Do not include an expression for breaking up the
        actual text.
    :param cleaner: Function that receives the list of lines collected for one section
        and returns them as a correctly broken-up array. It should also strip
        unnecessary tagging data.
    :param grab_all: If true, a line that opens a new section is kept as the first
        line of the text collected after it.
    :return: A JaggedArray with the text properly structured.
    :raises AttributeError: if len(expressions) is not exactly depth - 1.
    """
    result = jagged_array.JaggedArray(structure)

    # An empty structure counts as depth 1; anything else is measured by the JaggedArray.
    depth = 1 if structure == [] else result.get_depth()

    # Every level except the lowest must have its own expression.
    if depth - len(expressions) != 1:
        raise AttributeError('Not enough data to parse. Need {} expressions, '
                             'received {}'.format(depth - 1, len(expressions)))

    patterns = [re.compile(expression) for expression in expressions]
    position = [-1] * len(expressions)  # -1 marks a level that has not been reached yet
    pending = []

    for line in infile:
        # Find the first level whose expression matches this line, if any.
        hit_level = None
        for level, pattern in enumerate(patterns):
            if pattern.search(line):
                hit_level = level
                break

        if hit_level is None:
            # Ordinary text: only collected once every level has a valid index.
            if -1 not in position:
                pending.append(line)
            continue

        # A header closes the section in progress, provided one was fully opened.
        if -1 not in position:
            result.set_element(position, cleaner(pending))
            pending = []
            # NOTE(review): because this sits inside the flush branch, the header of the
            # very first section is not kept even when grab_all is set - confirm intended.
            if grab_all:
                pending.append(line)

        # Advance the level that was hit. Deeper levels restart at 0 (not -1), so text
        # following the header is collected straight away at sub-index 0.
        position[hit_level] += 1
        position[hit_level + 1:] = [0] * len(position[hit_level + 1:])

    # Store whatever was collected for the final section.
    result.set_element(position, cleaner(pending))
    return result
def test_ja_normalize(self):
    """
    normalize() is expected to bring every branch of a ragged input to the same
    depth: bare strings get wrapped in lists and empty strings become empty lists.
    """
    ragged = ["a", [], ["", "a", ["c"]], ["", ""], ["b"]]
    expected = [[["a"]], [], [[], ["a"], ["c"]], [[], []], [["b"]]]

    jagged = ja.JaggedArray(ragged)
    jagged.normalize()

    assert jagged.array() == expected
def file_to_ja_g(depth, infile, expressions, cleaner, gimatria=False, group_name='gim', grab_all=None):
    """
    Like file_to_ja, but able to take section numbers from the headers themselves
    (via getGematria) instead of simply counting headers.

    Designed to be the first stage of a reusable parsing tool. Adds lines of text to
    the JaggedArray in the desired structure (chapter, verse, etc.).

    :param depth: Depth of the JaggedArray to build.
    :param infile: Iterable of text lines, e.g. an open text file.
    :param expressions: One regular expression per section level, ordered from the
        outermost level inward. Do not include an expression for breaking up the
        segment level.
    :param cleaner: Function that takes a list of strings and returns an array with
        the text parsed correctly. It should also break up and remove unnecessary
        tagging data.
    :param gimatria: If true, the named group `group_name` of the matching expression
        is passed to getGematria; a non-zero result N places the section at index
        N - 1. A zero result falls back to incrementing the index.
    :param group_name: Name of the regex group holding the letters handed to
        getGematria.
    :param grab_all: Per-expression flags: if the flag for the matching expression is
        true, the header line is kept in the collected text, otherwise it is dropped.
        May be a sequence (missing trailing entries count as False), a single boolean
        applied to every level, or None (the default) for all False.
    :return: A JaggedArray with the text properly structured.
    :raises AttributeError: if len(expressions) is not exactly depth - 1.
    """
    # Build the empty skeleton: [] for depth 1, [[]] for depth 2, and so on.
    structure = []
    for _ in range(depth - 1):
        structure = [structure]
    ja = jagged_array.JaggedArray(structure)

    # Ensure there is a regex for every level except the lowest.
    if depth - len(expressions) != 1:
        raise AttributeError('Not enough data to parse. Need {} expressions, '
                             'received {}'.format(depth - 1, len(expressions)))

    # Normalise grab_all to exactly one flag per expression, so that neither a deep
    # structure nor a short caller-supplied list can index out of range.
    if grab_all is None or isinstance(grab_all, bool):
        grab_flags = [bool(grab_all)] * len(expressions)
    else:
        grab_flags = list(grab_all)
        grab_flags += [False] * (len(expressions) - len(grab_flags))

    # Compile regexes, instantiate index list (-1 marks a level not reached yet).
    regexes = [re.compile(ex) for ex in expressions]
    indices = [-1] * len(expressions)
    temp = []

    for line in infile:
        # Check for matches to the regexes, outermost level first.
        for i, reg in enumerate(regexes):
            found = reg.search(line)
            if not found:
                continue

            # A header closes the section in progress, provided one was fully opened.
            if -1 not in indices:
                ja.set_element(indices, cleaner(temp), [])
                temp = []
                # NOTE(review): inside the flush branch, so a header met while some
                # level is still unset is never kept - confirm intended.
                if grab_flags[i]:
                    temp.append(line)

            # Position the level that was hit: from the header's own number when
            # available, otherwise by counting.
            gimt = getGematria(found.group(group_name)) if gimatria else 0
            if gimt != 0:
                indices[i] = gimt - 1
            else:
                indices[i] += 1

            # Deeper levels go back to "not reached", so text is only collected again
            # once each of them has seen its own header.
            indices[i + 1:] = [-1 if x >= 0 else x for x in indices[i + 1:]]
            break
        else:
            # Ordinary text: only collected once every level has a valid index.
            if -1 not in indices:
                temp.append(line)

    # Store whatever was collected for the final section.
    ja.set_element(indices, cleaner(temp), [])
    return ja
sys.path.insert(0, p) from sources.local_settings import * sys.path.insert(0, SEFARIA_PROJECT_PATH) os.environ['DJANGO_SETTINGS_MODULE'] = "sefaria.settings" from sefaria.model import * from sources.functions import numToHeb, getGematria, post_index, post_text from sefaria.datatype import jagged_array from data_utilities.util import ja_to_xml, traverse_ja reload(sys) sys.setdefaultencoding("utf-8") simanim_ja = jagged_array.JaggedArray([[[]] ]) #JA of Simanim[Seifim[comments]]] def soupAndOpen(filename): with open(filename, "r") as file: page = file.read() return BeautifulSoup(page) def is_titled_seif(tag): return tag.has_attr('title') and u"סעיף" in tag['title'] def getSeifNumber(txt): assert u"סעיף" in txt seif_number_he = txt.split(' ')[1]