def get_from_db(word):
    ref_results = []
    conn = get_connection()

    # GET THE METADATA STUFF
    sql = 'SELECT * from {} WHERE _id like ?'.format(WORDS_2_REF_NUMS)
    _id, blob = conn.cursor().execute(sql, (METADATA_CHUNKS_PACKETSIZE,)).fetchall()[0]
    hex_str = make_little_endian(blob)
    chunk_size = int(hex_str[0:8], 16)
    PACKET_SIZE = int(hex_str[8:16], 16)  # 3 * 8 # 3 bytes of bits * 8 bits per byte
    print(_id, blob, hex_str, len(hex_str), chunk_size, PACKET_SIZE)  # 200, 24 ... this looks correct

    ##
    sql = 'SELECT * from {} where _id like ?'.format(WORDS_2_REF_NUMS)
    cur = conn.cursor()
    cur.execute(sql, (word,))
    for word_id, blob in cur.fetchall():
        chunk_start_nums = []
        hex_str = str(blob).encode("hex")
        print(hex_str)
        for index in range(0, len(hex_str), 8):  # each hex is half a byte and 4 bytes to a JH_packet
            packet = hex_str[index:index + 8]  # get just the packet
            packet_index = int(packet[:2], 16)  # first byte is the packet_index
            # take last 3 bytes as the bitstring of 0 vs. 1 if contains `_id` keyword
            packet_bits = bin(int(packet[2:], 16))[2:].zfill(PACKET_SIZE)
            # for each of the bytes reverse the bits inside of it (make little endian)
            packet_bits = ''.join([packet_bits[i - 8:i][::-1] for i in range(8, len(packet_bits) + 1, 8)])
            for bit_index, bit in enumerate(packet_bits):
                if bit == '1':
                    chunk_start_num = (packet_index * PACKET_SIZE + bit_index) * chunk_size
                    chunk_start_nums.append(chunk_start_num)
                    print(index, packet, packet_index, packet_bits, chunk_start_num)

        title_id = -1
        for chunk_start_num in chunk_start_nums:
            # get all part names from chunk_start to chunk_start + size of the chunk
            sql = 'SELECT _rowid_, value from {} where _rowid_ BETWEEN ? AND ?'.format(REF_NUM_2_PART)
            cur.execute(sql, (chunk_start_num + 1, chunk_start_num + chunk_size + 1))
            for ref_row_id, part in cur.fetchall():
                ref_num = ref_row_id - 1
                if ref_num > title_id:
                    # get book title
                    sql = 'SELECT * from {} where _id <= ? order by `_id` desc limit 1'.format(REF_NUM_MIN_N_TITLE)
                    cur.execute(sql, (ref_num,))
                    rows = cur.fetchall()
                    title_id, title = rows[0]
                ref_str = '{}, {}'.format(title, part)
                r = Ref(ref_str)
                text = str(r.text('en').text)
                if word_id in text:  # the word_id should appear as is, in at least one of the texts within the chunk
                    print('found word in', r)
                    ref_results.append(r)
    return ref_results
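The packet decoding above can be exercised on its own. The sketch below is a minimal, hedged reconstruction of that logic, using the example values the metadata query prints (chunk_size=200, PACKET_SIZE=24); the packet bytes are invented for illustration and `decode_packet` is not part of the real module.

def decode_packet(packet_hex, chunk_size=200, packet_size=24):
    """Decode one 4-byte packet (8 hex chars) into chunk start numbers."""
    packet_index = int(packet_hex[:2], 16)                      # first byte: which packet this is
    bits = bin(int(packet_hex[2:], 16))[2:].zfill(packet_size)  # last 3 bytes as a 24-bit string
    # reverse the bits inside each byte (little-endian bit order, as in the function above)
    bits = ''.join(bits[i - 8:i][::-1] for i in range(8, len(bits) + 1, 8))
    return [(packet_index * packet_size + bit_index) * chunk_size
            for bit_index, bit in enumerate(bits) if bit == '1']

# hypothetical packet: index byte 0x02, payload 0x800001
print(decode_packet('02800001'))  # [11000, 12800]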
def get_from_word_2_ref(word, words_2_ref_nums, ref_num_min_N_title, ref_num_2_part, ref_num_2_full_name):
    ref_nums = words_2_ref_nums.get(word, [])
    print(('ref_nums', ref_nums))
    ref_strs = []
    for ref_num in ref_nums:
        for ref_num_min, temp_title in ref_num_min_N_title:
            if ref_num_min > ref_num:
                break
            title = temp_title
        full_ref_str = '{}{}'.format(title, ref_num_2_part[ref_num])
        print(('full_ref_str', ref_num, full_ref_str))
        ref_strs.append(full_ref_str)
        continue
        # NOTE: the `continue` above currently skips the verification against ref_num_2_full_name below
        # TODO: test weird part refs and make more complete and test that word always shows up in texts
        if full_ref_str != ref_num_2_full_name[ref_num]:
            print(('DIFF:',))
            print(('try', full_ref_str))
            print(('real', ref_num_2_full_name[ref_num]))
        else:
            # print('same', full_ref_str)
            ref_strs.append(full_ref_str)
    for r in ref_strs:
        full_text = ' '.join(Ref(r).text('en').text)
        print(('text from ref search', r, word in full_text))  # full_text.replace(word, '_____' + word + '_____'))
    return ref_strs
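The inner loop above finds the owning title for a ref_num by scanning the sorted (ref_num_min, title) pairs until the first boundary that exceeds it. A hedged sketch of the same lookup with `bisect`; the boundary values below are invented:

import bisect

# (ref_num_min, title) pairs, sorted by ref_num_min; the numbers are made up for illustration
ref_num_min_N_title = [(0, "Genesis"), (1533, "Exodus"), (2746, "Leviticus")]
mins = [m for m, _ in ref_num_min_N_title]

def title_for(ref_num):
    # last entry whose ref_num_min is <= ref_num
    return ref_num_min_N_title[bisect.bisect_right(mins, ref_num) - 1][1]

print(title_for(1600))  # Exodus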
def _single_lookup(cls, input_word, lookup_key='form', **kwargs):
    from sefaria.utils.hebrew import is_hebrew, strip_cantillation, has_cantillation
    from sefaria.model import Ref
    lookup_ref = kwargs.get("lookup_ref", None)
    wform_pkey = lookup_key
    if is_hebrew(input_word):
        input_word = strip_cantillation(input_word)
        """if not has_cantillation(input_word, detect_vowels=True):
            wform_pkey = 'c_form'"""
    query_obj = {wform_pkey: input_word}
    if lookup_ref:
        nref = Ref(lookup_ref).normal()
        query_obj["refs"] = {'$regex': '^{}'.format(nref)}
    form = WordForm().load(query_obj)
    if not form and lookup_ref:
        del query_obj["refs"]
        form = WordForm().load(query_obj)
    if form:
        result = []
        headword_query = []
        for lookup in form.lookups:
            headword_query.append({'headword': lookup['headword']})
        # TODO: if we want the 'lookups' in wf to be a dict we can pass as is to the lexiconentry,
        # we need to change the key 'lexicon' to 'parent_lexicon' in word forms
        return headword_query
    else:
        return []
def not_in_cache(self, title):
    self.assertFalse(title in library._index_map)
    self.assertTrue(title not in library.full_title_list())
    self.assertTrue(title not in json.loads(library.get_text_titles_json()))
    self.assertFalse(any(key.startswith(title) for key, value in Ref._raw_cache().iteritems()))
def __init__(self, refRangeString, start, end, name):
    """
    Hosts a ref range and acceptable date parameters to help determine whether
    a date/ref combination meets the criteria for following a schedule.
    :param refRangeString: string, a ref range to be parsed by Ref()
    :param start: datetime
    :param end: datetime
    :param name: string, passed through to the DateRange
    """
    self.dateRange = DateRange(name, start, end)
    self.ref = Ref(refRangeString)
def test_post_to_default_node(self):
    text = {
        "text": [["BFoo", "PBar", "Dub Blitz"], ["GGGlam", "BBBlam", "Ber Flam"]],
        "versionTitle": "test_default_node",
        "versionSource": "www.sefaria.org",
        "language": "en",
    }
    response = c.post("/api/texts/Chofetz_Chaim,_Part_One,_The_Prohibition_Against_Lashon_Hara,_Principle_1",
                      {'json': json.dumps(text)})
    self.assertEqual(200, response.status_code)
    data = json.loads(response.content)
    self.assertTrue("error" not in data)
    subref = Ref("Chofetz_Chaim,_Part_One,_The_Prohibition_Against_Lashon_Hara,_Principle_1.2.3")
    assert TextChunk(subref, "en", "test_default_node").text == "Ber Flam"
def test_url_regex(self):
    if USE_VARNISH:
        assert v.url_regex(Ref("Exodus 15")) == r'Exodus(\\.15$|\\.15\\.)'
        assert v.url_regex(Ref("Exodus 15:15-17")) == r'Exodus(\\.15\\.15$|\\.15\\.15\\.|\\.15\\.16$|\\.15\\.16\\.|\\.15\\.17$|\\.15\\.17\\.)'
        assert v.url_regex(Ref("Yoma 14a")) == r'Yoma(\\.14a$|\\.14a\\.)'
        assert v.url_regex(Ref("Yoma 14a:12-15")) == r'Yoma(\\.14a\\.12$|\\.14a\\.12\\.|\\.14a\\.13$|\\.14a\\.13\\.|\\.14a\\.14$|\\.14a\\.14\\.|\\.14a\\.15$|\\.14a\\.15\\.)'
        assert v.url_regex(Ref("Yoma")) == r'Yoma($|\\.)'
        assert v.url_regex(Ref("Rashi on Genesis 1.1")) == r'Rashi\\_on\\_Genesis(\\.1\\.1$|\\.1\\.1\\.)'
def get_word_form_objects(cls, input_word, lookup_key='form', **kwargs):
    from sefaria.utils.hebrew import is_hebrew, strip_cantillation, has_cantillation
    from sefaria.model import Ref
    lookup_ref = kwargs.get("lookup_ref", None)
    wform_pkey = lookup_key
    if is_hebrew(input_word):
        input_word = strip_cantillation(input_word)
        if not has_cantillation(input_word, detect_vowels=True):
            wform_pkey = 'c_form'
    query_obj = {wform_pkey: input_word}
    if lookup_ref:
        nref = Ref(lookup_ref).normal()
        query_obj["refs"] = {'$regex': '^{}'.format(nref)}
    forms = WordFormSet(query_obj)
    if lookup_ref and len(forms) == 0:
        del query_obj["refs"]
        forms = WordFormSet(query_obj)
    return forms
def get_word_form_objects(cls, input_word, lookup_key='form', **kwargs):
    from sefaria.model import Ref
    lookup_ref = kwargs.get("lookup_ref", None)
    wform_pkey = lookup_key
    if is_hebrew(input_word):
        # This step technically used to happen in the main lookup method `lexicon_lookup` if there were
        # no initial results, but in cases where a consonantal form was supplied in the first place,
        # this optimizes queries.
        input_word = strip_cantillation(input_word)
        if not has_cantillation(input_word, detect_vowels=True):
            wform_pkey = 'c_form'
    query_obj = {wform_pkey: input_word}
    if lookup_ref:
        nref = Ref(lookup_ref).normal()
        query_obj["refs"] = {'$regex': '^{}'.format(nref)}
    forms = WordFormSet(query_obj)
    if lookup_ref and len(forms) == 0:
        del query_obj["refs"]
        forms = WordFormSet(query_obj)
    return forms
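Both variants of get_word_form_objects share the same fallback shape: query scoped to the ref first, then retry without the `refs` constraint when nothing matches. A generic sketch of that pattern, with placeholder names rather than the Sefaria API:

def scoped_lookup(run_query, query_obj, normalized_ref=None):
    """Run `run_query` scoped to `normalized_ref`; drop the scope if it finds nothing."""
    if normalized_ref:
        query_obj["refs"] = {'$regex': '^{}'.format(normalized_ref)}
    results = run_query(query_obj)
    if normalized_ref and len(results) == 0:
        del query_obj["refs"]
        results = run_query(query_obj)
    return results

# toy usage with an in-memory "collection" standing in for the word form store
records = [{"form": "foo", "refs": ["Genesis 1:1"]}, {"form": "foo", "refs": ["Exodus 2:2"]}]

def run_query(q):
    prefix = q.get("refs", {}).get("$regex", "^")[1:]  # strip the leading '^'
    return [r for r in records
            if r["form"] == q["form"] and any(ref.startswith(prefix) for ref in r["refs"])]

print(len(scoped_lookup(run_query, {"form": "foo"}, "Job 3")))  # 2 (scoped query is empty, so it falls back)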
def section_data(self, oref: model.Ref, default_versions: dict) -> dict:
    """
    :param oref: section level Ref instance
    :param default_versions: {'en': Version, 'he': Version}
    Returns a dictionary with all the data we care about for section level `oref`.
    """
    prev, next_ref = oref.prev_section_ref(vstate=self.version_state), oref.next_section_ref(vstate=self.version_state)
    data = {
        "ref": oref.normal(),
        "heRef": oref.he_normal(),
        "indexTitle": oref.index.title,
        "heTitle": oref.index.get_title('he'),
        "sectionRef": oref.normal(),
        "next": next_ref.normal() if next_ref else None,
        "prev": prev.normal() if prev else None,
        "content": [],
    }

    def get_version_title(chunk):
        if not chunk.is_merged:
            version = chunk.version()
            if version and version.language in default_versions and \
                    version.versionTitle != default_versions[version.language].versionTitle:
                # print "VERSION NOT DEFAULT {} ({})".format(oref, chunk.lang)
                try:
                    vnotes = version.versionNotes
                except AttributeError:
                    vnotes = None
                try:
                    vlicense = version.license
                except AttributeError:
                    vlicense = None
                try:
                    vsource = version.versionSource
                except AttributeError:
                    vsource = None
                try:
                    vnotesInHebrew = version.versionNotesInHebrew
                except AttributeError:
                    vnotesInHebrew = None
                try:
                    versionTitleInHebrew = version.versionTitleInHebrew
                except AttributeError:
                    versionTitleInHebrew = None
                return version.versionTitle, vnotes, vlicense, vsource, versionTitleInHebrew, vnotesInHebrew
            else:
                # default version
                return None, None, None, None, None, None
        else:
            # merged
            # print "MERGED SECTION {} ({})".format(oref, chunk.lang)
            all_versions = set(chunk.sources)
            merged_version = 'Merged from {}'.format(', '.join(all_versions))
            return merged_version, None, None, None, None, None

    node_title = oref.index_node.full_title()
    en_chunk, he_chunk = self._text_map[node_title]['en_chunk'], self._text_map[node_title]['he_chunk']
    en_vtitle, en_vnotes, en_vlicense, en_vsource, en_vtitle_he, en_vnotes_he = get_version_title(en_chunk)
    he_vtitle, he_vnotes, he_vlicense, he_vsource, he_vtitle_he, he_vnotes_he = get_version_title(he_chunk)
    if en_vtitle:
        data['versionTitle'] = en_vtitle
    if he_vtitle:
        data['heVersionTitle'] = he_vtitle
    if en_vnotes:
        data['versionNotes'] = en_vnotes
    if he_vnotes:
        data['heVersionNotes'] = he_vnotes
    if en_vlicense:
        data['license'] = en_vlicense
    if he_vlicense:
        data['heLicense'] = he_vlicense
    if en_vsource:
        data['versionSource'] = en_vsource
    if he_vsource:
        data['heVersionSource'] = he_vsource
    if en_vtitle_he:
        data['versionTitleInHebrew'] = en_vtitle_he
    if he_vtitle_he:
        data['heVersionTitleInHebrew'] = he_vtitle_he
    if en_vnotes_he:
        data['versionNotesInHebrew'] = en_vnotes_he
    if he_vnotes_he:
        data['heVersionNotesInHebrew'] = he_vnotes_he

    try:
        en_text = self._text_map[node_title]['en_ja'].get_element([j - 1 for j in oref.sections])
    except IndexError:
        en_text = []
    try:
        he_text = self._text_map[node_title]['he_ja'].get_element([j - 1 for j in oref.sections])
    except IndexError:
        he_text = []
    en_len = len(en_text)
    he_len = len(he_text)

    section_links = get_links(oref.normal(), False)
    anchor_ref_dict = defaultdict(list)
    for link in section_links:
        anchor_oref = model.Ref(link["anchorRef"])
        if not anchor_oref.is_segment_level() or len(anchor_oref.sections) == 0:
            continue  # don't bother with section level links
        start_seg_num = anchor_oref.sections[-1]
        # make sure sections are the same in range
        # TODO doesn't deal with links that span sections
        end_seg_num = anchor_oref.toSections[-1] if anchor_oref.sections[0] == anchor_oref.toSections[0] \
            else max(en_len, he_len)
        for x in range(start_seg_num, end_seg_num + 1):
            anchor_ref_dict[x] += [simple_link(link)]

    for x in range(0, max(en_len, he_len)):
        curContent = {}
        curContent["segmentNumber"] = str(x + 1)
        links = anchor_ref_dict[x + 1]
        if len(links) > 0:
            curContent["links"] = links
        if x < en_len:
            curContent["text"] = en_text[x]
        if x < he_len:
            curContent["he"] = he_text[x]
        data["content"] += [curContent]
    return data
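The anchor_ref_dict above fans each ranged link out to every segment number it covers, so the per-segment loop that follows only has to look up its own segment. A toy illustration with invented data (not the real link or simple_link format):

from collections import defaultdict

links = [{"anchorRef": "Genesis 1:3-5", "start": 3, "end": 5, "id": "link-a"},
         {"anchorRef": "Genesis 1:4", "start": 4, "end": 4, "id": "link-b"}]
anchor_ref_dict = defaultdict(list)
for link in links:
    for seg in range(link["start"], link["end"] + 1):
        anchor_ref_dict[seg].append(link["id"])

print(dict(anchor_ref_dict))  # {3: ['link-a'], 4: ['link-a', 'link-b'], 5: ['link-a']}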
def test_post_new_text(self):
    """
    Tests:
        post of index & that new index is in index/titles
        post and get of English text
        post and get of Hebrew text
        Verify that in-text ref is caught and made a link
        Verify that changing of in-text ref results in old link removed and new one added
        counts docs of both he and en
        index delete and its cascading
    """
    # Post a new Index
    index = {
        "title": "Sefer Test",
        "titleVariants": ["The Book of Test"],
        "sectionNames": ["Chapter", "Paragraph"],
        "categories": ["Musar"],
    }
    response = c.post("/api/index/Sefer_Test", {'json': json.dumps(index)})
    self.assertEqual(200, response.status_code)
    data = json.loads(response.content)
    self.assertIn("titleVariants", data)
    self.assertIn(u'Sefer Test', data["titleVariants"])

    response = c.get("/api/index/titles")
    data = json.loads(response.content)
    self.assertIn(u'Sefer Test', data["books"])

    # test the toc is updated
    toc = json.loads(c.get("/api/index").content)
    tutils.verify_title_existence_in_toc(index['title'], index['categories'])

    # Post Text (with English citation)
    text = {
        "text": "As it is written in Job 3:14, waste places.",
        "versionTitle": "The Test Edition",
        "versionSource": "www.sefaria.org",
        "language": "en",
    }
    response = c.post("/api/texts/Sefer_Test.99.99", {'json': json.dumps(text)})
    self.assertEqual(200, response.status_code)
    data = json.loads(response.content)
    self.assertTrue("error" not in data)
    # Verify one link was auto extracted
    response = c.get('/api/texts/Sefer_Test.99.99')
    self.assertEqual(200, response.status_code)
    data = json.loads(response.content)
    self.assertEqual(1, len(data["commentary"]))
    # Verify Count doc was updated
    response = c.get('/api/counts/Sefer_Test')
    self.assertEqual(200, response.status_code)
    data = json.loads(response.content)
    self.assertNotIn("error", data)
    self.assertEqual([1, 1], data["_en"]["availableCounts"])
    self.assertEqual(1, data["_en"]["availableTexts"][98][98])
    self.assertEqual(0, data["_en"]["availableTexts"][98][55])

    # Update link in the text
    text = {
        "text": "As it is written in Job 4:10, The lions may roar and growl.",
        "versionTitle": "The Test Edition",
        "versionSource": "www.sefaria.org",
        "language": "en",
    }
    response = c.post("/api/texts/Sefer_Test.99.99", {'json': json.dumps(text)})
    self.assertEqual(200, response.status_code)
    data = json.loads(response.content)
    self.assertTrue("error" not in data)
    # Verify one link was auto extracted
    response = c.get('/api/texts/Sefer_Test.99.99')
    self.assertEqual(200, response.status_code)
    data = json.loads(response.content)
    self.assertEqual(1, len(data["commentary"]))
    self.assertEqual(data["commentary"][0]["ref"], 'Job 4:10')

    # Post Text (with Hebrew citation)
    text = {
        "text": 'כדכתיב: "לא תעשה לך פסל כל תמונה" כו (דברים ה ח)',
        "versionTitle": "The Hebrew Test Edition",
        "versionSource": "www.sefaria.org",
        "language": "he",
    }
    response = c.post("/api/texts/Sefer_Test.88.88", {'json': json.dumps(text)})
    self.assertEqual(200, response.status_code)
    # Verify one link was auto extracted
    response = c.get('/api/texts/Sefer_Test.88.88')
    self.assertEqual(200, response.status_code)
    data = json.loads(response.content)
    self.assertEqual(1, len(data["commentary"]))
    # Verify count doc was updated
    response = c.get('/api/counts/Sefer_Test')
    self.assertEqual(200, response.status_code)
    data = json.loads(response.content)
    self.assertEqual([1, 1], data["_he"]["availableCounts"])
    self.assertEqual(1, data["_he"]["availableTexts"][87][87])
    self.assertEqual(0, data["_en"]["availableTexts"][87][87])

    # Delete Test Index
    textRegex = Ref('Sefer Test').regex()
    IndexSet({"title": u'Sefer Test'}).delete()

    # Make sure that index was deleted, and that delete cascaded to: versions, counts, links, cache
    # todo: notes?, reviews?
    self.assertEqual(0, IndexSet({"title": u'Sefer Test'}).count())
    self.assertEqual(0, VersionSet({"title": u'Sefer Test'}).count())
    self.assertEqual(0, VersionStateSet({"title": u'Sefer Test'}).count())
    # todo: better way to do this?
    self.assertEqual(0, LinkSet({"refs": {"$regex": textRegex}}).count())
def get_from_db(word):
    ref_results = []
    conn = get_connection()

    # GET THE METADATA STUFF
    sql = 'SELECT * from {} WHERE _id like ?'.format(WORDS_2_REF_NUMS)
    _id, blob = conn.cursor().execute(sql, (METADATA_CHUNKS_PACKETSIZE,)).fetchall()[0]
    hex_str = make_little_endian(blob)
    chunk_size = int(hex_str[0:8], 16)
    PACKET_SIZE = int(hex_str[8:16], 16)  # 3 * 8 # 3 bytes of bits * 8 bits per byte
    print((_id, blob, hex_str, len(hex_str), chunk_size, PACKET_SIZE))  # 200, 24 ... this looks correct

    sql = 'SELECT * from {} where _id like ?'.format(WORDS_2_REF_NUMS)
    cur = conn.cursor()
    cur.execute(sql, (word,))
    for word_id, blob in cur.fetchall():
        chunk_start_nums = []
        hex_str = str(blob).encode("hex")
        print(hex_str)
        for index in range(0, len(hex_str), 8):  # each hex is half a byte and 4 bytes to a JH_packet
            packet = hex_str[index:index + 8]  # get just the packet
            packet_index = int(packet[:2], 16)  # first byte is the packet_index
            # take last 3 bytes as the bitstring of 0 vs. 1 if contains `_id` keyword
            packet_bits = bin(int(packet[2:], 16))[2:].zfill(PACKET_SIZE)
            # for each of the bytes reverse the bits inside of it (make little endian)
            packet_bits = ''.join([packet_bits[i - 8:i][::-1] for i in range(8, len(packet_bits) + 1, 8)])
            for bit_index, bit in enumerate(packet_bits):
                if bit == '1':
                    chunk_start_num = (packet_index * PACKET_SIZE + bit_index) * chunk_size
                    chunk_start_nums.append(chunk_start_num)
                    print((index, packet, packet_index, packet_bits, chunk_start_num))

        title_id = -1
        for chunk_start_num in chunk_start_nums:
            # get all part names from chunk_start to chunk_start + size of the chunk
            sql = 'SELECT _rowid_, value from {} where _rowid_ BETWEEN ? AND ?'.format(REF_NUM_2_PART)
            cur.execute(sql, (chunk_start_num + 1, chunk_start_num + chunk_size + 1))
            for ref_row_id, part in cur.fetchall():
                ref_num = ref_row_id - 1
                if ref_num > title_id:
                    # get book title
                    sql = 'SELECT * from {} where _id <= ? order by `_id` desc limit 1'.format(REF_NUM_MIN_N_TITLE)
                    cur.execute(sql, (ref_num,))
                    rows = cur.fetchall()
                    title_id, title = rows[0]
                ref_str = '{}{}'.format(title, part)
                try:
                    r = Ref(ref_str)
                    result = search_in_ref(r, word)
                    if result:
                        ref_results.append(r)
                        # print result
                except InputError as e:
                    print(('ERROR parsing ref', e))
    print(('Found {} results for {}'.format(len(ref_results), word)))
    return ref_results
def not_in_cache(self, title):
    self.assertFalse(title in library._index_map)
    self.assertTrue(title not in library.full_title_list())
    self.assertTrue(title not in json.loads(library.get_text_titles_json()))
    self.assertFalse(any(key.startswith(title) for key, value in Ref._raw_cache().iteritems()))
def not_in_cache(self, title):
    self.assertFalse(any(key.startswith(title) for key, value in scache.index_cache.iteritems()))
    self.assertTrue(title not in get_text_titles())
    self.assertTrue(title not in json.loads(get_text_titles_json()))
    self.assertFalse(any(key.startswith(title) for key, value in Ref._raw_cache().iteritems()))