def get_from_db(word):
    ref_results = []
    conn = get_connection()

    # Fetch the metadata row that stores chunk_size and PACKET_SIZE
    sql = 'SELECT * from {} WHERE _id like ?'.format(WORDS_2_REF_NUMS)
    _id, blob = conn.cursor().execute(sql, (METADATA_CHUNKS_PACKETSIZE,)).fetchall()[0]
    hex_str = make_little_endian(blob)

    chunk_size = int(hex_str[0:8], 16)
    PACKET_SIZE = int(hex_str[8:16], 16) # 3 bytes of bits * 8 bits per byte = 24
    print(_id, blob, hex_str, len(hex_str), chunk_size, PACKET_SIZE) # 200, 24 ... this looks correct
    ##

    sql = 'SELECT * from {} where _id like ?'.format(WORDS_2_REF_NUMS)
    cur = conn.cursor()
    cur.execute(sql, (word,))

    for word_id, blob in cur.fetchall():
        chunk_start_nums = []
        hex_str = str(blob).encode("hex")  # Python 2; in Python 3 use bytes(blob).hex()
        print(hex_str)
        for index in range(0, len(hex_str), 8): # each hex digit is half a byte; a JH_packet is 4 bytes = 8 hex digits
            packet = hex_str[index:index+8] # get just the packet
            packet_index = int(packet[:2], 16) # first byte is the packet_index
            # take last 3 bytes as the bitstring of 0 vs. 1 if contains `_id` keyword
            packet_bits = bin(int(packet[2:], 16))[2:].zfill(PACKET_SIZE)
            # for each of the bytes reverse the bits inside of it (make little endian)
            packet_bits = ''.join([packet_bits[i - 8:i][::-1] for i in range(8, len(packet_bits) + 1, 8)])
            for bit_index, bit in enumerate(packet_bits):
                if bit == '1':
                    chunk_start_num = (packet_index * PACKET_SIZE + bit_index) * chunk_size
                    chunk_start_nums.append(chunk_start_num)
                    print(index, packet, packet_index, packet_bits, chunk_start_num)

        title_id = -1
        for chunk_start_num in chunk_start_nums:
            # get all part names from chunk_start to chunk_start + size of the chunk
            sql = 'SELECT _rowid_, value from {} where _rowid_ BETWEEN ? AND ?'.format(REF_NUM_2_PART)
            cur.execute(sql, (chunk_start_num + 1, chunk_start_num + chunk_size + 1))

            for ref_row_id, part in cur.fetchall():
                ref_num = ref_row_id - 1
                if ref_num > title_id:
                    # get book title
                    sql = 'SELECT * from {} where _id <= ? order by `_id` desc limit 1'.format(REF_NUM_MIN_N_TITLE)
                    cur.execute(sql, (ref_num,))
                    rows = cur.fetchall()
                    title_id, title = rows[0]

                ref_str = '{}, {}'.format(title, part)
                r = Ref(ref_str)
                text = str(r.text('en').text)
                if word_id in text: # the word_id should appear as is, in at least one of the texts within the chunk

                    print('found word in', r)
                    ref_results.append(r)
    return ref_results
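A note on the helper: `make_little_endian` is called above but never shown. A minimal sketch of what it plausibly does, in Python 3 syntax; the name comes from the code, but the 4-byte word grouping is an assumption inferred from the 8-hex-digit fields parsed from its result.

# Hypothetical reconstruction: flip the byte order of each 32-bit word in the
# blob, then render the result as one hex string. The 4-byte grouping matches
# the hex_str[0:8] / hex_str[8:16] fields read above.
def make_little_endian(blob):
    raw = bytes(blob)
    flipped = bytearray()
    for i in range(0, len(raw), 4):
        flipped += raw[i:i + 4][::-1]  # reverse the bytes within each word
    return flipped.hex()

# a little-endian 200 comes out as '000000c8'
assert int(make_little_endian(b'\xc8\x00\x00\x00'), 16) == 200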
def get_from_word_2_ref(word, words_2_ref_nums, ref_num_min_N_title,
                        ref_num_2_part, ref_num_2_full_name):
    ref_nums = words_2_ref_nums.get(word, [])
    print(('ref_nums', ref_nums))
    ref_strs = []
    for ref_num in ref_nums:
        for ref_num_min, temp_title in ref_num_min_N_title:
            if ref_num_min > ref_num:
                break
            title = temp_title
        full_ref_str = '{}{}'.format(title, ref_num_2_part[ref_num])
        print(('full_ref_str', ref_num, full_ref_str))

        ref_strs.append(full_ref_str)
        continue  # the verification below is disabled for now; see TODO
        #TODO: test weird part refs and make more complete and test that word always shows up in texts
        if full_ref_str != ref_num_2_full_name[ref_num]:
            print(('DIFF:', ))
            print(('try', full_ref_str))
            print(('real', ref_num_2_full_name[ref_num]))
        else:
            #print('same', full_ref_str)
            ref_strs.append(full_ref_str)

    for r in ref_strs:
        full_text = ' '.join(Ref(r).text('en').text)
        print(('text from ref search', r, word in full_text))
        # full_text.replace(word, '_____' + word + '_____')

    return ref_strs
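A toy invocation showing the shapes the four lookup structures are expected to have; all values below are made up for illustration, and the final verification loop assumes a live Sefaria environment.

# Illustrative data only; the real structures are built elsewhere in the pipeline.
words_2_ref_nums = {'waste': [0, 3]}                    # word -> ref numbers
ref_num_min_N_title = [(0, 'Job '), (2, 'Genesis ')]    # sorted (first ref_num, title) pairs
ref_num_2_part = {0: '3:14', 3: '1:1'}                  # ref_num -> section/segment part
ref_num_2_full_name = {0: 'Job 3:14', 3: 'Genesis 1:1'}

refs = get_from_word_2_ref('waste', words_2_ref_nums, ref_num_min_N_title,
                           ref_num_2_part, ref_num_2_full_name)
# -> ['Job 3:14', 'Genesis 1:1']: each ref_num takes the last title whose
#    ref_num_min does not exceed it, then appends its part string.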
Example #3
    def _single_lookup(cls, input_word, lookup_key='form', **kwargs):
        from sefaria.utils.hebrew import is_hebrew, strip_cantillation, has_cantillation
        from sefaria.model import Ref

        lookup_ref = kwargs.get("lookup_ref", None)
        wform_pkey = lookup_key
        if is_hebrew(input_word):
            input_word = strip_cantillation(input_word)
            """if not has_cantillation(input_word, detect_vowels=True):
                wform_pkey = 'c_form'"""
        query_obj = {wform_pkey: input_word}
        if lookup_ref:
            nref = Ref(lookup_ref).normal()
            query_obj["refs"] = {'$regex': '^{}'.format(nref)}
        form = WordForm().load(query_obj)
        if not form and lookup_ref:
            del query_obj["refs"]
            form = WordForm().load(query_obj)
        if form:
            result = []
            headword_query = []
            for lookup in form.lookups:
                headword_query.append({'headword': lookup['headword']})
                # TODO: if we want the 'lookups' in wf to be a dict we can pass as is to the lexiconentry, we need to change the key 'lexicon' to 'parent_lexicon' in word forms
            return headword_query
        else:
            return []
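The return value is a list of Mongo-style headword clauses, one per lookup on the matched form. A sketch of that shape and of how a caller might combine it; the $or aggregation is an assumption about the caller, not something this snippet does.

headword_query = [{'headword': 'hw-1'}, {'headword': 'hw-2'}]  # hypothetical result shape
lexicon_query = {'$or': headword_query}  # one way a caller could OR the clauses together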
Example #4
 def not_in_cache(self, title):
     self.assertFalse(title in library._index_map)
     self.assertTrue(title not in library.full_title_list())
     self.assertTrue(title not in json.loads(library.get_text_titles_json()))
     self.assertFalse(any(key.startswith(title) for key, value in Ref._raw_cache().iteritems()))
Example #5
 def __init__(self, refRangeString, start, end, name):
     """
     hosts ref range and acceptable date parameters to help with determining whether a date/ref combination meets
     criteria for following a schedule
     :param start: datetime
     :param end: datetime
     """
     self.dateRange = DateRange(name, start, end)
     self.ref = Ref(refRangeString)
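A hedged usage sketch: the owning class name `ReadingCriterion` is hypothetical, and only the constructor shown above is exercised.

from datetime import datetime

# One criterion: this ref range is acceptable during the first week of 2024.
criterion = ReadingCriterion("Genesis 1-3",         # refRangeString, parsed into self.ref
                             datetime(2024, 1, 1),  # start
                             datetime(2024, 1, 7),  # end
                             "Week 1")              # name, labels the DateRange
# A scheduler would then test a candidate date against criterion.dateRange
# and a candidate ref against criterion.ref.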
Example #6
 def test_post_to_default_node(self):
     text = {
         "text": [["BFoo", "PBar", "Dub Blitz"],["GGGlam", "BBBlam", "Ber Flam"]],
         "versionTitle": "test_default_node",
         "versionSource": "www.sefaria.org",
         "language": "en",
     }
     response = c.post("/api/texts/Chofetz_Chaim,_Part_One,_The_Prohibition_Against_Lashon_Hara,_Principle_1", {'json': json.dumps(text)})
     self.assertEqual(200, response.status_code)
     data = json.loads(response.content)
     self.assertTrue("error" not in data)
     subref = Ref("Chofetz_Chaim,_Part_One,_The_Prohibition_Against_Lashon_Hara,_Principle_1.2.3")
     assert TextChunk(subref, "en", "test_default_node").text == "Ber Flam"
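The final assertion is just nested indexing into the posted array; refs are 1-indexed while Python lists are 0-indexed, so section 2, segment 3 addresses row 1, column 2.

# "...Principle_1.2.3" -> section 2, segment 3 of the array posted above
assert text["text"][2 - 1][3 - 1] == "Ber Flam"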
Example #7
 def test_url_regex(self):
     if USE_VARNISH:
         assert v.url_regex(Ref("Exodus 15")) == r'Exodus(\\.15$|\\.15\\.)'
         assert v.url_regex(Ref("Exodus 15:15-17")) == r'Exodus(\\.15\\.15$|\\.15\\.15\\.|\\.15\\.16$|\\.15\\.16\\.|\\.15\\.17$|\\.15\\.17\\.)'
         assert v.url_regex(Ref("Yoma 14a")) == r'Yoma(\\.14a$|\\.14a\\.)'
         assert v.url_regex(Ref("Yoma 14a:12-15")) == r'Yoma(\\.14a\\.12$|\\.14a\\.12\\.|\\.14a\\.13$|\\.14a\\.13\\.|\\.14a\\.14$|\\.14a\\.14\\.|\\.14a\\.15$|\\.14a\\.15\\.)'
         assert v.url_regex(Ref("Yoma")) == r'Yoma($|\\.)'
         assert v.url_regex(Ref("Rashi on Genesis 1.1")) == r'Rashi\\_on\\_Genesis(\\.1\\.1$|\\.1\\.1\\.)'
Example #8
    def get_word_form_objects(cls, input_word, lookup_key='form', **kwargs):
        from sefaria.utils.hebrew import is_hebrew, strip_cantillation, has_cantillation
        from sefaria.model import Ref

        lookup_ref = kwargs.get("lookup_ref", None)
        wform_pkey = lookup_key
        if is_hebrew(input_word):
            input_word = strip_cantillation(input_word)
            if not has_cantillation(input_word, detect_vowels=True):
                wform_pkey = 'c_form'
        query_obj = {wform_pkey: input_word}
        if lookup_ref:
            nref = Ref(lookup_ref).normal()
            query_obj["refs"] = {'$regex': '^{}'.format(nref)}
        forms = WordFormSet(query_obj)
        if lookup_ref and len(forms) == 0:
            del query_obj["refs"]
            forms = WordFormSet(query_obj)
        return forms
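For a consonantal Hebrew input with a ref hint, the constructed query and its fallback behave as below; this is a sketch with illustrative values, and the enclosing class is elided.

forms = get_word_form_objects('ברא', lookup_ref='Genesis 1')
# no vowels or cantillation detected, so the c_form key is used; first query:
#   {'c_form': 'ברא', 'refs': {'$regex': '^Genesis 1'}}
# if no WordForm matches within that passage, 'refs' is deleted and the
# WordFormSet is rebuilt from the bare query:
#   {'c_form': 'ברא'}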
Example #9
    def get_word_form_objects(cls, input_word, lookup_key='form', **kwargs):
        from sefaria.utils.hebrew import is_hebrew, strip_cantillation, has_cantillation
        from sefaria.model import Ref

        lookup_ref = kwargs.get("lookup_ref", None)
        wform_pkey = lookup_key
        if is_hebrew(input_word):
            # This step technically used to happen in the main lookup method `lexicon_lookup`
            # when there were no initial results, but when a consonantal form is supplied in
            # the first place, doing it here optimizes queries.
            input_word = strip_cantillation(input_word)
            if not has_cantillation(input_word, detect_vowels=True):
                wform_pkey = 'c_form'
        query_obj = {wform_pkey: input_word}
        if lookup_ref:
            nref = Ref(lookup_ref).normal()
            query_obj["refs"] = {'$regex': '^{}'.format(nref)}
        forms = WordFormSet(query_obj)
        if lookup_ref and len(forms) == 0:
            del query_obj["refs"]
            forms = WordFormSet(query_obj)
        return forms
Example #10
    def section_data(self, oref: model.Ref, default_versions: dict) -> dict:
        """
        :param oref: section level Ref instance
        :param default_versions: {'en': Version, 'he': Version}
        :param prev_next: tuple, with the oref before oref and after oref (or None if this is the first/last ref)
        Returns a dictionary with all the data we care about for section level `oref`.
        """
        prev, next_ref = oref.prev_section_ref(vstate=self.version_state),\
                         oref.next_section_ref(vstate=self.version_state)

        data = {
            "ref": oref.normal(),
            "heRef": oref.he_normal(),
            "indexTitle": oref.index.title,
            "heTitle": oref.index.get_title('he'),
            "sectionRef": oref.normal(),
            "next": next_ref.normal() if next_ref else None,
            "prev": prev.normal() if prev else None,
            "content": [],
        }

        def get_version_title(chunk):
            if not chunk.is_merged:
                version = chunk.version()
                if version and version.language in default_versions and \
                        version.versionTitle != default_versions[version.language].versionTitle:
                    #print "VERSION NOT DEFAULT {} ({})".format(oref, chunk.lang)
                    # missing metadata fields simply fall back to None
                    vnotes = getattr(version, 'versionNotes', None)
                    vlicense = getattr(version, 'license', None)
                    vsource = getattr(version, 'versionSource', None)
                    vnotesInHebrew = getattr(version, 'versionNotesInHebrew', None)
                    versionTitleInHebrew = getattr(version, 'versionTitleInHebrew', None)

                    return version.versionTitle, vnotes, vlicense, vsource, versionTitleInHebrew, vnotesInHebrew
                else:
                    return None, None, None, None, None, None  # default version
            else:
                #merged
                #print "MERGED SECTION {} ({})".format(oref, chunk.lang)
                all_versions = set(chunk.sources)
                merged_version = 'Merged from {}'.format(
                    ', '.join(all_versions))
                return merged_version, None, None, None, None, None

        node_title = oref.index_node.full_title()
        en_chunk = self._text_map[node_title]['en_chunk']
        he_chunk = self._text_map[node_title]['he_chunk']
        en_vtitle, en_vnotes, en_vlicense, en_vsource, en_vtitle_he, en_vnotes_he = get_version_title(en_chunk)
        he_vtitle, he_vnotes, he_vlicense, he_vsource, he_vtitle_he, he_vnotes_he = get_version_title(he_chunk)

        if en_vtitle:
            data['versionTitle'] = en_vtitle
        if he_vtitle:
            data['heVersionTitle'] = he_vtitle
        if en_vnotes:
            data['versionNotes'] = en_vnotes
        if he_vnotes:
            data['heVersionNotes'] = he_vnotes
        if en_vlicense:
            data['license'] = en_vlicense
        if he_vlicense:
            data['heLicense'] = he_vlicense
        if en_vsource:
            data['versionSource'] = en_vsource
        if he_vsource:
            data['heVersionSource'] = he_vsource
        if en_vtitle_he:
            data['versionTitleInHebrew'] = en_vtitle_he
        if he_vtitle_he:
            data['heVersionTitleInHebrew'] = he_vtitle_he
        if en_vnotes_he:
            data['versionNotesInHebrew'] = en_vnotes_he
        if he_vnotes_he:
            data['heVersionNotesInHebrew'] = he_vnotes_he

        try:
            en_text = self._text_map[node_title]['en_ja'].get_element([j - 1 for j in oref.sections])
        except IndexError:
            en_text = []
        try:
            he_text = self._text_map[node_title]['he_ja'].get_element([j - 1 for j in oref.sections])
        except IndexError:
            he_text = []

        en_len = len(en_text)
        he_len = len(he_text)
        section_links = get_links(oref.normal(), False)
        anchor_ref_dict = defaultdict(list)
        for link in section_links:
            anchor_oref = model.Ref(link["anchorRef"])
            if not anchor_oref.is_segment_level() or len(anchor_oref.sections) == 0:
                continue  # don't bother with section level links
            start_seg_num = anchor_oref.sections[-1]
            # make sure sections are the same in range
            # TODO doesn't deal with links that span sections
            end_seg_num = anchor_oref.toSections[-1] if anchor_oref.sections[0] == anchor_oref.toSections[0] else max(en_len, he_len)
            for x in range(start_seg_num, end_seg_num + 1):
                anchor_ref_dict[x] += [simple_link(link)]
        for x in range(0, max(en_len, he_len)):
            curContent = {}
            curContent["segmentNumber"] = str(x + 1)
            links = anchor_ref_dict[x + 1]
            if len(links) > 0:
                curContent["links"] = links

            if x < en_len:
                curContent["text"] = en_text[x]
            if x < he_len:
                curContent["he"] = he_text[x]

            data["content"] += [curContent]

        return data
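The link bucketing above expands each ranged anchorRef over its segment numbers. A self-contained sketch of just that step, with plain dicts standing in for Ref objects and link ids standing in for simple_link output:

from collections import defaultdict

links = [
    {'id': 'commentA', 'sections': [1, 3], 'toSections': [1, 5]},  # like "Genesis 1:3-5"
    {'id': 'commentB', 'sections': [1, 7], 'toSections': [1, 7]},  # like "Genesis 1:7"
]
section_len = 8  # stands in for max(en_len, he_len) above
anchor_ref_dict = defaultdict(list)
for link in links:
    start = link['sections'][-1]
    # a range within one section keeps its own end; a cross-section range is
    # clamped to the end of this section, mirroring the code above
    end = link['toSections'][-1] if link['sections'][0] == link['toSections'][0] else section_len
    for seg in range(start, end + 1):
        anchor_ref_dict[seg].append(link['id'])

assert anchor_ref_dict[4] == ['commentA']  # segment inside the 3-5 range
assert anchor_ref_dict[7] == ['commentB']  # single-segment link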
Example #11
    def test_post_new_text(self):
        """
        Tests:
            post of index & that new index is in index/titles
            post and get of English text
            post and get of Hebrew text
            Verify that in-text ref is caught and made a link
            Verify that changing of in-text ref results in old link removed and new one added
            counts docs of both he and en
            index delete and its cascading
        """
        # Post a new Index
        index = {
            "title": "Sefer Test",
            "titleVariants": ["The Book of Test"],
            "sectionNames": ["Chapter", "Paragraph"],
            "categories": ["Musar"],
        }
        response = c.post("/api/index/Sefer_Test", {'json': json.dumps(index)})
        self.assertEqual(200, response.status_code)
        data = json.loads(response.content)
        self.assertIn("titleVariants", data)
        self.assertIn(u'Sefer Test', data["titleVariants"])

        response = c.get("/api/index/titles")
        data = json.loads(response.content)
        self.assertIn(u'Sefer Test', data["books"])

        #test the toc is updated
        toc = json.loads(c.get("/api/index").content)
        tutils.verify_title_existence_in_toc(index['title'],
                                             index['categories'])

        # Post Text (with English citation)
        text = {
            "text": "As it is written in Job 3:14, waste places.",
            "versionTitle": "The Test Edition",
            "versionSource": "www.sefaria.org",
            "language": "en",
        }
        response = c.post("/api/texts/Sefer_Test.99.99",
                          {'json': json.dumps(text)})
        self.assertEqual(200, response.status_code)
        data = json.loads(response.content)
        self.assertTrue("error" not in data)
        # Verify one link was auto extracted
        response = c.get('/api/texts/Sefer_Test.99.99')
        self.assertEqual(200, response.status_code)
        data = json.loads(response.content)
        self.assertEqual(1, len(data["commentary"]))
        # Verify Count doc was updated
        response = c.get('/api/counts/Sefer_Test')
        self.assertEqual(200, response.status_code)
        data = json.loads(response.content)
        self.assertNotIn("error", data)
        self.assertEqual([1, 1], data["_en"]["availableCounts"])
        self.assertEqual(1, data["_en"]["availableTexts"][98][98])
        self.assertEqual(0, data["_en"]["availableTexts"][98][55])

        # Update link in the text
        text = {
            "text":
            "As it is written in Job 4:10, The lions may roar and growl.",
            "versionTitle": "The Test Edition",
            "versionSource": "www.sefaria.org",
            "language": "en",
        }
        response = c.post("/api/texts/Sefer_Test.99.99",
                          {'json': json.dumps(text)})
        self.assertEqual(200, response.status_code)
        data = json.loads(response.content)
        self.assertTrue("error" not in data)
        # Verify one link was auto extracted
        response = c.get('/api/texts/Sefer_Test.99.99')
        self.assertEqual(200, response.status_code)
        data = json.loads(response.content)
        self.assertEqual(1, len(data["commentary"]))
        self.assertEqual(data["commentary"][0]["ref"], 'Job 4:10')

        # Post Text (with Hebrew citation)
        text = {
            "text": 'כדכתיב: "לא תעשה לך פסל כל תמונה" כו (דברים ה ח)',
            "versionTitle": "The Hebrew Test Edition",
            "versionSource": "www.sefaria.org",
            "language": "he",
        }
        response = c.post("/api/texts/Sefer_Test.88.88",
                          {'json': json.dumps(text)})
        self.assertEqual(200, response.status_code)
        # Verify one link was auto extracted
        response = c.get('/api/texts/Sefer_Test.88.88')
        self.assertEqual(200, response.status_code)
        data = json.loads(response.content)
        self.assertEqual(1, len(data["commentary"]))
        # Verify count doc was updated
        response = c.get('/api/counts/Sefer_Test')
        self.assertEqual(200, response.status_code)
        data = json.loads(response.content)
        self.assertEqual([1, 1], data["_he"]["availableCounts"])
        self.assertEqual(1, data["_he"]["availableTexts"][87][87])
        self.assertEqual(0, data["_en"]["availableTexts"][87][87])

        # Delete Test Index
        textRegex = Ref('Sefer Test').regex()
        IndexSet({"title": u'Sefer Test'}).delete()

        #Make sure that index was deleted, and that delete cascaded to: versions, counts, links, cache,
        #todo: notes?, reviews?
        self.assertEqual(0, IndexSet({"title": u'Sefer Test'}).count())
        self.assertEqual(0, VersionSet({"title": u'Sefer Test'}).count())
        self.assertEqual(0, VersionStateSet({"title": u'Sefer Test'}).count())
        #todo: better way to do this?
        self.assertEqual(0, LinkSet({"refs": {"$regex": textRegex}}).count())
Example #12
def get_from_db(word):
    ref_results = []
    conn = get_connection()

    # Fetch the metadata row that stores chunk_size and PACKET_SIZE
    sql = 'SELECT * from {} WHERE _id like ?'.format(WORDS_2_REF_NUMS)
    _id, blob = conn.cursor().execute(sql, (METADATA_CHUNKS_PACKETSIZE,)).fetchall()[0]
    hex_str = make_little_endian(blob)

    chunk_size = int(hex_str[0:8], 16)
    PACKET_SIZE = int(hex_str[8:16], 16)  # 3 bytes of bits * 8 bits per byte = 24
    print((_id, blob, hex_str, len(hex_str), chunk_size, PACKET_SIZE))  # 200, 24 ... this looks correct
    ##

    sql = 'SELECT * from {} where _id like ?'.format(WORDS_2_REF_NUMS)
    cur = conn.cursor()
    cur.execute(sql, (word,))

    for word_id, blob in cur.fetchall():
        chunk_start_nums = []
        hex_str = str(blob).encode("hex")  # Python 2; in Python 3 use bytes(blob).hex()
        print(hex_str)
        for index in range(0, len(hex_str), 8):  # each hex digit is half a byte; a JH_packet is 4 bytes = 8 hex digits
            packet = hex_str[index:index + 8]  # get just the packet
            packet_index = int(packet[:2], 16)  # first byte is the packet_index
            # take last 3 bytes as the bitstring of 0 vs. 1 if contains `_id` keyword
            packet_bits = bin(int(packet[2:], 16))[2:].zfill(PACKET_SIZE)
            # for each of the bytes reverse the bits inside of it (make little endian)
            packet_bits = ''.join(packet_bits[i - 8:i][::-1] for i in range(8, len(packet_bits) + 1, 8))
            for bit_index, bit in enumerate(packet_bits):
                if bit == '1':
                    chunk_start_num = (packet_index * PACKET_SIZE + bit_index) * chunk_size
                    chunk_start_nums.append(chunk_start_num)
                    print((index, packet, packet_index, packet_bits, chunk_start_num))

        title_id = -1
        for chunk_start_num in chunk_start_nums:
            # get all part names from chunk_start to chunk_start + size of the chunk
            sql = 'SELECT _rowid_, value from {} where _rowid_ BETWEEN ? AND ?'.format(REF_NUM_2_PART)
            cur.execute(sql, (chunk_start_num + 1, chunk_start_num + chunk_size + 1))

            for ref_row_id, part in cur.fetchall():
                ref_num = ref_row_id - 1
                if ref_num > title_id:
                    # get book title
                    sql = 'SELECT * from {} where _id <= ? order by `_id` desc limit 1'.format(REF_NUM_MIN_N_TITLE)
                    cur.execute(sql, (ref_num,))
                    rows = cur.fetchall()
                    title_id, title = rows[0]

                ref_str = '{}{}'.format(title, part)
                try:
                    r = Ref(ref_str)
                    result = search_in_ref(r, word)
                    if result:
                        ref_results.append(r)
                        # print result
                except InputError as e:
                    print(('ERROR parsing ref', e))
    print(('Found {} results for {}'.format(len(ref_results), word)))
    return ref_results
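The bitmap decode shared by both versions can be sanity-checked in isolation. A minimal sketch using the metadata values the debug print reports (chunk_size 200, PACKET_SIZE 24):

PACKET_SIZE, chunk_size = 24, 200

packet = '0201000a'  # packet_index byte 0x02, then three payload bytes 01 00 0a
packet_index = int(packet[:2], 16)                      # -> 2
bits = bin(int(packet[2:], 16))[2:].zfill(PACKET_SIZE)  # '000000010000000000001010'
bits = ''.join(bits[i - 8:i][::-1] for i in range(8, len(bits) + 1, 8))  # per-byte reversal
starts = [(packet_index * PACKET_SIZE + i) * chunk_size
          for i, b in enumerate(bits) if b == '1']
assert starts == [9600, 13000, 13400]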
Example #14
 def not_in_cache(self, title):
     self.assertFalse(any(key.startswith(title) for key, value in scache.index_cache.iteritems()))
     self.assertTrue(title not in get_text_titles())
     self.assertTrue(title not in json.loads(get_text_titles_json()))
     self.assertFalse(any(key.startswith(title) for key, value in Ref._raw_cache().iteritems()))