def get_refs(self, node):
    assert isinstance(node, SchemaNode)
    if node.is_leaf():
        nodes = [node]
    else:
        nodes = node.get_leaf_nodes()
    for leaf_node in nodes:
        assert isinstance(leaf_node, JaggedArrayNode)
        if leaf_node.sharedTitle is not None:
            term = Term().load({'name': leaf_node.sharedTitle})
            self.base_refs.append(Ref(term.ref))
            self.mei_refs.append(leaf_node.ref())
        else:
            for subref in leaf_node.ref().all_subrefs():
                assert isinstance(subref, Ref)
                if not subref.is_section_level():  # Don't bother trying to match depth 1 texts
                    break
                if subref.is_empty():
                    continue
                base_book = leaf_node.primary_title('en')
                base_chapter = subref.sections[0]
                self.base_refs.append(Ref("{} {}".format(base_book, base_chapter)))
                self.mei_refs.append(subref)
def _normalize(self):
    # Derived values - used to make downstream queries quicker
    self.datetime = datetime.utcfromtimestamp(self.time_stamp)
    try:
        r = Ref(self.ref)
        self.context_refs = [cr.normal() for cr in r.all_context_refs()]
        self.categories = r.index.categories
        self.authors = getattr(r.index, "authors", [])
        self.is_sheet = r.index.title == "Sheet"
        if self.is_sheet:
            self.sheet_id = r.sections[0]
        if not self.secondary and not self.is_sheet and getattr(self, "language", None) != "hebrew" and r.is_empty("en"):
            # Logically this belongs on the frontend, but it's easier to set here.
            self.language = "hebrew"
    except SheetNotFoundError:
        self.context_refs = [self.ref]
        self.categories = ["_unlisted"]
        self.authors = []
        self.is_sheet = True
    except InputError:  # Ref failed to resolve
        self.context_refs = [self.ref]
        self.categories = []
        self.authors = []
        self.is_sheet = False
    except KeyError:  # is_text_translated() stumbled on a bad version state
        pass
def test_commentary(self):
    s = "Here's one with Rashi on Genesis 2:5:3"
    s2 = "Here's one with both Rashi on Genesis 3:4 and Exodus 5:2. yeah"
    s3 = "Here's one with Genesis 2:3"
    assert library.get_refs_in_string(s, "en") == [Ref("Rashi on Genesis 2:5:3")]
    assert library.get_refs_in_string(s2, "en") == [Ref("Rashi on Genesis 3:4"), Ref("Exodus 5:2")]
    assert library.get_refs_in_string(s3, "en") == [Ref("Genesis 2:3")]
def test_two_single_quotes(self):
    ref = library.get_refs_in_string(u"עין ממש דכתיב (במדבר ל''ה) ולא תקחו")
    assert 1 == len(ref)
    assert ref[0] == Ref(u"במדבר ל''ה")

    ref = library.get_refs_in_string(u"דאמר קרא (שופטים כ י''א) ויאסף כל איש")
    assert 1 == len(ref)
    assert ref[0] == Ref(u"שופטים כ י''א")
def set_recent_item(self, tref):
    """
    Save `tref` as a recently viewed text at the front of the list.
    Removes any previous location for that text.
    Not used yet, need to consider if it's better to store derivable
    information (ref->heRef) or reprocess it often.
    """
    oref = Ref(tref)
    recent = [t for t in self.recent if Ref(t).index.title != oref.index.title]
    self.recent = [tref] + recent
    self.save()
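# Usage sketch for set_recent_item. `profile` is a hypothetical object that
# carries the `recent` list and the `save()` method the method above relies on.
profile.recent = ["Genesis 1", "Exodus 2"]
profile.set_recent_item("Genesis 3")  # drops the stale Genesis entry, prepends the new one
assert profile.recent == ["Genesis 3", "Exodus 2"]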
def test_sefer_mitzvot(self):
    ref = library.get_refs_in_string(texts['neg327'])
    assert 4 == len(ref)
    assert {Ref(u'ויקרא טז,כט'), Ref(u'ויקרא כג,כח'), Ref(u'ויקרא כג,לא'), Ref(u'במדבר כט,ז')} == set(ref)
def set_version_by_category(self, book_name):
    book_ref = Ref(book_name)
    if book_ref.is_tanach():
        self.version_map[book_name] = 'Tanach with Text Only'
    elif book_ref.is_bavli():
        self.version_map[book_name] = 'Wikisource Talmud Bavli'
    else:
        raise AttributeError('{} does not match a default category, use set_version_by_book instead'.format(book_name))
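# Usage sketch for set_version_by_category. `loader` is a hypothetical instance
# of the class that owns `version_map`; the titles below are assumed to resolve
# to a Tanach book and a Bavli tractate respectively.
loader.set_version_by_category('Genesis')
assert loader.version_map['Genesis'] == 'Tanach with Text Only'
loader.set_version_by_category('Berakhot')
assert loader.version_map['Berakhot'] == 'Wikisource Talmud Bavli'
# Anything outside those two categories raises AttributeError, pointing the
# caller at set_version_by_book instead.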
def process_index_title_change_in_notes(indx, **kwargs):
    print("Cascading Notes {} to {}".format(kwargs['old'], kwargs['new']))
    pattern = Ref(indx.title).regex()
    pattern = pattern.replace(re.escape(indx.title), re.escape(kwargs["old"]))
    notes = NoteSet({"ref": {"$regex": pattern}})
    for n in notes:
        try:
            n.ref = n.ref.replace(kwargs["old"], kwargs["new"], 1)
            n.save()
        except Exception:
            logger.warning("Deleting note that failed to save: {}".format(n.ref))
            n.delete()
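# A minimal sketch of the pattern-swap trick above, on hypothetical titles. By
# the time the cascade runs, the index already carries the *new* title, so
# Ref(indx.title).regex() matches the new refs; substituting the escaped old
# title back into that pattern yields a regex matching the old refs still
# stored in the notes collection.
import re

new_title, old_title = "Rashi on Genesis", "Rashi on Bereshit"  # hypothetical rename
pattern = "^" + re.escape(new_title)  # stand-in for the real Ref(...).regex() output
old_pattern = pattern.replace(re.escape(new_title), re.escape(old_title))
assert re.match(old_pattern, "Rashi on Bereshit 1:1")
assert not re.match(old_pattern, "Rashi on Genesis 1:1")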
def check_chapters():
    cards = get_cards()
    good_files, bad_files = [], []
    for card in cards:
        m_ref = Ref(card.replace('Rambam ', ''))
        with codecs.open('{}.txt'.format(card), 'r', 'utf-8') as infile:
            tester = TagTester(u'@00', infile, u'@00\u05e4\u05e8\u05e7')
            tags = tester.grab_each_header()
        if len(tags) == len(m_ref.all_subrefs()) or card == 'Rambam Pirkei Avot':
            good_files.append(card)
        else:
            bad_files.append(card)
    return {'good': good_files, 'bad': bad_files}
def test_add_expanded_refs(self, topic_graph):
    attrs = {'ref': 'Genesis 1:1', 'toTopic': '6', 'linkType': 'about', 'dataSource': 'sefaria'}
    l = RefTopicLink(attrs)
    l.save()
    assert getattr(l, 'class') == 'refTopic'
    assert l.expandedRefs == ['Genesis 1:1']
    l.delete()

    attrs = {'ref': 'Genesis 1:1-3', 'toTopic': '6', 'linkType': 'about', 'dataSource': 'sefaria'}
    l = RefTopicLink(attrs)
    l.save()
    assert l.expandedRefs == ['Genesis 1:1', 'Genesis 1:2', 'Genesis 1:3']
    l.delete()

    attrs = {'ref': 'Genesis 1-2', 'toTopic': '6', 'linkType': 'about', 'dataSource': 'sefaria'}
    l = RefTopicLink(attrs)
    l.save()
    test_refs = [r.normal() for r in Ref('Genesis 1-2').all_segment_refs()]
    assert l.expandedRefs == test_refs
    l.delete()
def test_inner_parenthesis(self):
    ref = library.get_refs_in_string(u"Bereishit Rabbah (55:7)", "en")
    assert 1 == len(ref)
    assert ref[0] == Ref(u'Bereshit Rabbah 55:7')
    # Ranges not yet supported
def process_index_title_change_in_manuscript_links(indx, **kwargs):
    from sefaria.system.exceptions import InputError
    print("Cascading ManuscriptPage from {} to {}".format(kwargs['old'], kwargs['new']))
    # ensure that the regex library we're using here is the same regex library being used in `Ref.regex`
    from .text import re as reg_reg
    patterns = [pattern.replace(reg_reg.escape(indx.title), reg_reg.escape(kwargs["old"]))
                for pattern in Ref(indx.title).regex(as_list=True)]
    queries = [{'expanded_refs': {'$regex': pattern}} for pattern in patterns]
    objs = ManuscriptPageSet({"$or": queries})
    for o in objs:
        o.contained_refs = [r.replace(kwargs["old"], kwargs["new"], 1) if reg_reg.search('|'.join(patterns), r) else r
                            for r in o.contained_refs]
        o.expanded_refs = [r.replace(kwargs["old"], kwargs["new"], 1) if reg_reg.search('|'.join(patterns), r) else r
                           for r in o.expanded_refs]
        try:
            o.save()
        except InputError:
            logger.warning("Failed to convert ref data from: {} to {}".format(kwargs['old'], kwargs['new']))
def _normalize(self):
    # Derived values - used to make downstream queries quicker
    self.datetime = datetime.utcfromtimestamp(self.time_stamp)
    try:
        r = Ref(self.ref)
        self.context_refs = [cr.normal() for cr in r.all_context_refs()]
        self.categories = r.index.categories
        self.authors = getattr(r.index, "authors", [])
        self.is_sheet = r.index.title == "Sheet"
        if self.is_sheet:
            self.sheet_id = r.sections[0]
    except InputError:  # Ref failed to resolve
        self.context_refs = [self.ref]
        self.categories = []
        self.authors = []
        self.is_sheet = False
@pytest.fixture
def topic_graph_to_merge():
    isa_links = [
        (10, 20),
        (20, 30),
        (20, 40),
        (40, 50),
        (60, 50),
    ]
    trefs = [r.normal() for r in Ref('Genesis 1:1-10').range_list()]
    trefs1 = [r.normal() for r in Ref('Exodus 1:1-10').range_list()]
    trefs2 = [r.normal() for r in Ref('Leviticus 1:1-10').range_list()]
    graph = {
        'topics': {str(i): make_topic(str(i)) for i in range(10, 100, 10)},
        'links': [make_it_link(str(a), str(b), 'is-a') for a, b in isa_links] +
                 [make_rt_link('10', r) for r in trefs] +
                 [make_rt_link('20', r) for r in trefs1] +
                 [make_rt_link('40', r) for r in trefs2]
    }
    db.sheets.insert_one({
        "id": 1234567890,
        "topics": [
            {"slug": '20', 'asTyped': 'twenty'},
            {"slug": '40', 'asTyped': '4d'},
            {"slug": '20', 'asTyped': 'twent-e'},
            {"slug": '30', 'asTyped': 'thirty'},
        ]
    })
    yield graph
    for k, v in graph['topics'].items():
        v.delete()
    for v in graph['links']:
        v.delete()
    db.sheets.delete_one({"id": 1234567890})
@classmethod
def save_history_item(cls, uid, hist, time_stamp=None):
    if time_stamp is None:
        time_stamp = epoch_time()
    hist["uid"] = uid
    if "he_ref" not in hist or "book" not in hist:
        oref = Ref(hist["ref"])
        hist["he_ref"] = oref.he_normal()
        hist["book"] = oref.index.title
    # DEBUG: helpful to include this field for debugging
    hist["server_time_stamp"] = time_stamp if "server_time_stamp" not in hist else hist["server_time_stamp"]
    action = hist.pop("action", None)
    saved = True if action == "add_saved" else (False if action == "delete_saved" else hist.get("saved", False))
    uh = UserHistory(hist,
                     load_existing=(action is not None),
                     update_last_place=(action is None),
                     field_updates={
                         "saved": saved,
                         "server_time_stamp": hist["server_time_stamp"]
                     })
    uh.save()
    return uh
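# Usage sketch for save_history_item. The `action` key drives the saved toggle:
# "add_saved" marks the item saved, "delete_saved" unmarks it, and omitting it
# records an ordinary reading event. The uid and hist values are placeholders.
hist = {"ref": "Genesis 1:1", "versions": {}, "time_stamp": epoch_time(), "action": "add_saved"}
uh = UserHistory.save_history_item(uid=12345, hist=hist)
assert uh.saved is True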
def filter_invalid_sources(self):
    """
    Remove any sources that don't validate
    """
    sources = []
    for source in self.sources:
        try:
            sources.append((Ref(source[0]).normal(), source[1]))
        except Exception:  # Ref failed to resolve
            pass
    self.sources = sources
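# Sketch of filter_invalid_sources. `layer` is a hypothetical object with a
# `sources` list of (tref, data) pairs; entries whose ref fails to parse are
# silently dropped, and the rest are normalized.
layer.sources = [("genesis 1:1", "a note"), ("Not A Real Book 3:2", "junk")]
layer.filter_invalid_sources()
assert layer.sources == [("Genesis 1:1", "a note")]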
def notes(self, tref=None):
    """
    Returns notes for this layer, optionally filtered by notes on ref.
    """
    query = {"_id": {"$in": self.note_ids}}
    if tref:
        query["ref"] = {"$regex": Ref(tref).section_ref().regex()}
    return list(NoteSet(query=query))
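# Sketch of the notes filter. `layer` is a hypothetical Layer instance holding
# `note_ids`. Passing a segment-level tref filters by its *section* ref, so
# "Genesis 1:4" returns every note anchored anywhere in Genesis 1.
all_notes = layer.notes()
chapter_notes = layer.notes("Genesis 1:4")
assert len(chapter_notes) <= len(all_notes)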
@classmethod
def load_set_for_client(cls, tref: str):
    """
    Returns a list of results that can be serialized to JSON, rather than
    Sefaria MongoSet instances, with each manuscript page joined to its
    parent manuscript record.
    :param tref: textual ref to look up
    :return: list of dicts
    """
    try:
        oref = Ref(tref)
    except InputError:
        return []
    segment_refs = [r.normal() for r in oref.all_segment_refs()]
    results, manuscripts = [], {}
    documents = cls.load_by_ref(oref)
    for document in documents:
        contained_refs, expanded = document.contained_refs, document.expanded_refs
        anchor_ref_list, anchor_ref_expanded_list = oref.get_all_anchor_refs(segment_refs, contained_refs, expanded)
        for anchor_ref, anchor_ref_expanded in zip(anchor_ref_list, anchor_ref_expanded_list):
            contents = document.contents()
            contents["anchorRef"] = anchor_ref.normal()
            contents["anchorRefExpanded"] = [r.normal() for r in anchor_ref_expanded]
            del contents['contained_refs']
            del contents['expanded_refs']
            if document.manuscript_slug in manuscripts:
                manuscript = manuscripts[document.manuscript_slug]
            else:
                manuscript = Manuscript().load({'slug': document.manuscript_slug})
                manuscripts[manuscript.slug] = manuscript
            contents['manuscript'] = manuscript.contents()
            results.append(contents)
    return results
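# Usage sketch, assuming load_set_for_client is a classmethod on ManuscriptPage
# (as the surrounding methods suggest). An unresolvable tref returns [] rather
# than raising; the dict keys shown are the ones set in the method above.
records = ManuscriptPage.load_set_for_client("Genesis 1:1-5")
for rec in records:
    print(rec["anchorRef"], rec["anchorRefExpanded"], rec["manuscript"])
assert ManuscriptPage.load_set_for_client("Not A Real Book 1") == []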
def add_ref(self, tref):
    try:
        new_oref = Ref(tref)
    except InputError as e:
        raise ManuscriptError(e)
    for oref in self.get_ref_objects():
        if oref.overlaps(new_oref):
            raise ManuscriptError(f'Overlap between contained refs {oref} and {new_oref}')
    self.contained_refs.append(tref)
    self.expanded_refs.extend(self.get_expanded_refs_for_source(new_oref))
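# Sketch of the overlap guard in add_ref. `page` is a hypothetical
# ManuscriptPage; a ref overlapping an already-contained ref is rejected.
page.add_ref("Genesis 1:1-5")
page.add_ref("Genesis 2")  # fine: no overlap with Genesis 1:1-5
try:
    page.add_ref("Genesis 1:3")  # overlaps Genesis 1:1-5
except ManuscriptError as e:
    print(e)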
def contents(self, **kwargs):
    from sefaria.sheets import get_sheet_listing_data
    d = super(UserHistory, self).contents(**kwargs)
    if kwargs.get("for_api", False):
        keys = {
            'ref': '',
            'he_ref': '',
            'book': '',
            'versions': {},
            'time_stamp': 0,
            'saved': False,
            'delete_saved': False,
            'is_sheet': False,
            'sheet_id': -1,
            'sheet_owner': '',
            'sheet_title': '',
        }
        d = {key: d.get(key, default) for key, default in keys.items()}
    if kwargs.get("annotate", False):
        try:
            ref = Ref(d["ref"])
            if ref.is_sheet():
                d.update(get_sheet_listing_data(d["sheet_id"]))
            else:
                d["text"] = {
                    "en": TextChunk(ref, "en").as_sized_string(),
                    "he": TextChunk(ref, "he").as_sized_string()
                }
        except Exception:
            logger.warning("Failed to retrieve text for history Ref: {}".format(d['ref']))
    return d
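# Sketch of the for_api trimming above: only the whitelisted keys survive, and
# missing ones are filled with their defaults. `uh` is a hypothetical
# UserHistory instance.
d = uh.contents(for_api=True)
assert set(d) == {'ref', 'he_ref', 'book', 'versions', 'time_stamp', 'saved',
                  'delete_saved', 'is_sheet', 'sheet_id', 'sheet_owner', 'sheet_title'}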
def process_index_title_change_in_user_history(indx, **kwargs):
    print("Cascading User History from {} to {}".format(kwargs['old'], kwargs['new']))
    # ensure that the regex library we're using here is the same regex library being used in `Ref.regex`
    from text import re as reg_reg
    patterns = [pattern.replace(reg_reg.escape(indx.title), reg_reg.escape(kwargs["old"]))
                for pattern in Ref(indx.title).regex(as_list=True)]
    queries = [{'ref': {'$regex': pattern}} for pattern in patterns]
    objs = UserHistorySet({"$or": queries})
    for o in objs:
        o.ref = o.ref.replace(kwargs["old"], kwargs["new"], 1)
        try:
            o.save()
        except InputError:
            logger.warning("Failed to convert user history from: {} to {}".format(kwargs['old'], kwargs['new']))
def test_link_set(self, topic_graph):
    ts = topic_graph['topics']
    ls = ts['1'].link_set(_class='intraTopic')
    assert list(ls)[0].topic == '2'
    assert ls.count() == 1

    ls = ts['4'].link_set(_class='intraTopic')
    assert {l.topic for l in ls} == {'2', '5'}

    trefs = {r.normal() for r in Ref('Genesis 1:1-10').range_list()}
    ls = ts['1'].link_set(_class='refTopic')
    assert {l.ref for l in ls} == trefs

    ls = ts['1'].link_set(_class=None)
    assert {getattr(l, 'ref', getattr(l, 'topic', None)) for l in ls} == (trefs | {'2'})
def test(book):
    qa_issues = open('Ibn Ezra on {} misalignments.txt'.format(book), 'w')
    levi = parse(file_data[book])
    vtitle = 'Devarim' if book == 'Deuteronomy' else book
    torat_emet = Ref("Ibn Ezra on {}".format(book)).text('he', 'Ibn Ezra on {} -- Torat Emet'.format(vtitle)).ja().array()
    count = 0
    for c_index, (my_chapter, their_chapter) in enumerate(zip(levi, torat_emet)):
        for v_index, (my_verse, their_verse) in enumerate(zip(my_chapter, their_chapter)):
            if len(my_verse) != len(their_verse):
                qa_issues.write('issue found at {}:{}\n'.format(c_index + 1, v_index + 1))
                count += 1
        if len(my_chapter) != len(their_chapter):
            by_length = sorted((my_chapter, their_chapter), key=len)
            for i in range(len(by_length[0]), len(by_length[1])):
                qa_issues.write('issue found at {}:{}\n'.format(c_index + 1, i + 1))
                count += 1
    qa_issues.close()
    print('{} issues found'.format(count))
    ja_to_xml(levi, ['Chapter', 'Verse', 'Comment'])
def _validate(self):
    super(ManuscriptPage, self)._validate()
    # check that the manuscript this page is part of exists in the database
    if self.get_manuscript() is None:
        raise ManuscriptError("Manuscript missing in database")
    for tref in self.contained_refs:
        if not Ref.is_ref(tref):
            raise ManuscriptError(f'{tref} is not a valid Ref')
    # pairwise overlap check: pop one ref at a time and compare to the rest
    test_refs = self.get_ref_objects()
    while test_refs:
        current_ref = test_refs.pop()
        for tr in test_refs:
            if current_ref.overlaps(tr):
                raise ManuscriptError(f'Overlap between contained refs {tr} and {current_ref}')
def xformer(recent):
    try:
        return {
            "uid": uid,
            "ref": recent[0],
            "he_ref": recent[1],
            "book": Ref(recent[0]).index.title,
            "last_place": True,
            "time_stamp": epoch_time(parser.parse(recent[2]).replace(tzinfo=None)) if recent[2] is not None else default_epoch_time,
            "server_time_stamp": epoch_time(parser.parse(recent[2]).replace(tzinfo=None)) if recent[2] is not None else default_epoch_time,
            # we don't really know how long they've read this book; it's probably
            # correlated with the number of times they opened it
            "num_times_read": recent[3] if recent[3] and isinstance(recent[3], int) else 1,
            "versions": {
                "en": recent[4],
                "he": recent[5]
            }
        }
    except (InputError, ValueError, IndexError, AttributeError):
        return None
def parse_and_upload():
    cards = get_cards()
    links = []
    for card in cards:
        node = JaggedArrayNode()
        node.add_title(card, 'en', primary=True)
        node.add_title(u'רמב"ם ' + Ref(card.replace('Rambam ', '')).he_normal(), 'he', primary=True)
        node.key = card
        node.depth = 3
        node.addressTypes = ['Integer', 'Integer', 'Integer']
        node.sectionNames = ['Chapter', 'Mishnah', 'Comment']
        node.validate()
        node.toc_zoom = 2
        index = {
            'title': card,
            'categories': ['Commentary2', 'Mishnah', 'Rambam'],
            'schema': node.serialize(),
        }
        parsed = parser(card)
        links.extend(parsed['links'])
        version = {
            'versionTitle': u'Vilna Edition',
            'versionSource': 'http://primo.nli.org.il/primo_library/libweb/action/dlDisplay.do?vid=NLI&docId=NNL_ALEPH001300957',
            'language': 'he',
            'text': parsed['parsed']
        }
        print('posting {}'.format(card))
        post_index(index)
        post_text(card, version, index_count='on')
    post_link(links)
@pytest.fixture
def topic_graph():
    isa_links = [
        (1, 2),
        (2, 3),
        (2, 4),
        (4, 5),
        (6, 5),
    ]
    trefs = [r.normal() for r in Ref('Genesis 1:1-10').range_list()]
    for a, b in isa_links:
        clean_links(str(a))
        clean_links(str(b))
    graph = {
        'topics': {str(i): make_topic(str(i)) for i in range(1, 10)},
        'links': [make_it_link(str(a), str(b), 'is-a') for a, b in isa_links] +
                 [make_rt_link('1', r) for r in trefs]
    }
    yield graph
    for k, v in graph['topics'].items():
        v.delete()
    for v in graph['links']:
        v.delete()
def test_double_ref(self):
    ref = library.get_refs_in_string(texts['he_2ref'])
    assert 2 == len(ref)
    assert {Ref(u'הושע ט ג'), Ref(u'דברי הימים ב לב יט')} == set(ref)
def _normalize(self):
    self.ref = Ref(self.ref).normal()
    self.text = bleach.clean(self.text, tags=self.allowed_tags, attributes=self.allowed_attrs)
def test_double_quote_talmud(self):
    ref = library.get_refs_in_string(texts['dq_talmud'])
    assert 1 == len(ref)
    assert Ref(u'יבמות ס"ה') == ref[0]