def test_invalid_index_save_no_existing_base_text():
    """Saving an Index whose ``base_text_titles`` names a missing text must fail.

    The index is built with the base text title "Gargamel"; ``save()`` is
    expected to raise ``InputError`` with the "Base Text Titles must point
    to existing texts" message and to leave no record behind.
    """
    title = 'Bartenura (The Next Generation)'
    # Clear leftovers from earlier runs so the final count check is meaningful.
    model.IndexSet({"title": title}).delete()
    d = {
        "categories": ["Mishnah", "Commentary", "Bartenura", "Seder Zeraim"],
        "base_text_titles": ["Gargamel"],
        "title": title,
        "schema": {
            "titles": [
                {"lang": "en", "text": title, "primary": True},
                {"lang": "he", "text": "פרשן", "primary": True},
            ],
            "nodeType": "JaggedArrayNode",
            "depth": 2,
            "sectionNames": ["Section", "Line"],
            "addressTypes": ["Integer", "Integer"],
            "key": title,
        },
    }
    idx = model.Index(d)
    # Only the call that is expected to raise belongs inside the context
    # manager; e_info.value is populated once the block has exited.
    with pytest.raises(InputError) as e_info:
        idx.save()
    assert "Base Text Titles must point to existing texts in the system." in str(e_info.value)
    assert model.IndexSet({"title": title}).count() == 0
def teardown_module(module):
    """Delete the Index and Version records for every title this module uses."""
    created_titles = (
        'Test Commentator Name',
        'Bartenura (The Next Generation)',
        'Test Index Name',
        "Changed Test Index",
        "Third Attempt",
        "Test Iu",
        "Test Del",
    )
    for name in created_titles:
        # Index records go first, then the versions filed under the same title.
        for record_set in (model.IndexSet, model.VersionSet):
            record_set({"title": name}).delete()
def test_dup_index_save():
    """A second Index saved under an existing title must not create a duplicate.

    One index is saved, then a second record with the same title is
    attempted. Whether or not that second save raises, exactly one record
    with the title must exist afterwards.
    """
    title = 'Test Commentator Name'
    model.IndexSet({"title": title}).delete()
    d = {
        "categories": ["Liturgy"],
        "title": title,
        "schema": {
            "titles": [
                {"lang": "en", "text": title, "primary": True},
                {"lang": "he", "text": "פרשן", "primary": True},
            ],
            "nodeType": "JaggedArrayNode",
            "depth": 2,
            "sectionNames": ["Section", "Line"],
            "addressTypes": ["Integer", "Integer"],
            "key": title,
        },
    }
    idx = model.Index(d)
    # The record has to be persisted before the count can be 1.
    idx.save()
    assert model.IndexSet({"title": title}).count() == 1

    d2 = {
        "title": title,
        "heTitle": u"פרשן ב",
        "titleVariants": [title],
        "sectionNames": ["Chapter", "Paragraph"],
        "categories": ["Commentary"],
        "lengths": [50, 501],
    }
    try:
        model.Index(d2).save()
    except Exception:
        # A rejection of the duplicate is acceptable; what matters is the
        # record count checked below. (Narrower than the former bare
        # ``except:`` so KeyboardInterrupt/SystemExit still propagate.)
        pass
    assert model.IndexSet({"title": title}).count() == 1
def test_invalid_index_save_no_category():
    """Saving an Index filed under a category that does not exist must fail.

    The category path ends in "Gargamel"; ``save()`` is expected to raise
    ``InputError`` asking for the category to be created first, and no
    record may be left behind.
    """
    title = 'Bartenura (The Next Generation)'
    # Clear leftovers from earlier runs so the final count check is meaningful.
    model.IndexSet({"title": title}).delete()
    d = {
        "categories": ["Mishnah", "Commentary", "Bartenura", "Gargamel"],
        "title": title,
        "schema": {
            "titles": [
                {"lang": "en", "text": title, "primary": True},
                {"lang": "he", "text": "פרשן", "primary": True},
            ],
            "nodeType": "JaggedArrayNode",
            "depth": 2,
            "sectionNames": ["Section", "Line"],
            "addressTypes": ["Integer", "Integer"],
            "key": title,
        },
    }
    idx = model.Index(d)
    # Only the call that is expected to raise belongs inside the context
    # manager; e_info.value is populated once the block has exited.
    with pytest.raises(InputError) as e_info:
        idx.save()
    assert "You must create category Mishnah/Commentary/Bartenura/Gargamel before adding texts to it." in str(e_info.value)
    assert model.IndexSet({"title": title}).count() == 0
def test_invalid_index_save_no_hebrew_collective_title():
    """Saving an Index whose collective title has no Hebrew Term must fail.

    The index uses the collective title 'Gargamel'; ``save()`` is expected
    to raise ``InputError`` asking for a Hebrew translation Term, and no
    record may be left behind.
    """
    title = 'Bartenura (The Next Generation)'
    # Clear leftovers from earlier runs so the final count check is meaningful.
    model.IndexSet({"title": title}).delete()
    d = {
        "categories": ["Mishnah", "Rishonim on Mishnah", "Bartenura"],
        "collective_title": 'Gargamel',
        "title": title,
        "schema": {
            "titles": [
                {"lang": "en", "text": title, "primary": True},
                {"lang": "he", "text": "פרשן", "primary": True},
            ],
            "nodeType": "JaggedArrayNode",
            "depth": 2,
            "sectionNames": ["Section", "Line"],
            "addressTypes": ["Integer", "Integer"],
            "key": title,
        },
    }
    idx = model.Index(d)
    # Only the call that is expected to raise belongs inside the context
    # manager; e_info.value is populated once the block has exited.
    with pytest.raises(InputError) as e_info:
        idx.save()
    assert "You must add a hebrew translation Term for any new Collective Title: Gargamel." in str(e_info.value)
    assert model.IndexSet({"title": title}).count() == 0
def get_book_link_collection(book, cat):
    """
    Collect the links between `book` and the texts of category `cat`.

    For the Tanach categories ("Tanach", "Torah", "Prophets", "Writings")
    indexes that are also categorized as "Commentary" or "Targum" are
    left out of the category's title list.

    Returns a list of dicts of the form
        {"r1": {"title": ..., "loc": ...}, "r2": {"title": ..., "loc": ...}}
    where each title has its spaces replaced by hyphens and `loc` is the
    part of the ref starting at its first " <digit>" boundary.
    If the category query yields no titles, returns {"error": ...} instead.
    """
    if cat in ("Tanach", "Torah", "Prophets", "Writings"):
        query = {
            "$and": [
                {"categories": cat},
                {"categories": {"$ne": "Commentary"}},
                {"categories": {"$ne": "Targum"}},
            ]
        }
    else:
        query = {"categories": cat}

    titles = model.IndexSet(query).distinct("title")
    if not titles:
        return {"error": "No results for {}".format(query)}

    # Titles are data, not patterns: escape them so characters such as
    # parentheses in a title cannot change the meaning of the regex.
    book_re = r'^{} \d'.format(re.escape(book))
    cat_re = r'^({}) \d'.format('|'.join(re.escape(t) for t in titles))
    link_re = re.compile(r'^(?P<title>.+) (?P<loc>\d.*)$')

    links = model.LinkSet({
        "$and": [
            {"refs": {"$regex": book_re}},
            {"refs": {"$regex": cat_re}},
        ]
    })

    ret = []
    for link in links:
        l1 = link_re.match(link.refs[0])
        l2 = link_re.match(link.refs[1])
        if l1 is None or l2 is None:
            # A ref without a "<title> <number...>" shape cannot be split
            # into title/location; skip it rather than crash on None.
            continue
        ret.append({
            "r1": {
                "title": l1.group("title").replace(" ", "-"),
                "loc": l1.group("loc"),
            },
            "r2": {
                "title": l2.group("title").replace(" ", "-"),
                "loc": l2.group("loc"),
            },
        })
    return ret
def setup_class(cls):
    """Drop stale test indexes, rebuild the TOC and cache both TOC views on the class."""
    stale_titles = [
        "New Toc Title Test",
        "New Toc Test",
        "Another New Toc Test",
        "Harchev Davar on Joshua",
    ]
    leftovers = model.IndexSet({"title": {"$in": stale_titles}})
    leftovers.delete()

    # Rebuild only after the cleanup so the cached TOCs reflect the clean state.
    model.library.rebuild_toc()
    cls.toc = model.library.get_toc()
    cls.search_toc = model.library.get_search_filter_toc()
def export_schemas():
    """
    Write the v2 schema of every index to SEFARIA_EXPORT_PATH/schemas/<title>.json.

    Spaces in the title are replaced by underscores for the file name.
    The schemas directory is created if it does not exist.

    An index whose contents raise InputError is reported on stdout and
    appended, with a timestamp, to SEFARIA_EXPORT_PATH/errors.log; the
    export then continues with the next index.
    """
    # Imported locally so the block does not depend on how (or whether)
    # the surrounding module imports datetime.
    from datetime import datetime

    path = SEFARIA_EXPORT_PATH + "/schemas/"
    if not os.path.exists(path):
        os.makedirs(path)

    for i in model.IndexSet():
        title = i.title.replace(" ", "_")
        # Serialize before opening the target file, so a failing index does
        # not leave an empty (or truncated) .json file behind.
        try:
            payload = make_json(i.contents(v2=True)).encode('utf-8')
        except InputError as e:
            print("InputError: %s" % e)
            with open(SEFARIA_EXPORT_PATH + "/errors.log", "a") as error_log:
                error_log.write("%s - InputError: %s\n" % (datetime.now(), e))
            continue

        with open(path + title + ".json", "w") as f:
            f.write(payload)
def rename_category(old, new):
    """
    Walk through all index records, replacing every category instance
    called 'old' with 'new'.

    Each modified index is saved, and the summaries are rebuilt once after
    all records have been processed.

    Raises AssertionError if no index carries the category 'old'.
    """
    indices = model.IndexSet({"categories": old})
    assert indices.count(), "No categories named {}".format(old)

    for i in indices:
        i.categories = [new if cat == old else cat for cat in i.categories]
        # Assigning the attribute only changes the in-memory object; it
        # must be saved for the rename to reach the database.
        i.save()

    summaries.update_summaries()
def update_counts(ref=None):
    """
    Recount texts and store the results.

    With `ref`, only that text is recounted (book level only) and the
    function returns immediately. Without it, every index is recounted -
    for a commentary index, each distinct version title beginning with
    "<index title> on " is counted - and the summaries are updated at the end.
    """
    if ref:
        update_text_count(ref)
        return

    for index in model.IndexSet():
        if not index.is_commentary():
            update_text_count(index.title)
            continue

        # A commentator has no text of its own; count each "X on <Book>" it has versions for.
        commentary_pattern = "^{} on ".format(index.title)
        commentary_versions = model.VersionSet({"title": {"$regex": commentary_pattern}})
        for commentary_title in commentary_versions.distinct("title"):
            update_text_count(commentary_title)

    summaries.update_summaries()
def remove_old_counts():
    """
    Deletes counts documents which no longer correspond to a text.

    Every counts document that has a title is checked with
    model.get_index(); when that raises BookNameError the title is printed
    and the document is removed.

    Category counts (documents without a title) are currently skipped.
    """
    # The raw collection is read instead of model.CountSet(): a counts
    # document with an invalid title makes Count instantiation raise
    # BookNameError while the set is being iterated, where it cannot be
    # caught per record and the offending title is not available afterwards.
    for count in db.counts.find():
        title = count.get("title", None)
        if not title:
            # TODO incomplete for Category Counts: a category document
            # should be treated as stale when no index has exactly its
            # list of categories. Until that is implemented these
            # documents are left untouched.
            continue

        try:
            model.get_index(title)
        except BookNameError:
            print(u"Old count: %s" % title)
            db.counts.remove({"_id": count["_id"]})
def export_schemas():
    """Dump the contents of every index as JSON into SEFARIA_DATA_PATH/export/schemas/.

    Each file is named after the index title with spaces turned into
    underscores (no extension).
    """
    for index in model.IndexSet():
        file_name = index.title.replace(" ", "_")
        target = SEFARIA_DATA_PATH + "/export/schemas/" + file_name
        with open(target, "w") as out:
            serialized = make_json(index.contents())
            out.write(serialized)
def teardown_class(cls):
    """Delete the Index and Version records for every title used by this test class."""
    for leftover in ("New Toc Title Test",
                     "New Toc Test",
                     "Another New Toc Test",
                     "Harchev Davar on Joshua",
                     "Bob is your Uncle"):
        # Index first, then its versions, one title at a time.
        index_records = model.IndexSet({"title": leftover})
        index_records.delete()
        version_records = model.VersionSet({"title": leftover})
        version_records.delete()
def dep_counts(name):
    """
    Count the records that refer to the text called `name`.

    Three patterns are built from `name`:
      'alone'     - refs starting with "<name> <digit>"
      'commentor' - anything containing "<name> on"
      'commentee' - "<commentator> on <name> <digit>", for any title whose
                    first category is "Commentary"

    Returns a dict mapping a description of each check to its count:
    version and history titles (exact / commentor / commentee), and for
    each pattern the matching notes, links, history refs and history
    new refs.
    """
    commentators = model.IndexSet({"categories.0": "Commentary"}).distinct("title")

    escaped_name = re.escape(name)
    # Commentator titles are escaped like `name` is, so a title containing
    # regex metacharacters (e.g. parentheses) is matched literally.
    commentator_alternation = "|".join(re.escape(c) for c in commentators)

    ref_patterns = {
        'alone': r'^{} \d'.format(escaped_name),
        'commentor': r'{} on'.format(escaped_name),
        'commentee': r'^({}) on {} \d'.format(commentator_alternation, escaped_name),
    }

    # NOTE(review): the title pattern is the same as the 'commentee' ref
    # pattern, including the trailing " \d" - confirm that version and
    # history titles are expected to carry a section number.
    commentee_title_pattern = ref_patterns['commentee']

    ret = {
        'version title exact match': model.VersionSet({"title": name}).count(),
        'version title match commentor': model.VersionSet({"title": {"$regex": ref_patterns["commentor"]}}).count(),
        'version title match commentee': model.VersionSet({"title": {"$regex": commentee_title_pattern}}).count(),
        'history title exact match': model.HistorySet({"title": name}).count(),
        'history title match commentor': model.HistorySet({"title": {"$regex": ref_patterns["commentor"]}}).count(),
        'history title match commentee': model.HistorySet({"title": {"$regex": commentee_title_pattern}}).count(),
    }

    for pname, pattern in ref_patterns.items():
        ret.update({
            'note match ' + pname: model.NoteSet({"ref": {"$regex": pattern}}).count(),
            'link match ' + pname: model.LinkSet({"refs": {"$regex": pattern}}).count(),
            'history refs match ' + pname: model.HistorySet({"ref": {"$regex": pattern}}).count(),
            'history new refs match ' + pname: model.HistorySet({"new.refs": {"$regex": pattern}}).count(),
        })

    return ret
def count_category(cat, lang=None):
    """
    Count the number of sections of various types in an entire category
    and calculate percentages.
    Depends on text counts already being saved in the counts collection.

    `cat` is a category name or a list of category names (a path).

    With `lang` ("en" / "he"): returns
        {"availableCounts": {...}, "percentAvailable": <float or "unknown">}
    for that language, without touching the database.

    Without `lang`: counts both languages, replaces the category's
    document in the counts collection, and returns a dict with
    "percentAvailable", "availableCounts" and "textComplete", each keyed
    by "he" and "en".
    """
    cat = [cat] if isinstance(cat, basestring) else cat

    if not lang:
        # If no language specified, return a dict with English and Hebrew,
        # grouping hebrew and english fields
        en = count_category(cat, "en")
        he = count_category(cat, "he")

        def is_complete(percent_available):
            # percentAvailable may be the string "unknown"; comparing that
            # to a number is True on Python 2 and would wrongly mark the
            # category complete.
            return isinstance(percent_available, float) and percent_available > 99.5

        counts = {
            "percentAvailable": {
                "he": he["percentAvailable"],
                "en": en["percentAvailable"],
            },
            "availableCounts": {
                "he": he["availableCounts"],
                "en": en["availableCounts"],
            },
            "textComplete": {
                "he": is_complete(he["percentAvailable"]),
                "en": is_complete(en["percentAvailable"]),
            },
        }

        # Save to the DB: drop the document for exactly this category path,
        # then store the fresh one. Without the save the old document was
        # removed and nothing replaced it.
        remove_doc = {"$and": [{'categories.0': cat[0]},
                               {"categories": {"$all": cat}},
                               {"categories": {"$size": len(cat)}}]}
        db.counts.remove(remove_doc)
        counts_doc = {"categories": cat}
        counts_doc.update(counts)
        db.counts.save(counts_doc)

        return counts

    # Count this category
    counts = defaultdict(int)
    percent = 0.0
    percentCount = 0
    indxs = model.IndexSet({"$and": [{'categories.0': cat[0]}, {"categories": {"$all": cat}}]})
    for indx in indxs:
        counts["Text"] += 1
        text_count = model.Count().load({"title": indx.title})
        if not text_count or not hasattr(text_count, "availableCounts") or not hasattr(indx, "sectionNames"):
            continue

        # Pair each section name with its count; zip stops at the shorter
        # of the two, so missing depths are simply not counted.
        c = text_count.availableCounts[lang]
        for section_name, available in zip(indx.sectionNames, c):
            counts[section_name] += available

        if hasattr(text_count, "percentAvailable") and isinstance(percent, float):
            percentCount += 1
            text_percent = text_count.percentAvailable[lang]
            percent += text_percent if isinstance(text_percent, float) else 0.0
        else:
            # One text without a percentage makes the whole category unknown;
            # once set, later texts cannot turn it back into a number.
            percent = "unknown"

    percentCount = 1 if percentCount == 0 else percentCount
    percent = percent / percentCount if isinstance(percent, float) else "unknown"

    if "Daf" in counts:
        # The "Daf" tally is reported as "Amud", and "Daf" becomes half of it.
        counts["Amud"] = counts["Daf"]
        counts["Daf"] = counts["Daf"] / 2

    return {
        "availableCounts": dict(counts),
        "percentAvailable": percent,
    }
def test_index_delete():
    """Deleting an Index must also remove the Versions stored under its title.

    Checked twice: for a simple text with two saved versions, and for a
    commentator with commentary versions on two base books.
    """
    #Simple Text
    ti = "Test Del"
    model.IndexSet({"title": ti}).delete()
    model.VersionSet({"title": ti}).delete()

    i = model.Index({
        "title": ti,
        "heTitle": u"כבכב",
        "titleVariants": [ti],
        "sectionNames": ["Chapter", "Paragraph"],
        "categories": ["Musar"],
        "lengths": [50, 501]
    }).save()

    new_version1 = model.Version({
        "chapter": i.nodes.create_skeleton(),
        "versionTitle": "Version 1 TEST",
        "versionSource": "blabla",
        "language": "he",
        "title": i.title
    })
    new_version1.chapter = [[u''], [u''], [u"לה לה לה לא חשוב על מה"]]
    # The versions must actually exist in the database, otherwise the
    # "no versions left" assertion below passes without testing anything.
    new_version1.save()

    new_version2 = model.Version({
        "chapter": i.nodes.create_skeleton(),
        "versionTitle": "Version 2 TEST",
        "versionSource": "blabla",
        "language": "en",
        "title": i.title
    })
    new_version2.chapter = [[], ["Hello goodbye bla bla blah"], []]
    new_version2.save()

    i.delete()
    assert model.Index().load({'title': ti}) is None
    assert model.VersionSet({'title': ti}).count() == 0

    #Commentator
    from sefaria.helper.text import create_commentator_and_commentary_version

    commentator_name = "Commentator Del"
    he_commentator_name = u"פרשנדנן"
    base_book = 'Genesis'
    base_book2 = 'Pesach Haggadah'

    model.IndexSet({"title": commentator_name}).delete()
    model.VersionSet({"title": commentator_name + " on " + base_book}).delete()
    model.VersionSet({"title": commentator_name + " on " + base_book2}).delete()

    create_commentator_and_commentary_version(commentator_name, base_book, 'he', 'test', 'test', he_commentator_name)
    create_commentator_and_commentary_version(commentator_name, base_book2, 'he', 'test', 'test', he_commentator_name)

    model.Index().load({'title': commentator_name}).delete()
    assert model.Index().load({'title': commentator_name}) is None
    assert model.VersionSet({'title': {'$regex': commentator_name}}).count() == 0
import sefaria.model as model
from sefaria.system.database import db
from sefaria.clean import remove_old_counts

# One-off migration: re-file index records under new categories.

# Move the history books
model.IndexSet({"categories": "History"}).update({"categories": [u'Apocrypha']})

anaBekhoach = model.Index().load({'title': 'Ana BeKhoach'})
anaBekhoach.categories = [u'Liturgy', u'Piyutim']
#why doesn't update() work on an instance?
# The attribute assignment above only changes the loaded object; save it
# so the new categories are written back.
anaBekhoach.save()

model.IndexSet({"title": {"$regex": "Rabbah?"}}).update({"categories": ['Midrash', 'Aggadic Midrash', 'Midrash Rabbah']})
#this one should not have been updated.
# (its title matches the "Rabbah?" pattern above, so it is re-filed here)
model.Index().update({'title': 'Tanna Debei Eliyahu Rabbah'}, {'categories': ['Midrash', 'Aggadic Midrash']})

model.IndexSet({'title': {"$regex": 'Ein Yaakov'}}).update({'categories': ['Midrash', 'Aggadic Midrash']})

# Individual titles that all move to Midrash > Aggadic Midrash.
for aggadic_title in [
    'Midrash Tanchuma',
    'Legends of the Jews',
    'Midrash Mishlei',
    'Pirkei Derabi Eliezer',
    'Midrash on Proverbs',
    "Midrash B'not Zelophehad",
    'Midrash Tehilim',
    'Pesikta de rav kahana',
    'The Fathers according to Rabbi Nathan',
    'Yalkut Shimoni',
]:
    model.Index().update({'title': aggadic_title}, {'categories': ['Midrash', 'Aggadic Midrash']})

model.Index().update({'title': 'Sifra'}, {'categories': ['Midrash', 'Halachic Midrash']})
import json
import pytest
import sefaria.summaries as s
import sefaria.model as model
import sefaria.system.cache as scache
from sefaria.system.exceptions import BookNameError
from sefaria.utils.testing_utils import *

# Scenarios this module is meant to cover:
#create, update, delete, change categories
# test that old title goes away on index title change (regular + commentary)
# test that no commentator is added
# no wandering commentaries

""" SOME SETUP """

# Distinct titles of all index records, captured once at import time,
# followed by a table-of-contents rebuild before any test runs.
text_titles = model.IndexSet({}).distinct('title')
model.library.rebuild_toc()

""" THE TESTS """


class Test_Toc(object):
    """Tests around the table of contents kept by model.library."""

    @classmethod
    def setup_class(cls):
        """Rebuild the table of contents before the tests of this class run."""
        model.library.rebuild_toc()

    @classmethod
    def teardown_class(cls):
        """Name the index titles associated with this class's tests."""
        # NOTE(review): in the code visible here the list is built but not
        # used; presumably records with these titles are meant to be
        # deleted afterwards - confirm against the full file.
        titles = ["New Toc Title Test", "New Toc Test", "Another New Toc Test", "Harchev Davar on Joshua", "Bob is your Uncle"]
import sefaria.model as model
from sefaria.system.database import db
from sefaria.clean import remove_old_counts
from sefaria.counts import update_counts

# One-off cleanup: duplicate indexes, stray whitespace in titles,
# notes without owner / public flag, and an obsolete counts field.

# Remove duplicate 'Sefer Abudraham' (and other duplicates whose titles
# carry a trailing space)
db.index.remove({"title": "Sefer Abudraham "})
db.index.remove({"title": "Tiferet Yisrael "})
db.index.remove({"title": "Igrot Moshe "})
db.index.remove({"title": "The Sabbath, Heschel "})
db.index.remove({"title": "Sifre Devarim "})

remove_old_counts()

# Strip leading/trailing whitespace from any remaining index title.
texts = model.IndexSet({})
for t in texts:
    stripped_title = t.title.strip()
    if t.title != stripped_title:
        t.title = stripped_title
        # Persist the change; assigning the attribute alone only alters
        # the in-memory record.
        t.save()

# Notes that predate the "public" field: give ownerless notes owner 1,
# then mark the whole set as not public.
ns = model.NoteSet({"public": {"$exists": False}})
for n in ns:
    if not getattr(n, "owner", None):
        n.owner = 1
# NOTE(review): this relies on ns.update() also writing the owner set
# above - confirm, otherwise each note needs an explicit save.
ns.update({"public": False})

# Remove the "sectionCounts" field from all counts documents
db.counts.update({}, {"$unset": {"sectionCounts": ""}}, multi=True)
def export_schemas():
    """Write each index's v2 contents as UTF-8 encoded JSON to
    SEFARIA_DATA_PATH/export/schemas/<title>.json.

    Spaces in the title become underscores in the file name.
    """
    for index in model.IndexSet():
        base_name = index.title.replace(" ", "_")
        destination = SEFARIA_DATA_PATH + "/export/schemas/" + base_name + ".json"
        with open(destination, "w") as out:
            encoded = make_json(index.contents(v2=True)).encode('utf-8')
            out.write(encoded)