def remove_old_counts():
    """
    Deletes version-state ("counts") documents whose title no longer
    corresponds to a text.

    Every document in `db.vstate` with a truthy "title" is checked with
    `model.get_index`. When that raises BookNameError the title is printed
    and the document is removed by its `_id`.

    Documents without a title (category counts) are skipped; cleanup for
    them was never completed.
    """
    # The collection is read directly rather than through a model set class.
    # Per the original author's note, instantiating a count record that has
    # an invalid title raises BookNameError while iterating the set, where it
    # cannot be caught per record and the offending title is not available.
    for count in db.vstate.find():
        if not count.get("title", None):
            # TODO incomplete for Category Counts.
            # The intended check looked for any index whose categories match
            # the count's categories exactly (same first category, same
            # members, same length) and reported the count as old when none
            # existed. That code sat after an unconditional `continue` and
            # never ran, so it has been removed.
            continue

        try:
            model.get_index(count["title"])
        except BookNameError:
            print(u"Old count: %s" % count["title"])
            db.vstate.remove({"_id": count["_id"]})
def test_get_index():
    """
    get_index returns a CommentaryIndex for a commentary title and a plain
    Index for a book title.
    """
    commentary = model.get_index("Rashi on Exodus")
    assert isinstance(commentary, model.CommentaryIndex)
    # The commentary carries exactly one title variant: its own full title.
    assert commentary.titleVariants == [u'Rashi on Exodus']

    book = model.get_index("Exodus")
    assert isinstance(book, model.Index)
    assert book.title == u'Exodus'
def test_get_index():
    """
    get_index returns a CommentaryIndex for a commentary title and a plain
    Index for a book title, with the expected titles and title variants.
    """
    rashi = model.get_index("Rashi on Exodus")
    assert isinstance(rashi, model.CommentaryIndex)
    assert u"Rashi on Exodus" == rashi.title
    assert u"Rashi on Exodus" in rashi.titleVariants
    # Neither the commentator alone nor the base book alone is a variant.
    for partial_title in (u"Rashi", u"Exodus"):
        assert partial_title not in rashi.titleVariants

    exodus = model.get_index("Exodus")
    assert isinstance(exodus, model.Index)
    assert exodus.title == u"Exodus"
def test_get_index():
    """
    Checks the index objects returned by get_index for a commentary title
    ("Rashi on Exodus") and for a book title ("Exodus").
    """
    # Commentary title: a CommentaryIndex known only by its full title.
    commentary_index = model.get_index("Rashi on Exodus")
    assert isinstance(commentary_index, model.CommentaryIndex)
    assert u'Rashi on Exodus' == commentary_index.title
    assert u'Rashi on Exodus' in commentary_index.titleVariants
    assert u'Rashi' not in commentary_index.titleVariants
    assert u'Exodus' not in commentary_index.titleVariants

    # Book title: a plain Index.
    book_index = model.get_index("Exodus")
    assert isinstance(book_index, model.Index)
    assert book_index.title == u'Exodus'
def export_text(text):
    """
    Prepares 'text' (a document from the texts collection, or a virtual
    merged document) as an export document and passes it to
    'export_text_doc'.

    'text' is modified in place: the Hebrew title, categories and section
    names are copied from its index, the content is stored under "text", and
    the "_id" / "chapter" keys are dropped from stored documents.

    Nothing is exported, and a message is printed, when the index cannot be
    loaded or when the text is complex.
    """
    title = text["title"]
    print(title)

    try:
        index = model.get_index(title)
    except Exception as e:
        print("Skipping %s - %s" % (title, e.message))
        return

    if index.is_complex():
        # TODO handle export of complex texts
        print("Skipping Complex Text: %s - " % title)
        return

    text["heTitle"] = index.nodes.primary_title("he")
    text["categories"] = index.categories
    text["sectionNames"] = index.schema["sectionNames"]
    # Content lives under "text" or "chapter"; prefer "text" when non-empty.
    content = text.get("text", None)
    if not content:
        content = text.get("chapter", "")
    text["text"] = content

    if "_id" in text:
        del text["_id"]
        del text["chapter"]

    export_text_doc(text)
def generate_refs_list(query=None):
    """
    Generate a list of refs to all available sections.

    `query` is a filter passed to `db.counts.find`; by default every counts
    document is considered. Documents without a "title" (category counts)
    are skipped. For each remaining document the sections available in
    Hebrew or English are combined and turned into refs of the form
    "<title> <section>" (or just the title when the section is empty).

    Section numbers of Talmud texts, and the first component of section
    addresses of commentaries on Talmud, are converted with `section_to_daf`.

    Side effect: a counts document whose title cannot be loaded as an index
    is removed from `db.counts`.

    Returns the list of ref strings.
    """
    # Avoid a shared mutable default argument.
    query = {} if query is None else query

    trefs = []
    for c in db.counts.find(query):
        if "title" not in c:
            continue  # this is a category count

        title = c["title"]
        try:
            i = model.get_index(title)
        except Exception:
            # If there is not index record to match the count record,
            # the count should be removed.
            db.counts.remove(c)
            continue

        he = list_from_counts(c["availableTexts"]["he"])
        en = list_from_counts(c["availableTexts"]["en"])
        sections = texts.union(he, en)
        for n in sections:
            if i.categories[0] == "Talmud":
                n = section_to_daf(int(n))
            if getattr(i, "commentaryCategories", None) and i.commentaryCategories[0] == "Talmud":
                split = n.split(":")
                # Convert the whole first component, not just its first
                # character (`n[0]`), so sections >= 10 map to the right daf.
                n = ":".join([section_to_daf(int(split[0]))] + split[1:])
            tref = "%s %s" % (title, n) if n else title
            trefs.append(tref)

    return trefs
def text_category(text):
    """
    Returns the top level category for text.

    The first entry of the index's `categories` is returned, passed through
    `mark_safe`; an index without a `categories` attribute yields
    "[no cats]". If anything goes wrong while looking up the index or
    reading its first category, "[text not found]" is returned instead.
    """
    try:
        i = m.get_index(text)
        result = mark_safe(getattr(i, "categories", ["[no cats]"])[0])
    except Exception:
        # Best-effort lookup: any ordinary failure falls back to a
        # placeholder. `Exception` (not a bare except) so that SystemExit
        # and KeyboardInterrupt still propagate.
        result = "[text not found]"
    return result
def export_index(title):
    """
    Writes the index record of the text called `title` as JSON to
    "<SEFARIA_EXPORT_PATH>/<title>_index.json".
    """
    # Serialize the index record using its v2 contents.
    contents = model.get_index(title).contents(v2=True)
    destination = "%s/%s_index.json" % (SEFARIA_EXPORT_PATH, title)
    write_doc(contents, destination)
def remove_old_counts():
    """
    Deletes counts documents which no longer correspond to a text.

    Each document in `db.vstate` with a truthy "title" is checked with
    `model.get_index`; when that raises BookNameError the title is printed
    and the document is removed by its `_id`.
    """
    # The collection is queried directly (fetching only the title field)
    # instead of going through a model set class: per the original author's
    # note, instantiating a record with an invalid title raises
    # BookNameError during iteration, where it cannot be caught per record
    # and the bad title is not available afterwards.
    for record in db.vstate.find({}, {"title": 1}):
        title = record.get("title", None)
        if not title:
            continue
        try:
            model.get_index(title)
        except BookNameError:
            print(u"Old count: %s" % title)
            db.vstate.remove({"_id": record["_id"]})
def reset_counts(request, title=None):
    """
    Rebuilds counts and redirects with a "Counts-Rebuilt" message.

    With a `title`, only that text's version state is refreshed and the
    redirect goes to the text's own URL. Without one, all states are
    refreshed and the redirect goes to the site root.
    """
    if not title:
        model.refresh_all_states()
        return HttpResponseRedirect("/?m=Counts-Rebuilt")

    index = model.get_index(title)
    state = model.VersionState(index=index)
    state.refresh()
    text_url = model.Ref(index.title).url()
    return HttpResponseRedirect("/%s?m=Counts-Rebuilt" % text_url)
def make_text(doc):
    """
    Export doc into a simple text format.

    The output starts with a header (title, Hebrew title, version title and
    version source, one per line). When the doc lists "versions", a note
    naming each contributing version follows. Then the index's nodes are
    traversed depth first: every node contributes its title in the doc's
    language, and leaf nodes additionally contribute their flattened content.
    """
    index = model.get_index(doc["title"])
    header = [doc["title"], doc.get("heTitle", ""), doc["versionTitle"], doc["versionSource"]]
    text = "\n".join(header)
    version = Version().load({
        'title': doc["title"],
        'versionTitle': doc["versionTitle"],
        'language': doc["language"],
    })

    isMerged = (doc["versionTitle"] == "merged")

    if "versions" in doc:
        if isMerged:
            # A merged doc has no version record of its own; load the first
            # listed source version instead.
            version = Version().load({
                'title': doc["title"],
                'versionTitle': doc["versions"][0][0],
                'language': doc["language"],
            })
        text += "\nThis file contains merged sections from the following text versions:"
        for source in doc["versions"]:
            text += "\n-%s\n-%s" % (source[0], source[1])

    def make_node(node, depth, **kwargs):
        # Non-leaf nodes contribute only their title.
        if not node.is_leaf():
            return "\n\n%s" % node.primary_title(doc["language"])
        heading = "\n\n%s" % node.primary_title(doc["language"])
        body = flatten(version.content_node(node), node.sectionNames)
        return "\n\n%s" % (heading + body)

    def flatten(content, names):
        # Recursively turn nested content into text, labelling every level
        # above the last with its section name and number.
        content = content or ""
        if len(names) == 1:
            lines = []
            for item in content:
                item = item if item else ""
                # Bandaid for mismatch between text structure, join recursively if text
                # elements are lists instead of strings.
                lines.append(item if isinstance(item, basestring) else "\n".join(item))
            return "\n".join(lines)

        pieces = []
        for position, sub_content in enumerate(content):
            number = position + 1
            section = section_to_daf(number) if names[0] == "Daf" else str(number)
            pieces.append("\n\n%s %s\n\n%s" % (names[0], section, flatten(sub_content, names[1:])))
        return "".join(pieces)

    text += index.nodes.traverse_to_string(make_node)

    return text
def remove_old_counts():
    """
    Deletes counts documents which no longer correspond to a text.

    Every document in `db.counts` with a truthy "title" is checked with
    `model.get_index`. When that raises BookNameError the title is printed
    and the document is removed by its `_id`.

    Documents without a title (category counts) are skipped; cleanup for
    them was never completed.
    """
    # The collection is read directly rather than through a model set class.
    # Per the original author's note, instantiating a count record that has
    # an invalid title raises BookNameError while iterating the set, where it
    # cannot be caught per record and the offending title is not available.
    for count in db.counts.find():
        if not count.get("title", None):
            # TODO incomplete for Category Counts.
            # The intended check looked for any index whose categories match
            # the count's categories exactly (same first category, same
            # members, same length) and reported the count as old when none
            # existed. That code sat after an unconditional `continue` and
            # never ran, so it has been removed.
            continue

        try:
            model.get_index(count["title"])
        except BookNameError:
            print(u"Old count: %s" % count["title"])
            db.counts.remove({"_id": count["_id"]})
def export_text(text):
    """
    Prepares a single text document for export and passes it to
    'export_text_doc'.

    'text' is modified in place: the contents of its index record are merged
    into it, its "_id" is dropped, and its "chapter" content is moved to the
    "text" key.

    If the index for the text's title cannot be loaded, a message is printed
    and nothing is exported.
    """
    print(text["title"])
    try:
        index = model.get_index(text["title"])
    except Exception as e:
        print("Skipping %s - %s" % (text["title"], e.message))
        return

    text.update(index.contents())

    # Tolerate documents that lack these keys (presumably ones assembled in
    # memory rather than loaded from the DB) instead of raising KeyError.
    text.pop("_id", None)
    if "chapter" in text:
        text["text"] = text.pop("chapter")

    export_text_doc(text)
def export_text(text):
    """
    Prepares 'text' (a document from the texts collection, or a virtual
    merged document) as an export document and passes it to
    'export_text_doc'.

    'text' is modified in place. Simple texts get their index's section
    names. Complex texts get a minimal "schema" (Hebrew and English titles
    per node) and have their content re-keyed from node keys to English
    node titles. If the index cannot be loaded, a message is printed and
    nothing is exported.
    """
    def node_summary(node, depth, **kwargs):
        # Minimal per-node properties used to build the exported schema.
        return {
            "heTitle": node.primary_title("he"),
            "enTitle": node.primary_title("en"),
            "key": node.key,
        }

    def rekey_by_title(text_node, schema_nodes):
        # Walk content and schema in parallel, replacing each node key in
        # the content with the node's English title and dropping "key" from
        # the schema entry.
        for entry in schema_nodes:
            english_title = entry["enTitle"]
            text_node[english_title] = text_node.pop(entry["key"])
            del entry["key"]
            if "nodes" in entry:
                rekey_by_title(text_node[english_title], entry["nodes"])

    title = text["title"]
    print(title)
    try:
        index = model.get_index(title)
    except Exception as e:
        print("Skipping %s - %s" % (title, e.message))
        return

    text["heTitle"] = index.nodes.primary_title("he")
    text["categories"] = index.categories
    text["text"] = text.get("text", None) or text.get("chapter", "")

    if not index.is_complex():
        text["sectionNames"] = index.schema["sectionNames"]
    else:
        text["schema"] = index.nodes.traverse_to_json(node_summary)
        rekey_by_title(text["text"], text["schema"]["nodes"])

    if "_id" in text:
        del text["_id"]
        del text["chapter"]

    export_text_doc(text)
def update_links_count(text=None):
    """
    Counts the links that point to a particular text, or to all of them.
    Results are stored on the 'linksCount' field of the counts document.

    With no `text`, the function calls itself once for every counts
    document that has a non-empty title, then returns.

    With a `text`, the title is printed, the matching counts document is
    loaded, its 'linksCount' is set to the size of the LinkSet for that
    text's Ref, and the document is saved.
    """
    if not text:
        counts = db.counts.find({"title": {"$exists": 1}})
        for c in counts:
            if c["title"]:
                update_links_count(text=c["title"])
        # Every title has been handled by the recursive calls. Without this
        # return, the single-text code below ran with text=None.
        return

    print("%s" % text)
    # Result unused; this is likely here only so that an invalid title
    # raises before anything is written.
    model.get_index(text)

    c = db.counts.find_one({"title": text})
    c["linksCount"] = model.LinkSet(model.Ref(text)).count()
    db.counts.save(c)
def hebrew_term(s):
    """
    Simple translations for common Hebrew words.

    Looks `s` up in fixed tables of category names, pseudo-category names
    and section names and returns the Hebrew term when found. Otherwise `s`
    is treated as a text title and the Hebrew title of its index is
    returned. If no such text exists, `s` itself is returned unchanged.
    """
    categories = {
        "Torah": u"תורה",
        "Tanach": u'תנ"ך',
        "Tanakh": u'תנ"ך',
        "Prophets": u"נביאים",
        "Writings": u"כתובים",
        "Commentary": u"מפרשים",
        "Targum": u"תרגומים",
        "Mishnah": u"משנה",
        "Tosefta": u"תוספתא",
        "Talmud": u"תלמוד",
        "Bavli": u"בבלי",
        "Yerushalmi": u"ירושלמי",
        "Rif": u'רי"ף',
        "Kabbalah": u"קבלה",
        "Halakha": u"הלכה",
        "Halakhah": u"הלכה",
        "Midrash": u"מדרש",
        "Aggadic Midrash": u"מדרש אגדה",
        "Halachic Midrash": u"מדרש הלכה",
        "Midrash Rabbah": u"מדרש רבה",
        "Responsa": u'שו"ת',
        "Other": u"אחר",
        "Siddur": u"סידור",
        "Liturgy": u"תפילה",
        "Piyutim": u"פיוטים",
        "Musar": u"ספרי מוסר",
        "Chasidut": u"חסידות",
        "Parshanut": u"פרשנות",
        "Philosophy": u"מחשבת ישראל",
        "Maharal": u'מהר"ל מפראג',
        "Apocrypha": u"ספרים חיצונים",
        "Seder Zeraim": u"סדר זרעים",
        "Seder Moed": u"סדר מועד",
        "Seder Nashim": u"סדר נשים",
        "Seder Nezikin": u"סדר נזיקין",
        "Seder Kodashim": u"סדר קדשים",
        "Seder Toharot": u"סדר טהרות",
        "Seder Tahorot": u"סדר טהרות",
        "Dictionary": u"מילון",
        "Early Jewish Thought": u"מחשבת ישראל קדומה",
        "Minor Tractates": u"מסכתות קטנות",
        "Rosh": u'רא"ש',
        "Maharsha": u'מהרשא',
        "Rashba": u'רשב"א',
        "Rambam": u'רמב"ם',
        "Radbaz": u'רדב"ז',
        "Tosafot Yom Tov": u"תוספות יום טוב",
        "Chidushei Halachot": u"חידושי הלכות",
        "Chidushei Agadot": u"חידושי אגדות",
        "Tiferet Shmuel": u"תפארת שמואל",
        "Korban Netanel": u"קרבן נתנאל",
        "Pilpula Charifta": u"פילפולא חריפתא",
        "Divrey Chamudot": u"דברי חמודות",
        "Maadaney Yom Tov": u"מעדני יום טוב",
        "Modern Works": u"יצירות מודרניות",
    }

    pseudo_categories = {
        "Mishneh Torah": u"משנה תורה",
        'Introduction': u"הקדמה",
        'Sefer Madda': u"ספר מדע",
        'Sefer Ahavah': u"ספר אהבה",
        'Sefer Zemanim': u"ספר זמנים",
        'Sefer Nashim': u"ספר נשים",
        'Sefer Kedushah': u"ספר קדושה",
        'Sefer Haflaah': u"ספר הפלאה",
        'Sefer Zeraim': u"ספר זרעים",
        'Sefer Avodah': u"ספר עבודה",
        'Sefer Korbanot': u"ספר קורבנות",
        'Sefer Taharah': u"ספר טהרה",
        'Sefer Nezikim': u"ספר נזיקין",
        'Sefer Kinyan': u"ספר קניין",
        'Sefer Mishpatim': u"ספר משפטים",
        'Sefer Shoftim': u"ספר שופטים",
        "Shulchan Arukh": u"שולחן ערוך",
    }

    section_names = {
        "Chapter": u"פרק",
        "Chapters": u"פרקים",
        "Perek": u"פרק",
        "Line": u"שורה",
        "Daf": u"דף",
        "Paragraph": u"פסקה",
        "Parsha": u"פרשה",
        "Parasha": u"פרשה",
        "Parashah": u"פרשה",
        "Seif": u"סעיף",
        "Se'if": u"סעיף",
        "Siman": u"סימן",
        "Section": u"חלק",
        "Verse": u"פסוק",
        "Sentence": u"משפט",
        "Sha'ar": u"שער",
        "Gate": u"שער",
        "Comment": u"פירוש",
        "Phrase": u"ביטוי",
        "Mishna": u"משנה",
        "Chelek": u"חלק",
        "Helek": u"חלק",
        "Year": u"שנה",
        "Masechet": u"מסכת",
        "Massechet": u"מסכת",
        "Letter": u"אות",
        "Halacha": u"הלכה",
        "Seif Katan": u"סעיף קטן",
        "Se'if Katan": u"סעיף קטן",
        "Volume": u"כרך",
        "Book": u"ספר",
        "Shar": u"שער",
        "Seder": u"סדר",
        "Part": u"חלק",
        "Pasuk": u"פסוק",
        "Sefer": u"ספר",
        "Teshuva": u"תשובה",
        "Teshuvot": u"תשובות",
        "Tosefta": u"תוספתא",
        "Halakhah": u"הלכה",
        "Kovetz": u"קובץ",
        "Path": u"נתיב",
        "Parshah": u"פרשה",
        "Midrash": u"מדרש",
        "Mitzvah": u"מצוה",
        "Tefillah": u"תפילה",
        "Torah": u"תורה",
        "Perush": u"פירוש",
        "Peirush": u"פירוש",
        "Aliyah": u"עלייה",
        "Tikkun": u"תיקון",
        "Tikkunim": u"תיקונים",
        "Hilchot": u"הילכות",
        "Topic": u"נושא",
        "Contents": u"תוכן",
    }

    # Merge the tables; on duplicate keys later tables win, as before.
    words = {}
    for table in (categories, pseudo_categories, section_names):
        words.update(table)

    if s in words:
        return words[s]

    # If s is a text title, look for a stored Hebrew title.
    # Imported here rather than at module level, as in the original.
    from sefaria.model import get_index
    from sefaria.system.exceptions import BookNameError
    try:
        i = get_index(s)
        return i.get_title("he")
    except BookNameError:
        pass

    return s
def update_text_count(book_title):
    """
    Recounts the text named `book_title` (book level only) and saves the
    result to its counts document, creating the document if none exists.

    The saved document records the combined per-segment counts of all
    versions, per-language availability and lengths, the percentage of the
    text available in each language, whether each language is complete
    (more than 99.9%), and an estimated completeness per language.

    Returns the saved counts document. If counting either language yields a
    result containing "error", that result is returned instead and nothing
    is saved.
    """
    index = model.get_index(book_title)

    # Update the stored document when there is one, otherwise start fresh.
    lookup = {"title": book_title}
    c = db.counts.find_one(lookup) or lookup

    en = count_texts(book_title, lang="en")
    if "error" in en:
        return en  # Still valid?
    he = count_texts(book_title, lang="he")
    if "error" in he:
        return he  # Still valid?

    c["allVersionCounts"] = sum_count_arrays(en["counts"], he["counts"])

    # totals is a zero filled JA representing to shape of total available texts
    # sum with each language to ensure counts have a 0 anywhere where they
    # are missing a segment
    totals = zero_jagged_array(c["allVersionCounts"])
    en_available = sum_count_arrays(en["counts"], totals)
    he_available = sum_count_arrays(he["counts"], totals)

    c["availableTexts"] = {"en": en_available, "he": he_available}
    c["availableCounts"] = {"en": en["lengths"], "he": he["lengths"]}

    # Percentage available per language; stays 0 when the index declares no
    # length or the declared lengths sum to zero.
    he_percent = en_percent = 0
    if getattr(index, "length", None):
        if getattr(index, "lengths", None):
            # Compare the summed per-level lengths against the index's own.
            he_total = en_total = expected_total = 0
            for level in range(len(index.lengths)):
                he_total += he["lengths"][level]
                en_total += en["lengths"][level]
                expected_total += index.lengths[level]
            if expected_total != 0:
                he_percent = he_total / float(expected_total) * 100
                en_percent = en_total / float(expected_total) * 100
        else:
            # Only a top-level length is known; compare top-level counts.
            he_percent = c["availableCounts"]["he"][0] / float(index.length) * 100
            en_percent = c["availableCounts"]["en"][0] / float(index.length) * 100

    c["percentAvailable"] = {"he": he_percent, "en": en_percent}
    c["textComplete"] = {"he": he_percent > 99.9, "en": en_percent > 99.9}

    # function to estimate how much of a text we have
    c['estimatedCompleteness'] = {
        "he": estimate_completeness('he', index, c),
        "en": estimate_completeness('en', index, c),
    }

    db.counts.save(c)
    return c
def hebrew_term(s):
    """
    Simple translations for common Hebrew words.

    Looks `s` up in fixed tables of category names, pseudo-category names
    and section names and returns the Hebrew term when found. Otherwise `s`
    is treated as a text title and the Hebrew title of its index is
    returned. If no such text exists, `s` is tried as a commentator name:
    the first index titled "<s> on ..." supplies its "heCommentator". When
    nothing matches, `s` itself is returned unchanged.
    """
    categories = {
        "Torah": u"תורה",
        "Tanach": u'תנ"ך',
        "Tanakh": u'תנ"ך',
        "Prophets": u"נביאים",
        "Writings": u"כתובים",
        "Commentary": u"מפרשים",
        "Targum": u"תרגומים",
        "Mishnah": u"משנה",
        "Tosefta": u"תוספתא",
        "Talmud": u"תלמוד",
        "Bavli": u"בבלי",
        "Yerushalmi": u"ירושלמי",
        "Rif": u'רי"ף',
        "Kabbalah": u"קבלה",
        "Halakha": u"הלכה",
        "Halakhah": u"הלכה",
        "Midrash": u"מדרש",
        "Aggadic Midrash": u"מדרש אגדה",
        "Halachic Midrash": u"מדרש הלכה",
        "Midrash Rabbah": u"מדרש רבה",
        "Responsa": u'שו"ת',
        "Rashba": u'רשב"א',
        "Rambam": u'רמב"ם',
        "Other": u"אחר",
        "Siddur": u"סידור",
        "Liturgy": u"תפילה",
        "Piyutim": u"פיוטים",
        "Musar": u"ספרי מוסר",
        "Chasidut": u"חסידות",
        "Parshanut": u"פרשנות",
        "Philosophy": u"מחשבת ישראל",
        "Maharal": u'מהר"ל מפראג',
        "Apocrypha": u"ספרים חיצונים",
        "Seder Zeraim": u"סדר זרעים",
        "Seder Moed": u"סדר מועד",
        "Seder Nashim": u"סדר נשים",
        "Seder Nezikin": u"סדר נזיקין",
        "Seder Kodashim": u"סדר קדשים",
        "Seder Toharot": u"סדר טהרות",
        "Seder Tahorot": u"סדר טהרות",
        "Dictionary": u"מילון",
        "Early Jewish Thought": u"מחשבת ישראל קדומה",
        "Minor Tractates": u"מסכתות קטנות",
        # Gershayim belongs before the final letter of the acronym.
        "Rosh": u'רא"ש',
        "Maharsha": u'מהרשא',
    }

    pseudo_categories = {
        "Mishneh Torah": u"משנה תורה",
        'Introduction': u"הקדמה",
        'Sefer Madda': u"ספר מדע",
        'Sefer Ahavah': u"ספר אהבה",
        'Sefer Zemanim': u"ספר זמנים",
        'Sefer Nashim': u"ספר נשים",
        'Sefer Kedushah': u"ספר קדושה",
        'Sefer Haflaah': u"ספר הפלאה",
        'Sefer Zeraim': u"ספר זרעים",
        'Sefer Avodah': u"ספר עבודה",
        'Sefer Korbanot': u"ספר קורבנות",
        'Sefer Taharah': u"ספר טהרה",
        'Sefer Nezikim': u"ספר נזיקין",
        'Sefer Kinyan': u"ספר קניין",
        'Sefer Mishpatim': u"ספר משפטים",
        'Sefer Shoftim': u"ספר שופטים",
        "Shulchan Arukh": u"שולחן ערוך",
    }

    section_names = {
        "Chapter": u"פרק",
        "Perek": u"פרק",
        "Line": u"שורה",
        "Daf": u"דף",
        "Paragraph": u"פסקה",
        "Parsha": u"פרשה",
        "Parasha": u"פרשה",
        "Parashah": u"פרשה",
        "Seif": u"סעיף",
        "Se'if": u"סעיף",
        "Siman": u"סימן",
        "Section": u"חלק",
        "Verse": u"פסוק",
        "Sentence": u"משפט",
        "Sha'ar": u"שער",
        "Gate": u"שער",
        "Comment": u"פירוש",
        "Phrase": u"ביטוי",
        "Mishna": u"משנה",
        "Chelek": u"חלק",
        "Helek": u"חלק",
        "Year": u"שנה",
        "Masechet": u"מסכת",
        "Massechet": u"מסכת",
        "Letter": u"אות",
        "Halacha": u"הלכה",
        "Seif Katan": u"סעיף קטן",
        "Se'if Katan": u"סעיף קטן",
        "Volume": u"כרך",
        "Book": u"ספר",
        "Shar": u"שער",
        "Seder": u"סדר",
        "Part": u"חלק",
        "Pasuk": u"פסוק",
        "Sefer": u"ספר",
        "Teshuva": u"תשובה",
        "Teshuvot": u"תשובות",
        "Tosefta": u"תוספתא",
        "Halakhah": u"הלכה",
        "Kovetz": u"קובץ",
        "Path": u"נתיבה",
        "Parshah": u"פרשה",
        "Midrash": u"מדרש",
        "Mitzvah": u"מצוה",
        "Tefillah": u"תפילה",
        "Torah": u"תורה",
        "Perush": u"פירוש",
        "Peirush": u"פירוש",
        "Aliyah": u"עלייה",
        "Tikkun": u"תיקון",
        "Tikkunim": u"תיקונים",
    }

    # Merge the tables; on duplicate keys later tables win, as before.
    words = {}
    for table in (categories, pseudo_categories, section_names):
        words.update(table)

    if s in words:
        return words[s]

    # If s is a text title, look for a stored Hebrew title.
    # Imported here rather than at module level, as in the original.
    import re
    from sefaria.model import get_index, IndexSet
    from sefaria.system.exceptions import BookNameError
    try:
        i = get_index(s)
        return i.get_title("he")
    except BookNameError:
        # Try looking in the commentator section of a Commentary2 text.
        # Escape regex metacharacters so `s` is matched literally and an
        # unbalanced "(" or "[" cannot produce an invalid pattern.
        # (re.escape is avoided: on Python 2 it backslash-escapes every
        # non-alphanumeric byte, which would split UTF-8 encoded input.)
        literal = re.sub(r"([\\.^$*+?{}\[\]|()])", r"\\\1", s)
        indexes = IndexSet({"title": {"$regex": "^" + literal + " on "}})
        for i in indexes:
            # Only the first match is used.
            return i.toc_contents()["heCommentator"]

    return s