def export_links():
    """
    Creates a single CSV file containing all links known to Sefaria.
    """
    with open(SEFARIA_DATA_PATH + "/links/links.csv", 'wb') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow([
            "Citation 1",
            "Citation 2",
            "Connection Type",
            "Text 1",
            "Text 2",
            "Category 1",
            "Category 2",
        ])
        links = db.links.find().sort([["refs.0", 1]])
        for link in links:
            if random() > .99:
                # Print ~1% of refs as a progress indicator
                print link["refs"][0]
            parsed1 = parse_ref(link["refs"][0])
            parsed2 = parse_ref(link["refs"][1])
            if "error" in parsed1 or "error" in parsed2:
                # Don't export bad links
                continue
            writer.writerow([
                link["refs"][0],
                link["refs"][1],
                link["type"],
                parsed1["book"],
                parsed2["book"],
                parsed1["categories"][0],
                parsed2["categories"][0],
            ])
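# Hypothetical invocation, not from the source: export_links() assumes the
# surrounding module supplies SEFARIA_DATA_PATH, the Mongo handle `db`,
# parse_ref, and the csv/random imports. Each CSV row pairs two citations
# with the link type and each side's book and top-level category.
#
#   export_links()
#   # -> writes SEFARIA_DATA_PATH + "/links/links.csv"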
def count_texts(ref, lang=None):
    """
    Count available versions of a text in the db, segment by segment.
    """
    counts = []
    pref = sefaria.parse_ref(ref)
    if "error" in pref:
        return pref
    depth = len(pref["sectionNames"])

    query = {"title": pref["book"]}
    if lang:
        query["language"] = lang

    texts = sefaria.db.texts.find(query)
    for text in texts:
        # TODO Look at the sections requested in ref, not just total book
        this_count = count_array(text["chapter"])
        counts = sum_count_arrays(counts, this_count)

    result = {"counts": counts, "lengths": [], "sectionNames": pref["sectionNames"]}
    #result = dict(result.items() + pref.items())

    for d in range(depth):
        result["lengths"].append(sum_counts(counts, d))

    return result
def remove_old_counts():
    """
    Deletes counts documents which no longer correspond to a text or category.
    """
    counts = db.counts.find()
    for count in counts:
        if "title" in count:
            i = texts.parse_ref(count["title"])
            if "error" in i:
                print "Old text %s" % count["title"]
                db.counts.remove(count)
        else:
            # TODO incomplete
            continue
            categories = count["categories"]
            i = db.index.find({"$and": [
                {"categories.0": categories[0]},
                {"categories": {"$all": categories}},
                {"categories": {"$size": len(categories)}},
            ]})
            if not i.count():
                print "Old category %s" % " > ".join(categories)
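# Illustrative note, not from the source: the three $and clauses above
# together enforce same first category ("categories.0"), same members
# ($all), and same length ($size) -- an exact match up to ordering of the
# remaining items. So a counts doc with categories ["Tanach", "Torah"] is
# considered current only while some index record still carries exactly
# those two categories, with "Tanach" first.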
def count_texts(ref, lang=None):
    """
    Count available versions of a text in the db, segment by segment.
    """
    counts = []
    pref = sefaria.parse_ref(ref)
    if "error" in pref:
        return pref
    depth = pref["textDepth"]

    query = {"title": pref["book"]}
    if lang:
        query["language"] = lang

    texts = sefaria.db.texts.find(query)
    for text in texts:
        # TODO Look at the sections requested in ref, not just total book
        this_count = count_array(text["chapter"])
        counts = sum_count_arrays(counts, this_count)

    result = {"counts": counts, "lengths": [], "sectionNames": pref["sectionNames"]}
    #result = dict(result.items() + pref.items())

    for d in range(depth):
        result["lengths"].append(sum_counts(counts, d))

    return result
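# Illustrative use of count_texts (depends on a live Sefaria database; the
# values below are only a sketch of the result's shape, not real output):
#
#   result = count_texts("Genesis", lang="en")
#   result["sectionNames"]  # e.g. ["Chapter", "Verse"]
#   result["counts"]        # nested per-chapter, per-verse version counts
#   result["lengths"]       # one total per depth, e.g. [50, 1533]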
def is_ref_available(ref, lang):
    """
    Returns True if at least one complete version of ref is available in lang.
    """
    p = texts.parse_ref(ref)
    if "error" in p:
        return False
    counts_doc = get_counts_doc(p["book"])
    if not counts_doc:
        counts_doc = update_text_count(p["book"])
    counts = counts_doc["availableTexts"][lang]

    segment = texts.grab_section_from_text(p["sections"], counts, toSections=p["toSections"])
    if not isinstance(segment, list):
        segment = [segment]

    return all(segment)
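# Hypothetical usage (the answer depends on the counts collection): because
# the segment list is passed to all(), a single missing segment is enough to
# report the whole ref as unavailable.
#
#   if is_ref_available("Genesis 1", "en"):
#       pass  # safe to serve the English text for the whole section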
def export_merged(title, lang=None):
    """
    Exports a "merged" version of title, including the maximal text we have
    available in a single document.
    """
    if not lang:
        print title
        for lang in ("he", "en"):
            export_merged(title, lang=lang)
        return

    doc = parse_ref(title, pad=False)
    if "error" in doc:
        return
    doc.update({
        "title": title,
        "language": lang,
        "versionTitle": "merged",
        "versionSource": "http://www.sefaria.org/%s" % title.replace(" ", "_"),
    })
    text_docs = db.texts.find({"title": title, "language": lang})
    print "%d versions in %s" % (text_docs.count(), lang)

    if text_docs.count() == 0:
        return
    elif text_docs.count() == 1:
        text_doc = text_docs.next()
        doc["text"] = text_doc["chapter"]
        doc["versions"] = [(text_doc["versionTitle"], text_doc["versionSource"])]
    else:
        texts = []
        sources = []
        for text in text_docs:
            texts.append(text["chapter"])
            sources.append((text["versionTitle"], text["versionSource"]))

        merged, merged_sources = merge_translations(texts, sources)
        merged_sources = list(set(merged_sources))
        doc.update({
            "text": merged,
            "versions": merged_sources,
        })

    export_text_doc(doc)
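# Illustrative calls (assume the module's db handle, merge_translations, and
# export_text_doc are wired up): calling without lang recurses once per
# language, so a single call covers both Hebrew and English.
#
#   export_merged("Genesis")             # exports merged "he" and "en" docs
#   export_merged("Genesis", lang="en")  # exports only the English merge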
def validate_review(review):
    # All fields are required
    for field in ("score", "comment", "ref", "language", "version"):
        if field not in review:
            return {"error": "Required field '%s' is missing from this review." % field}
    try:
        score = float(review["score"])
        if score > 1 or score < 0:
            return {"error": "'score' must be a number between 0 and 1."}
    except (TypeError, ValueError):
        # float() raises TypeError for None-like input and ValueError for
        # non-numeric strings; both mean the score is invalid.
        return {"error": "'score' must be a number between 0 and 1."}
    pRef = texts.parse_ref(review["ref"])
    if "error" in pRef:
        return {"error": "Couldn't understand 'ref': %s" % pRef["error"]}

    return {"result": "ok"}
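# Hypothetical inputs showing the contract (field values are invented):
#
#   validate_review({"score": "0.8", "comment": "Clear translation",
#                    "ref": "Genesis 1:1", "language": "en",
#                    "version": "Some Version"})
#   # -> {"result": "ok"}, assuming texts.parse_ref accepts the ref
#
#   validate_review({"score": 0.8})
#   # -> {"error": "Required field 'comment' is missing from this review."}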
def index_text(ref, version=None, lang=None):
    """
    Index the text designated by ref.
    If no version and lang are given, this function will be called for each
    available version.
    Currently assumes ref is at section level.
    """
    ref = texts.norm_ref(unicode(ref))

    # Recall this function for each specific text version, if none provided
    if not (version and lang):
        for v in texts.get_version_list(ref):
            index_text(ref, version=v["versionTitle"], lang=v["language"])
        return

    # Index each segment of this document individually
    pRef = texts.parse_ref(ref)
    if len(pRef["sections"]) < len(pRef["sectionNames"]):
        text = texts.get_text(ref, context=0, commentary=False, version=version, lang=lang)
        if "error" in text:
            print text["error"]
        else:
            for i in range(max(len(text["text"]), len(text["he"]))):
                index_text("%s:%d" % (ref, i + 1))

    # Don't index docs more than one level above segment depth
    # (e.g., a whole book of a depth-3 text as a single document)
    if len(pRef["sections"]) < len(pRef["sectionNames"]) - 1:
        return

    # Index this document as a whole
    doc = make_text_index_document(ref, version, lang)
    if doc:
        try:
            es.index(doc, 'sefaria', 'text', make_text_doc_id(ref, version, lang))
            global doc_count
            doc_count += 1
        except Exception, e:
            print "Error indexing %s / %s / %s" % (ref, version, lang)
            print e
def index_text(ref, version=None, lang=None):
    """
    Index the text designated by ref.
    If no version and lang are given, this function will be called for each
    available version.
    Currently assumes ref is at section level.
    """
    ref = texts.norm_ref(unicode(ref))

    # Recall this function for each specific text version, if none provided
    if not (version and lang):
        for v in texts.get_version_list(ref):
            index_text(ref, version=v["versionTitle"], lang=v["language"])
        return

    # Index each segment of this document individually
    pRef = texts.parse_ref(ref)
    if len(pRef["sections"]) < len(pRef["sectionNames"]):
        text = texts.get_text(ref, context=0, commentary=False, version=version, lang=lang)
        if "error" in text:
            print text["error"]
        else:
            for i in range(max(len(text["text"]), len(text["he"]))):
                index_text("%s:%d" % (ref, i + 1))

    # Don't index docs more than one level above segment depth
    # (e.g., a whole book of a depth-3 text as a single document)
    if len(pRef["sections"]) < len(pRef["sectionNames"]) - 1:
        return

    # Index this document as a whole
    doc = make_text_index_document(ref, version, lang)
    if doc:
        try:
            global doc_count
            if doc_count % 5000 == 0:
                # Log progress every 5000 documents
                print "[%d] Indexing %s / %s / %s" % (doc_count, ref, version, lang)
            es.index('sefaria', 'text', doc, make_text_doc_id(ref, version, lang))
            doc_count += 1
        except Exception, e:
            print "ERROR indexing %s / %s / %s" % (ref, version, lang)
            pprint(e)
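# Hypothetical entry point (assumes an Elasticsearch client bound to `es`,
# the texts module, pprint imported, and a module-level doc_count = 0): a
# section-level ref fans out to every available version, then recurses into
# every segment of each.
#
#   doc_count = 0
#   index_text("Genesis 1")  # indexes Genesis 1 and Genesis 1:1, 1:2, ...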