Пример #1
0
def export_links():
	"""
	Creates a single CSV file containing all links known to Sefaria.
	"""
	with open(SEFARIA_DATA_PATH + "/links/links.csv", 'wb') as csvfile:
		writer = csv.writer(csvfile)
		writer.writerow([
							"Citation 1",
							"Citation 2",
							"Conection Type",
							"Text 1",
							"Text 2",
							"Category 1",
							"Category 2",							
						 ])
		links = db.links.find().sort([["refs.0", 1]])
		for link in links:
			if random() > .99:
				print link["refs"][0]
			parsed1 = parse_ref(link["refs"][0])
			parsed2 = parse_ref(link["refs"][1])

			if "error" in parsed1 or "error" in parsed2:
				# Don't export bad links
				continue

			writer.writerow([
							link["refs"][0],
							link["refs"][1],
							link["type"],
							parsed1["book"],
							parsed2["book"],
							parsed1["categories"][0],
							parsed2["categories"][0],
						 ])
Пример #2
0
def count_texts(ref, lang=None):
	"""
	Count available versions of a text in the db, segment by segment.
	"""
	pref = sefaria.parse_ref(ref)
	if "error" in pref:
		return pref

	query = {"title": pref["book"]}
	if lang:
		query["language"] = lang

	counts = []
	# TODO Look at the sections requested in ref, not just total book
	for text in sefaria.db.texts.find(query):
		counts = sum_count_arrays(counts, count_array(text["chapter"]))

	depth = len(pref["sectionNames"])
	return {
		"counts": counts,
		"lengths": [sum_counts(counts, d) for d in range(depth)],
		"sectionNames": pref["sectionNames"],
	}
Пример #3
0
def remove_old_counts():
    """
	Deletes counts documents which no longer correspond to a text or category.
	"""
    counts = db.counts.find()
    for count in counts:
        if "title" in count:
            i = texts.parse_ref(count["title"])
            if "error" in i:
                print "Old text %s" % count["title"]
                db.counts.remove(count)
        else:
            # TODO incomplete
            continue
            categories = counts["categories"]
            i = db.index.find(
                {
                    "$and": [
                        {"categories.0": categories[0]},
                        {"categories": {"$all": categories}},
                        {"categories": {"$size": len(categories)}},
                    ]
                }
            )
            if not i.count():
                print "Old category %s" % " > ".join(categories)
Пример #4
0
def count_texts(ref, lang=None):
    """
    Count available versions of a text in the db, segment by segment.
    """
    pref = sefaria.parse_ref(ref)
    if "error" in pref:
        return pref
    depth = pref["textDepth"]

    query = {"title": pref["book"]}
    if lang:
        query["language"] = lang

    counts = []
    # TODO Look at the sections requested in ref, not just total book
    for text in sefaria.db.texts.find(query):
        counts = sum_count_arrays(counts, count_array(text["chapter"]))

    return {
        "counts": counts,
        "lengths": [sum_counts(counts, d) for d in range(depth)],
        "sectionNames": pref["sectionNames"],
    }
Пример #5
0
def is_ref_available(ref, lang):
	"""
	Returns True if at least one complete version of ref is available in lang.
	"""
	parsed = texts.parse_ref(ref)
	if "error" in parsed:
		return False

	# Fall back to a freshly computed counts doc when none is cached
	counts_doc = get_counts_doc(parsed["book"])
	if not counts_doc:
		counts_doc = update_text_count(parsed["book"])
	availability = counts_doc["availableTexts"][lang]

	segment = texts.grab_section_from_text(parsed["sections"],
	                                       availability,
	                                       toSections=parsed["toSections"])

	if isinstance(segment, list):
		return all(segment)
	return bool(segment)
Пример #6
0
def export_merged(title, lang=None):
	"""
	Exports a "merged" version of title, including the maximal text we have available
	in a single document. 
	"""
	if not lang:
		print title
		for lang in ("he", "en"):
			export_merged(title, lang=lang)
		return

	doc = parse_ref(title, pad=False)
	if "error" in doc:
		return
	doc.update({ 
		"title": title,
		"language": lang,
		"versionTitle": "merged",
		"versionSource": "http://www.sefaria.org/%s" % title.replace(" ", "_"),
	})
	text_docs = db.texts.find({"title": title, "language": lang})
	
	print "%d versions in %s" %(text_docs.count(), lang)

	if text_docs.count() == 0:
		return
	elif text_docs.count() == 1:
		text_doc = text_docs.next()
		doc["text"]          = text_doc["chapter"]
		doc["versions"]      = [(text_doc["versionTitle"], text_doc["versionSource"])]
	else:
		texts = []
		sources = []
		for text in text_docs:
			texts.append(text["chapter"])
			sources.append((text["versionTitle"], text["versionSource"]))

		merged, merged_sources = merge_translations(texts, sources)
		merged_sources = list(set(merged_sources))

		doc.update({ 
			"text": merged, 
			"versions": merged_sources,
 		})

	export_text_doc(doc)
Пример #7
0
def validate_review(review):
	"""
	Validates a review dict.

	Returns {"result": "ok"} when the review is valid, otherwise
	{"error": <message>} describing the first problem found.
	Requires fields: score, comment, ref, language, version.
	"""
	for field in ("score", "comment", "ref", "language", "version"):
		if field not in review:
			return {"error": "Required field '%s' is missing from this review." % field}

	try:
		score = float(review["score"])
		if score > 1 or score < 0:
			return {"error": "'score' must be a number between 0 and 1."}
	except (TypeError, ValueError):
		# float() raises ValueError for non-numeric strings (e.g. "abc") and
		# TypeError for non-string/number types (e.g. None); the original
		# caught only TypeError, letting ValueError crash the caller.
		return {"error": "'score' must be a number between 0 and 1."}

	pRef = texts.parse_ref(review["ref"])
	if "error" in pRef:
		return {"error": "Couldn't understand 'ref': %s" % pRef["error"]}

	return {"result": "ok"}
Пример #8
0
def index_text(ref, version=None, lang=None):
    """
    Index the text designated by ref.
    If no version and lang are given, this function will be called for each available version.
    Currently assumes ref is at section level.
    """
    ref = texts.norm_ref(unicode(ref))

    # Recall this function for each specific text version, if none provided
    if not (version and lang):
        for v in texts.get_version_list(ref):
            index_text(ref, version=v["versionTitle"], lang=v["language"])
        return

    # When ref specifies fewer sections than the text's full depth,
    # recurse to index each segment of this document individually.
    pRef = texts.parse_ref(ref)
    if len(pRef["sections"]) < len(pRef["sectionNames"]):
        text = texts.get_text(ref,
                              context=0,
                              commentary=False,
                              version=version,
                              lang=lang)
        if "error" in text:
            print text["error"]
        else:
            # Iterate over whichever language has more segments
            for i in range(max(len(text["text"]), len(text["he"]))):
                index_text("%s:%d" % (ref, i + 1))

    # Don't index refs more than one level above segment depth
    # (e.g. a whole depth-3 text as a single doc)
    if len(pRef["sections"]) < len(pRef["sectionNames"]) - 1:
        return

    # Index this document as a whole
    doc = make_text_index_document(ref, version, lang)
    if doc:
        try:
            es.index(doc, 'sefaria', 'text',
                     make_text_doc_id(ref, version, lang))
            # Module-level counter of successfully indexed documents
            global doc_count
            doc_count += 1
        except Exception, e:
            print "Error indexing %s / %s / %s" % (ref, version, lang)
            print e
Пример #9
0
def index_text(ref, version=None, lang=None):
    """
    Index the text designated by ref.
    If no version and lang are given, this function will be called for each available version.
    Currently assumes ref is at section level.
    """
    ref = texts.norm_ref(unicode(ref))

    # Recall this function for each specific text version, if none provided
    if not (version and lang):
        for v in texts.get_version_list(ref):
            index_text(ref, version=v["versionTitle"], lang=v["language"])
        return

    # When ref specifies fewer sections than the text's full depth,
    # recurse to index each segment of this document individually.
    pRef = texts.parse_ref(ref)
    if len(pRef["sections"]) < len(pRef["sectionNames"]):
        text = texts.get_text(ref, context=0, commentary=False, version=version, lang=lang)
        if "error" in text:
            print text["error"]
        else:
            # Iterate over whichever language has more segments
            for i in range(max(len(text["text"]), len(text["he"]))):
                index_text("%s:%d" % (ref, i+1))

    # Don't index refs more than one level above segment depth
    # (e.g. a whole depth-3 text as a single doc)
    if len(pRef["sections"]) < len(pRef["sectionNames"]) - 1:
        return

    # Index this document as a whole
    doc = make_text_index_document(ref, version, lang)
    if doc:
        try:
            # Module-level counter; log progress every 5000 documents
            global doc_count
            if doc_count % 5000 == 0:
                print "[%d] Indexing %s / %s / %s" % (doc_count, ref, version, lang)
            es.index('sefaria', 'text', doc, make_text_doc_id(ref, version, lang))
            doc_count += 1
        except Exception, e:
            print "ERROR indexing %s / %s / %s" % (ref, version, lang)
            pprint(e)