示例#1
0
def generate_texts_sitemaps():
	"""
	Create a sitemap for each text section for which content is available.

	Returns the number of files written (each sitemap can have only 50k URLs).
	"""
	refs = generate_refs_list()
	# Build each URL once: the original comprehension called url_ref(ref)
	# twice per ref (once in the condition, once in the expression).
	ref_urls = (url_ref(ref) for ref in refs)
	urls = ["http://www.sefaria.org/" + u for u in ref_urls if u]

	# 40k per file stays safely under the sitemap protocol's 50k URL limit.
	maps = list(chunks(urls, 40000))

	for n, url_chunk in enumerate(maps):
		write_urls(url_chunk, "texts-sitemap%d.txt" % n)

	return len(maps)
示例#2
0
def generate_texts_sitemaps():
    """
    Create a sitemap file for every text section that has content.

    Returns the number of sitemap files written (each file may hold at
    most 50k URLs per the sitemap protocol).
    """
    base = "http://www.sefaria.org/"
    urls = [base + model.Ref(tref).url() for tref in generate_refs_list()]

    batches = list(chunks(urls, 40000))

    for index, batch in enumerate(batches):
        write_urls(batch, "texts-sitemap%d.txt" % index)

    return len(batches)
示例#3
0
def index_all_sections(skip=0):
    """
    Step through refs of all sections of available text and index each.

    :param skip: number of refs at the head of the list to skip, useful
        for resuming an interrupted indexing run.
    """
    global doc_count
    doc_count = 0

    refs = counts.generate_refs_list()

    # Bug fix: the original both sliced `refs = refs[skip:]` AND started
    # the loop at `range(skip, ...)`, which skipped the first `skip` refs
    # twice and silently dropped the last `skip` refs. Skip exactly once.
    for i in range(skip, len(refs)):
        index_text(refs[i])
        if i % 200 == 0:
            print("Indexed Ref #%d" % i)

    print("Indexed %d documents." % doc_count)
示例#4
0
def index_all_sections(skip=0):
    """
    Step through refs of all sections of available text and index each.

    :param skip: number of refs at the head of the list to skip, useful
        for resuming an interrupted indexing run.
    """
    global doc_count
    doc_count = 0

    refs = counts.generate_refs_list()
    print("Beginning index of %d refs." % len(refs))

    # Bug fix: the original both sliced `refs = refs[skip:]` AND started
    # the loop at `range(skip, ...)`, which skipped the first `skip` refs
    # twice and silently dropped the last `skip` refs. Skip exactly once.
    for i in range(skip, len(refs)):
        index_text(refs[i])
        if i % 200 == 0:
            print("Indexed Ref #%d" % i)

    print("Indexed %d documents." % doc_count)
示例#5
0
def count_terms(query=None, lang=None):
    #todo: move to object model.  Maybe.  What's this doing?
    """
    Count all terms in texts matching `query` and `lang`.
    Saves results to the terms collection in the db.

    :param query: mongo query dict selecting index records by title;
        defaults to matching everything.
    :param lang: language code to count; "ar" texts are stored under "he".
    """
    # Avoid the mutable-default-argument pitfall of the original `query={}`.
    if query is None:
        query = {}
    terms = {}
    bavli_names = db.index.find(query).distinct("title")
    query = {"title": {"$in": bavli_names}}
    refs = counts.generate_refs_list(
        query)  #library.ref_list() needs query argument
    # Aramaic ("ar") text lives under the Hebrew key.
    lookup_lang = "he" if lang == "ar" else lang

    for ref in refs:
        print(ref)
        text = TextFamily(Ref(ref), commentary=False).contents()
        for i, line in enumerate(text.get(lookup_lang, [])):
            # strip punctuation
            for c in string.punctuation:
                line = line.replace(c, "")
            for term in line.split(" "):
                line_ref = "%s:%d" % (ref, i + 1)
                term = hebrew.strip_nikkud(term)
                if term in terms:
                    terms[term]["occurrences"] += 1
                    terms[term]["refs"].add(line_ref)
                else:
                    terms[term] = {
                        "term": term,
                        "occurrences": 1,
                        "language": lang,
                        "refs": set([line_ref])
                    }

    for term in terms:
        print(term)
        # only include up to 20 random ref samples
        sample_size = min(len(terms[term]["refs"]), 20)
        # sample() requires a sequence on modern Python, so listify the
        # set first; sample() already returns a list.
        terms[term]["refs"] = sample(list(terms[term]["refs"]), sample_size)
        db.terms.save(terms[term])
示例#6
0
def count_terms(query=None, lang=None):
    #todo: move to object model.  Maybe.  What's this doing?
    """
    Count all terms in texts matching `query` and `lang`.
    Saves results to the terms collection in the db.

    :param query: mongo query dict selecting index records by title;
        defaults to matching everything.
    :param lang: language code to count; "ar" texts are stored under "he".
    """
    # Avoid the mutable-default-argument pitfall of the original `query={}`.
    if query is None:
        query = {}
    terms = {}
    bavli_names = db.index.find(query).distinct("title")
    query = {"title": {"$in": bavli_names}}
    refs = counts.generate_refs_list(query)  #library.ref_list() needs query argument
    # Aramaic ("ar") text lives under the Hebrew key.
    lookup_lang = "he" if lang == "ar" else lang

    for ref in refs:
        print(ref)
        text = TextFamily(Ref(ref), commentary=False).contents()
        for i, line in enumerate(text.get(lookup_lang, [])):
            # strip punctuation
            for c in string.punctuation:
                line = line.replace(c, "")
            for term in line.split(" "):
                line_ref = "%s:%d" % (ref, i + 1)
                term = hebrew.strip_nikkud(term)
                if term in terms:
                    terms[term]["occurrences"] += 1
                    terms[term]["refs"].add(line_ref)
                else:
                    terms[term] = {
                        "term": term,
                        "occurrences": 1,
                        "language": lang,
                        "refs": set([line_ref])
                    }

    for term in terms:
        print(term)
        # only include up to 20 random ref samples
        sample_size = min(len(terms[term]["refs"]), 20)
        # sample() requires a sequence on modern Python, so listify the
        # set first; sample() already returns a list.
        terms[term]["refs"] = sample(list(terms[term]["refs"]), sample_size)
        db.terms.save(terms[term])