def generate_texts_sitemaps():
    """
    Create sitemap files for each text section for which content is available.

    One URL is built per ref returned by generate_refs_list(); refs for which
    url_ref() returns a falsy value are left out. The URLs are split into
    chunks of 40,000 and each chunk is passed to write_urls() together with
    the filename "texts-sitemap<n>.txt", where <n> counts up from 0.

    The chunk size is kept below the limit of 50k URLs per sitemap.

    Returns the number of files written.
    """
    refs = generate_refs_list()

    # Convert each ref exactly once; refs with no URL form are skipped.
    url_paths = (url_ref(ref) for ref in refs)
    urls = ["http://www.sefaria.org/" + path for path in url_paths if path]

    maps = list(chunks(urls, 40000))
    for n, sitemap_urls in enumerate(maps):
        write_urls(sitemap_urls, "texts-sitemap%d.txt" % n)

    return len(maps)
def generate_texts_sitemaps():
    """
    Write the sitemap files covering every text section that has content.

    Each ref from generate_refs_list() is turned into a URL through
    model.Ref(...).url(). The URLs are grouped into chunks of 40,000 (a
    sitemap can have only 50k URLs) and every chunk is handed to
    write_urls() with the filename "texts-sitemap<n>.txt", <n> starting at 0.

    Returns the number of files written.
    """
    urls = []
    for tref in generate_refs_list():
        urls.append("http://www.sefaria.org/" + model.Ref(tref).url())

    maps = list(chunks(urls, 40000))
    for file_number, url_chunk in enumerate(maps):
        filename = "texts-sitemap%d.txt" % file_number
        write_urls(url_chunk, filename)

    return len(maps)
def index_all_sections(skip=0):
    """
    Step through refs of all sections of available text and index each.

    skip -- number of refs at the start of the list from
            counts.generate_refs_list() to leave unindexed. Exactly `skip`
            refs are skipped.

    Resets the module-level doc_count to 0 before indexing and prints it at
    the end (presumably index_text() increments it -- confirm). A progress
    line is printed for every ref whose position in the full ref list is a
    multiple of 200.
    """
    global doc_count
    doc_count = 0

    refs = counts.generate_refs_list()

    # Slice once and number from `skip`, so `i` is always the ref's position
    # in the full list. Slicing AND starting the index at `skip` inside the
    # sliced list would drop 2 * skip refs.
    for i, ref in enumerate(refs[skip:], skip):
        index_text(ref)
        if i % 200 == 0:
            # Single-argument parenthesised print behaves the same under
            # Python 2 and Python 3.
            print("Indexed Ref #%d" % i)

    print("Indexed %d documents." % doc_count)
def index_all_sections(skip=0):
    """
    Step through refs of all sections of available text and index each.

    skip -- number of refs at the start of the list from
            counts.generate_refs_list() to leave unindexed. Exactly `skip`
            refs are skipped.

    Resets the module-level doc_count to 0 before indexing and prints it at
    the end (presumably index_text() increments it -- confirm). The total
    number of refs (before skipping) is printed up front, and a progress
    line is printed for every ref whose position in the full ref list is a
    multiple of 200.
    """
    global doc_count
    doc_count = 0

    refs = counts.generate_refs_list()
    # Single-argument parenthesised print behaves the same under Python 2
    # and Python 3.
    print("Beginning index of %d refs." % len(refs))

    # Slice once and number from `skip`, so `i` is always the ref's position
    # in the full list. Slicing AND starting the index at `skip` inside the
    # sliced list would drop 2 * skip refs.
    for i, ref in enumerate(refs[skip:], skip):
        index_text(ref)
        if i % 200 == 0:
            print("Indexed Ref #%d" % i)

    print("Indexed %d documents." % doc_count)
def count_terms(query={}, lang=None):
    """
    Counts all terms in texts matching query, lang.
    Saves results to the terms collection in db.

    query -- filter passed to db.index.find(); the titles of the matching
             index records decide which refs are scanned. The default dict
             is only read here, never mutated.
    lang  -- language code stored on every saved term. Text is read from the
             same key of the text contents, except that "ar" reads from "he".

    For every line of every matching ref, punctuation is removed, the line is
    split on single spaces and nikkud is stripped from each piece. Pieces
    that end up empty are ignored. Each saved term document holds the term,
    its occurrence count, the language and up to 20 sampled line refs
    ("<ref>:<line number>", 1-based).
    """
    #todo: move to object model.  Maybe.  What's this doing?
    terms = {}

    # Resolve the caller's query to index titles, then restrict the ref list
    # to those titles.
    titles = db.index.find(query).distinct("title")
    refs = counts.generate_refs_list({"title": {"$in": titles}})  # library.ref_list() needs query argument

    lookup_lang = "he" if lang == "ar" else lang

    for ref in refs:
        # Single-argument parenthesised print behaves the same under
        # Python 2 and Python 3.
        print(ref)
        text = TextFamily(Ref(ref), commentary=False).contents()
        for i, line in enumerate(text.get(lookup_lang, [])):
            # The same for every term on this line, so build it once.
            line_ref = "%s:%d" % (ref, i + 1)

            # strip punctuation
            for c in string.punctuation:
                line = line.replace(c, "")

            for term in line.split(" "):
                term = hebrew.strip_nikkud(term)
                if not term:
                    # Consecutive spaces (common once punctuation is gone)
                    # make split(" ") return empty strings; "" is not a term.
                    continue
                entry = terms.get(term)
                if entry is None:
                    terms[term] = {
                        "term": term,
                        "occurrences": 1,
                        "language": lang,
                        "refs": set([line_ref])
                    }
                else:
                    entry["occurrences"] += 1
                    entry["refs"].add(line_ref)

    for term in terms:
        print(term)
        entry = terms[term]
        # only include up to 20 random ref samples
        sample_size = min(len(entry["refs"]), 20)
        entry["refs"] = list(sample(entry["refs"], sample_size))
        db.terms.save(entry)
def count_terms(query={}, lang=None):
    """
    Counts all terms in texts matching query, lang.
    Saves results to the terms collection in db.

    query -- filter passed to db.index.find(); the titles of the matching
             index records decide which refs are scanned. The default dict
             is only read here, never mutated.
    lang  -- language code stored on every saved term. Text is read from the
             same key of the text contents, except that "ar" reads from "he".

    Each line has its punctuation removed and is split on single spaces;
    nikkud is stripped from every piece and empty pieces are dropped. A term
    document (term, occurrences, language, up to 20 sampled line refs of the
    form "<ref>:<1-based line number>") is saved per distinct term.
    """
    #todo: move to object model.  Maybe.  What's this doing?
    terms = {}

    # Two-step lookup: query -> index titles -> refs for those titles.
    matching_titles = db.index.find(query).distinct("title")
    title_query = {"title": {"$in": matching_titles}}
    refs = counts.generate_refs_list(title_query)  # library.ref_list() needs query argument

    lookup_lang = "he" if lang == "ar" else lang

    for ref in refs:
        # Single-argument parenthesised print behaves the same under
        # Python 2 and Python 3.
        print(ref)
        text = TextFamily(Ref(ref), commentary=False).contents()
        for i, line in enumerate(text.get(lookup_lang, [])):
            # Loop-invariant for all terms of this line; computed once.
            line_ref = "%s:%d" % (ref, i + 1)

            # strip punctuation
            for c in string.punctuation:
                line = line.replace(c, "")

            for raw_term in line.split(" "):
                term = hebrew.strip_nikkud(raw_term)
                if not term:
                    # split(" ") returns "" between consecutive spaces;
                    # an empty string must not be counted as a term.
                    continue
                if term in terms:
                    terms[term]["occurrences"] += 1
                    terms[term]["refs"].add(line_ref)
                else:
                    terms[term] = {
                        "term": term,
                        "occurrences": 1,
                        "language": lang,
                        "refs": set([line_ref])
                    }

    for term in terms:
        print(term)
        term_refs = terms[term]["refs"]
        # only include up to 20 random ref samples
        sample_size = min(len(term_refs), 20)
        terms[term]["refs"] = list(sample(term_refs, sample_size))
        db.terms.save(terms[term])