Пример #1
0
def flatten_toc(toc, include_categories=False, categories_in_titles=False, version_granularity=False):
    """
    Returns an array of strings which corresponds to each category and text in the
    Table of Contents in order.

    - toc: list of dicts. An entry with a "category" key is a category whose
        "contents" list is flattened recursively; an entry with a "title" key is a text.
    - include_categories: whether to include an entry for each category name,
        placed ahead of the entries for its contents.
    - categories_in_titles: whether to include each category preceding a text title,
        e.g., "Tanach > Torah > Genesis".
    - version_granularity: whether to include a separate entry for every text version,
        formatted as "<title> > <Hebrew|English> > <versionTitle>.json". A version
        whose language is neither "he" nor "en" raises KeyError.

    All three options apply at every level of nesting.
    """
    results = []
    for x in toc:
        name = x.get("category", None) or x.get("title", None)
        if "category" in x:
            if include_categories:
                results.append(name)
            # Forward every option: previously only categories_in_titles was passed,
            # so include_categories and version_granularity silently reset to False
            # for everything below the top level.
            subcats = flatten_toc(x["contents"],
                                  include_categories=include_categories,
                                  categories_in_titles=categories_in_titles,
                                  version_granularity=version_granularity)
            if categories_in_titles:
                subcats = ["%s > %s" % (name, y) for y in subcats]
            results += subcats

        elif "title" in x:
            if not version_granularity:
                results.append(name)
            else:
                # One entry per version reported for this title.
                for v in texts.get_version_list(name):
                    lang = {"he": "Hebrew", "en": "English"}[v["language"]]
                    results.append("%s > %s > %s.json" % (name, lang, v["versionTitle"]))

    return results
Пример #2
0
def index_text(tref, version=None, lang=None):
    """
    Index the text designated by tref.

    If version and lang are not both given, this function will be called once for
    each available version (as listed by texts.get_version_list) and then returns.
    Otherwise, when tref is above segment level, every segment below it is indexed
    for the same version and lang, and finally tref itself is indexed as one
    document unless it sits more than one level above segment level.
    Currently assumes ref is at section level.

    Errors raised while sending a document to the search index are printed and
    swallowed so that a long indexing run keeps going.
    """
    global doc_count

    tref = model.Ref(tref).normal()

    # Recall this function for each specific text version, if none provided
    if not (version and lang):
        for v in texts.get_version_list(tref):
            index_text(tref, version=v["versionTitle"], lang=v["language"])
        return

    # Index each segment of this document individually
    oref = model.Ref(tref).padded_ref()
    if len(oref.sections) < len(oref.index.sectionNames):
        text = texts.get_text(tref,
                              context=0,
                              commentary=False,
                              version=version,
                              lang=lang)
        if "error" in text:
            print(text["error"])
        else:
            # Keep version and lang on the recursive call. Without them each
            # segment fans back out over every version, once per version pass,
            # re-indexing the same documents over and over.
            for i in range(max(len(text["text"]), len(text["he"]))):
                index_text("%s:%d" % (tref, i + 1), version=version, lang=lang)

    # Don't index a whole-document entry for refs that are more than one
    # level above segment level (e.g. the top of a depth-3 text).
    if len(oref.sections) < len(oref.index.sectionNames) - 1:
        return

    # Index this document as a whole
    doc = make_text_index_document(tref, version, lang)
    if doc:
        try:
            # Progress line every 5000 indexed documents.
            if doc_count % 5000 == 0:
                print("[%d] Indexing %s / %s / %s" % (doc_count, tref, version,
                                                      lang))
            es.index('sefaria', 'text', doc,
                     make_text_doc_id(tref, version, lang))
            doc_count += 1
        except Exception as e:
            print("ERROR indexing %s / %s / %s" % (tref, version, lang))
            pprint(e)
Пример #3
0
def index_text(ref, version=None, lang=None):
    """
    Index the text designated by ref.

    If version and lang are not both given, this function will be called once for
    each available version (as listed by texts.get_version_list) and then returns.
    Otherwise, when ref is above segment level, every segment below it is indexed
    for the same version and lang, and finally ref itself is indexed as one
    document unless it sits more than one level above segment level.
    Currently assumes ref is at section level.

    Errors raised while sending a document to the search index are printed and
    swallowed so that a long indexing run keeps going.
    """
    global doc_count

    ref = texts.norm_ref(unicode(ref))

    # Recall this function for each specific text version, if none provided
    if not (version and lang):
        for v in texts.get_version_list(ref):
            index_text(ref, version=v["versionTitle"], lang=v["language"])
        return

    # Index each segment of this document individually
    pRef = texts.parse_ref(ref)
    if len(pRef["sections"]) < len(pRef["sectionNames"]):
        text = texts.get_text(ref,
                              context=0,
                              commentary=False,
                              version=version,
                              lang=lang)
        if "error" in text:
            print(text["error"])
        else:
            # Keep version and lang on the recursive call. Without them each
            # segment fans back out over every version, once per version pass,
            # re-indexing the same documents over and over.
            for i in range(max(len(text["text"]), len(text["he"]))):
                index_text("%s:%d" % (ref, i + 1), version=version, lang=lang)

    # Don't index a whole-document entry for refs that are more than one
    # level above segment level (e.g. the top of a depth-3 text).
    if len(pRef["sections"]) < len(pRef["sectionNames"]) - 1:
        return

    # Index this document as a whole
    doc = make_text_index_document(ref, version, lang)
    if doc:
        try:
            es.index(doc, 'sefaria', 'text',
                     make_text_doc_id(ref, version, lang))
            doc_count += 1
        except Exception as e:
            print("Error indexing %s / %s / %s" % (ref, version, lang))
            print(e)
Пример #4
0
def index_text(tref, version=None, lang=None):
    """
    Index the text designated by tref.

    When version and lang are not both supplied, the function re-invokes itself
    for every version listed by texts.get_version_list(tref) and returns.
    With both supplied it first indexes each segment beneath tref (same version
    and lang), then indexes tref as a single document, except when tref is more
    than one level above segment level.
    Currently assumes ref is at section level.

    Failures while writing to the search index are reported on stdout and do not
    propagate.
    """
    global doc_count

    tref = model.Ref(tref).normal()

    # No specific version requested: fan out over all of them.
    if not (version and lang):
        for v in texts.get_version_list(tref):
            index_text(tref, version=v["versionTitle"], lang=v["language"])
        return

    # Index each segment of this document individually
    oref = model.Ref(tref).padded_ref()
    depth = len(oref.index.sectionNames)
    if len(oref.sections) < depth:
        text = texts.get_text(tref, context=0, commentary=False, version=version, lang=lang)
        if "error" in text:
            print(text["error"])
        else:
            segment_count = max(len(text["text"]), len(text["he"]))
            for i in range(segment_count):
                # Carry version/lang down. Dropping them would restart the
                # all-versions fan-out for every segment on every version pass.
                index_text("%s:%d" % (tref, i + 1), version=version, lang=lang)

    # Refs more than one level above segment level get no whole-document entry.
    if len(oref.sections) < depth - 1:
        return

    # Index this document as a whole
    doc = make_text_index_document(tref, version, lang)
    if doc:
        try:
            # Emit a progress line once per 5000 indexed documents.
            if doc_count % 5000 == 0:
                print("[%d] Indexing %s / %s / %s" % (doc_count, tref, version, lang))
            es.index('sefaria', 'text', doc, make_text_doc_id(tref, version, lang))
            doc_count += 1
        except Exception as e:
            print("ERROR indexing %s / %s / %s" % (tref, version, lang))
            pprint(e)
Пример #5
0
def index_text(ref, version=None, lang=None):
    """
    Index the text designated by ref.

    When version and lang are not both supplied, the function re-invokes itself
    for every version listed by texts.get_version_list(ref) and returns.
    With both supplied it first indexes each segment beneath ref (same version
    and lang), then indexes ref as a single document, except when ref is more
    than one level above segment level.
    Currently assumes ref is at section level.

    Failures while writing to the search index are reported on stdout and do not
    propagate.
    """
    global doc_count

    ref = texts.norm_ref(unicode(ref))

    # No specific version requested: fan out over all of them.
    if not (version and lang):
        for v in texts.get_version_list(ref):
            index_text(ref, version=v["versionTitle"], lang=v["language"])
        return

    # Index each segment of this document individually
    pRef = texts.parse_ref(ref)
    depth = len(pRef["sectionNames"])
    if len(pRef["sections"]) < depth:
        text = texts.get_text(ref, context=0, commentary=False, version=version, lang=lang)
        if "error" in text:
            print(text["error"])
        else:
            segment_count = max(len(text["text"]), len(text["he"]))
            for i in range(segment_count):
                # Carry version/lang down. Dropping them would restart the
                # all-versions fan-out for every segment on every version pass.
                index_text("%s:%d" % (ref, i + 1), version=version, lang=lang)

    # Refs more than one level above segment level get no whole-document entry.
    if len(pRef["sections"]) < depth - 1:
        return

    # Index this document as a whole
    doc = make_text_index_document(ref, version, lang)
    if doc:
        try:
            es.index(doc, 'sefaria', 'text', make_text_doc_id(ref, version, lang))
            doc_count += 1
        except Exception as e:
            print("Error indexing %s / %s / %s" % (ref, version, lang))
            print(e)
Пример #6
0
def flatten_toc(toc,
                include_categories=False,
                categories_in_titles=False,
                version_granularity=False):
    """
    Returns an array of strings which corresponds to each category and text in the
    Table of Contents in order.

    - toc: list of dicts. Entries carrying a "category" key are categories and
        have their "contents" list flattened recursively; entries carrying a
        "title" key are texts.
    - include_categories: whether to emit each category name as its own entry,
        ahead of the entries for its contents.
    - categories_in_titles: whether to include each category preceding a text title,
        e.g., "Tanach > Torah > Genesis".
    - version_granularity: whether to include a separate entry for every text version,
        in the form "<title> > <Hebrew|English> > <versionTitle>.json". Raises
        KeyError for a version whose language is not "he" or "en".

    The options are honoured at every depth of the table of contents.
    """
    language_names = {"he": "Hebrew", "en": "English"}

    results = []
    for x in toc:
        name = x.get("category", None) or x.get("title", None)
        if "category" in x:
            if include_categories:
                results.append(name)
            # All three options must travel with the recursion. Forwarding only
            # categories_in_titles (as before) turned the other two off for
            # every nested category.
            subcats = flatten_toc(x["contents"],
                                  include_categories=include_categories,
                                  categories_in_titles=categories_in_titles,
                                  version_granularity=version_granularity)
            if categories_in_titles:
                subcats = ["%s > %s" % (name, y) for y in subcats]
            results.extend(subcats)

        elif "title" in x:
            if not version_granularity:
                results.append(name)
            else:
                # Expand the title into one entry per listed version.
                results.extend(
                    "%s > %s > %s.json" % (name, language_names[v["language"]], v["versionTitle"])
                    for v in texts.get_version_list(name)
                )

    return results