Exemplo n.º 1
0
def test_store_get_result():
    "test whether results can be stored and retrieved"
    from xtas.tasks.es import store_single, get_single_result, get_all_results
    idx, typ = ES_TEST_INDEX, ES_TEST_TYPE
    with clean_es() as es:
        id = es.index(index=idx, doc_type=typ, body={"text": "test"})['_id']
        assert_equal(get_single_result("task1", idx, typ, id), None)
        assert_equal(get_all_results(idx, typ, id), {})

        store_single("task1_result", "task1", idx, typ, id)
        client.indices.IndicesClient(es).flush()
        assert_equal(get_single_result("task1", idx, typ, id), "task1_result")
        assert_equal(get_all_results(idx, typ, id), {"task1": "task1_result"})

        # test second result and test non-scalar data
        task2_result = {"a": {"b": ["c", "d"]}}
        store_single(task2_result, "task2", idx, typ, id)
        client.indices.IndicesClient(es).flush()
        assert_equal(get_single_result("task1", idx, typ, id), "task1_result")
        assert_equal(get_single_result("task2", idx, typ, id), task2_result)
        assert_equal(get_all_results(idx, typ, id),
                     {"task1": "task1_result", "task2": task2_result})

        # store a task result under an existing task, check that it is replaced
        store_single("task1_result2", "task1", idx, typ, id)
        client.indices.IndicesClient(es).flush()
        assert_equal(get_single_result("task1", idx, typ, id), "task1_result2")
        assert_equal(get_single_result("task2", idx, typ, id), task2_result)
        assert_equal(get_all_results(idx, typ, id),
                     {"task1": "task1_result2", "task2": task2_result})

        # check that the original document is intact
        src = es.get_source(index=idx, doc_type=typ, id=id)
        assert_equal(src['text'], "test")
Exemplo n.º 2
0
def test_store_get_result():
    "test whether results can be stored and retrieved"
    from xtas.tasks.es import store_single, get_single_result, get_all_results
    idx, typ = ES_TEST_INDEX, ES_TEST_TYPE
    with clean_es() as es:
        id = es.index(index=idx, doc_type=typ, body={"text": "test"})['_id']
        assert_equal(get_single_result("task1", idx, typ, id), None)
        assert_equal(get_all_results(idx, typ, id), {})

        r = store_single("task1_result",
                         "task1",
                         idx,
                         typ,
                         id,
                         return_data=False)
        assert_equal(r, None)
        client.indices.IndicesClient(es).flush()
        assert_equal(get_single_result("task1", idx, typ, id), "task1_result")
        assert_equal(get_all_results(idx, typ, id), {"task1": "task1_result"})

        # test second result and test non-scalar data
        task2_result = {"a": {"b": ["c", "d"]}}
        store_single(task2_result, "task2", idx, typ, id)
        client.indices.IndicesClient(es).flush()
        assert_equal(get_single_result("task1", idx, typ, id), "task1_result")
        assert_equal(get_single_result("task2", idx, typ, id), task2_result)
        assert_equal(get_all_results(idx, typ, id), {
            "task1": "task1_result",
            "task2": task2_result
        })

        # store a task result under an existing task, check that it is replaced
        store_single("task1_result2", "task1", idx, typ, id)
        client.indices.IndicesClient(es).flush()
        assert_equal(get_single_result("task1", idx, typ, id), "task1_result2")
        assert_equal(get_single_result("task2", idx, typ, id), task2_result)
        assert_equal(get_all_results(idx, typ, id), {
            "task1": "task1_result2",
            "task2": task2_result
        })

        # check that the original document is intact
        src = es.get_source(index=idx, doc_type=typ, id=id)
        assert_equal(src['text'], "test")
Exemplo n.º 3
0
def pipeline(doc, pipeline, store_final=True, store_intermediate=False,
             block=True):
    """
    Get the result for a given document.
    Pipeline should be a list of dicts, with members task and argument
    e.g. [{"module" : "tokenize"},
          {"module" : "pos_tag", "arguments" : {"model" : "nltk"}}]
    @param block: if True, it will block and return the actual result.
                  If False, it will return an AsyncResult unless the result was
                  cached, in which case it returns the result immediately (!)
    @param store_final: if True, store the final result
    @param store_intermediate: if True, store all intermediate results as well
    """
    # form basic pipeline by resolving task dictionaries to task objects
    tasks = [_get_task(t) for t in pipeline]

    if isinstance(doc, dict) and set(doc.keys()) == set(_ES_DOC_FIELDS):
        idx, typ, id, field = [doc[k] for k in _ES_DOC_FIELDS]
        chain = []
        input = None
        cache = get_all_results(idx, typ, id)
        # Check cache for existing documents
        # Iterate over tasks in reverse order, check cached result, and
        # otherwise prepend task (and cache store command) to chain
        for i in range(len(tasks), 0, -1):
            taskname = "__".join(t.task for t in tasks[:i])
            if taskname in cache:
                input = cache[taskname]
                break
            if (i == len(tasks) and store_final) or store_intermediate:
                chain.insert(0, store_single.s(taskname, idx, typ, id))
            chain.insert(0, tasks[i-1])
        if not chain:  # final result was cached, good!
            return input
        elif input is None:
            input = fetch(doc)
    else:
        # the doc is a string, so we can't use caching
        chain = tasks
        input = doc

    chain = celery.chain(*chain).delay(input)
    if block:
        return chain.get()
    else:
        return chain