def pipeline(doc, pipeline, store_final=True, store_intermediate=False,
             block=True):
    """
    Get the result for a given document.

    Pipeline should be a list of dicts with members 'module' and
    'arguments', e.g. [{"module": "tokenize"},
                       {"module": "pos_tag", "arguments": {"model": "nltk"}}]

    @param block: if True, it will block and return the actual result.
                  If False, it will return an AsyncResult unless the result
                  was cached, in which case it returns the result
                  immediately (!)
    @param store_final: if True, store the final result
    @param store_intermediate: if True, store all intermediate results as well
    """
    # form basic pipeline by resolving task dictionaries to task objects
    tasks = [_get_task(t) for t in pipeline]
    if is_es_document(doc):
        idx, typ, id, field = es_address(doc)
        chain = []
        input = None
        # Check cache for existing documents: iterate over tasks in reverse
        # order, check for a cached result, and otherwise prepend the task
        # (and a cache store command) to the chain
        for i in range(len(tasks), 0, -1):
            taskname = "__".join(t.task for t in tasks[:i])
            input = get_single_result(taskname, idx, typ, id)
            if input:
                break
            if (i == len(tasks) and store_final) or store_intermediate:
                chain.insert(0, store_single.s(taskname, idx, typ, id))
            chain.insert(0, tasks[i - 1])
        if not chain:
            # final result was cached, good!
            return input
        elif input is None:
            input = fetch(doc)
    else:
        # the doc is a string, so we can't use caching
        chain = tasks
        input = doc
    chain = celery.chain(*chain).delay(input)
    if block:
        return chain.get()
    else:
        return chain

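# Illustrative usage sketch for pipeline() (an addition, not part of the
# original module). It assumes a running celery worker and an indexed ES
# document, and that xtas.tasks.es provides es_document() to build the
# address consumed by is_es_document()/es_address() above; "tokenize" and
# "pos_tag" are example module names.
def _example_pipeline_usage():
    from xtas.tasks.es import es_document
    doc = es_document("blog", "post", 1, "text")
    # The first call runs both tasks and stores the final result under the
    # chained task name "tokenize__pos_tag" (plus the intermediate
    # "tokenize" result, since store_intermediate=True).
    result = pipeline(doc, [{"module": "tokenize"},
                            {"module": "pos_tag",
                             "arguments": {"model": "nltk"}}],
                      store_intermediate=True)
    # A second identical call finds the cached result via
    # get_single_result() and returns it without queueing any work; with
    # block=False an uncached call would return an AsyncResult instead.
    return result
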
def test_store_get_result():
    "test whether results can be stored and retrieved"
    from xtas.tasks.es import (store_single, get_single_result,
                               get_tasks_per_index, fetch_documents_by_task,
                               fetch_results_by_document,
                               fetch_query_details_batch)
    idx, typ = ES_TEST_INDEX, ES_TEST_TYPE
    with clean_es() as es:
        id = es.index(index=idx, doc_type=typ, body={"text": "test"})['_id']
        assert_equal(get_single_result("task1", idx, typ, id), None)
        store_single("task1_result", "task1", idx, typ, id)
        client.indices.IndicesClient(es).flush()
        assert_equal(get_single_result("task1", idx, typ, id), "task1_result")
        assert_in("task1", get_tasks_per_index(idx, typ))

        # test second result and test non-scalar data
        task2_result = {"a": {"b": ["c", "d"]}}
        store_single(task2_result, "task2", idx, typ, id)
        client.indices.IndicesClient(es).flush()
        assert_equal(get_single_result("task1", idx, typ, id), "task1_result")
        assert_equal(get_single_result("task2", idx, typ, id), task2_result)
        query = {"match": {"b": {"query": "c"}}}
        assert_equal(len(fetch_documents_by_task(idx, typ, query, "task2")), 1)
        query = {"match": {"text": {"query": "test"}}}
        results = fetch_results_by_document(idx, typ, query, "task2")
        assert_equal(len(results), 1)
        results = fetch_query_details_batch(idx, typ, query, True)
        assert_in("task1", results[0][1])
        assert_in("task2", results[0][1])
        results = fetch_query_details_batch(idx, typ, query,
                                            tasknames=["task2"])
        assert_in("task2", results[0][1])

        # store a task result under an existing task, check that it is
        # replaced
        store_single("task1_result2", "task1", idx, typ, id)
        client.indices.IndicesClient(es).flush()
        assert_equal(get_single_result("task1", idx, typ, id),
                     "task1_result2")
        assert_equal(get_single_result("task2", idx, typ, id), task2_result)

        # check that the original document is intact
        src = es.get_source(index=idx, doc_type=typ, id=id)
        assert_equal(src['text'], "test")

def test_store_get_result():
    "test whether results can be stored and retrieved"
    from xtas.tasks.es import store_single, get_single_result, get_all_results
    idx, typ = ES_TEST_INDEX, ES_TEST_TYPE
    with clean_es() as es:
        id = es.index(index=idx, doc_type=typ, body={"text": "test"})['_id']
        assert_equal(get_single_result("task1", idx, typ, id), None)
        assert_equal(get_all_results(idx, typ, id), {})
        r = store_single("task1_result", "task1", idx, typ, id,
                         return_data=False)
        assert_equal(r, None)
        client.indices.IndicesClient(es).flush()
        assert_equal(get_single_result("task1", idx, typ, id), "task1_result")
        assert_equal(get_all_results(idx, typ, id), {"task1": "task1_result"})

        # test second result and test non-scalar data
        task2_result = {"a": {"b": ["c", "d"]}}
        store_single(task2_result, "task2", idx, typ, id)
        client.indices.IndicesClient(es).flush()
        assert_equal(get_single_result("task1", idx, typ, id), "task1_result")
        assert_equal(get_single_result("task2", idx, typ, id), task2_result)
        assert_equal(get_all_results(idx, typ, id),
                     {"task1": "task1_result", "task2": task2_result})

        # store a task result under an existing task, check that it is
        # replaced
        store_single("task1_result2", "task1", idx, typ, id)
        client.indices.IndicesClient(es).flush()
        assert_equal(get_single_result("task1", idx, typ, id),
                     "task1_result2")
        assert_equal(get_single_result("task2", idx, typ, id), task2_result)
        assert_equal(get_all_results(idx, typ, id),
                     {"task1": "task1_result2", "task2": task2_result})

        # check that the original document is intact
        src = es.get_source(index=idx, doc_type=typ, id=id)
        assert_equal(src['text'], "test")