def test_fetch():
    "Test whether tasks.fetch works as documented"
    from xtas.tasks.es import fetch, es_document

    # if doc is a string, fetching should return the string
    assert_equal(fetch("Literal string"), "Literal string")

    # index a document and fetch it with an es_document
    with clean_es() as es:
        d = es.index(index=ES_TEST_INDEX, doc_type=ES_TEST_TYPE,
                     body={"text": "test"})
        doc = es_document(ES_TEST_INDEX, ES_TEST_TYPE, d['_id'], "text")
        assert_equal(fetch(doc), "test")
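# The tests in this section rely on a few module-level helpers that are not
# shown in the excerpt: assert_equal (nose-style), ES_TEST_INDEX/ES_TEST_TYPE,
# and the clean_es()/eager_celery() context managers. The sketch below is
# illustrative only and assumes a local Elasticsearch instance; the real
# helpers in the test module may differ.
from contextlib import contextmanager

from elasticsearch import Elasticsearch
from nose.tools import assert_equal  # noqa: F401 (used by the tests)

from xtas.celery import app

ES_TEST_INDEX = 'xtas__unittest'
ES_TEST_TYPE = 'unittest_doc'


@contextmanager
def clean_es():
    """Yield an ES client with a fresh test index, removing it afterwards."""
    es = Elasticsearch()
    es.indices.create(index=ES_TEST_INDEX, ignore=400)
    try:
        yield es
    finally:
        es.indices.delete(index=ES_TEST_INDEX, ignore=404)


@contextmanager
def eager_celery():
    """Temporarily run Celery tasks eagerly (in-process)."""
    old = app.conf.get('CELERY_ALWAYS_EAGER', False)
    app.conf['CELERY_ALWAYS_EAGER'] = True
    try:
        yield
    finally:
        app.conf['CELERY_ALWAYS_EAGER'] = old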
def test_pipeline_cache():
    "Does the cache work correctly?"
    import json
    import nltk
    from xtas.tasks.single import tokenize
    from xtas.tasks.pipeline import pipeline
    from xtas.tasks.es import es_document

    text = "The cat is happy"
    expected_tokens = [{u'token': u'The'}, {u'token': u'cat'},
                       {u'token': u'is'}, {u'token': u'happy'}]
    expected_pos = [[u'The', u'DT'], [u'cat', u'NN'],
                    [u'is', u'VBZ'], [u'happy', u'JJ']]

    with eager_celery(), clean_es() as es:
        idx, typ = ES_TEST_INDEX, ES_TEST_TYPE
        id = es.index(index=idx, doc_type=typ, body={"text": text})['_id']
        doc = es_document(idx, typ, id, "text")

        # test a single-task 'pipeline'
        pipe = [{"module": tokenize}]
        r = pipeline(doc, pipe, store_intermediate=True)
        assert_equal(r, expected_tokens)

        # The second time, the result should come from the cache.
        # Test with block=False, which returns an async object if not cached.
        r = pipeline(doc, pipe, store_intermediate=True, block=False)
        assert_equal(r, expected_tokens)

        # Add pos_tag to the pipeline and check that tokenize is not called
        # (does anyone have a more elegant way to check that?)
        pipe = [{"module": "xtas.tasks.single.tokenize"},
                {"module": "xtas.tasks.single.pos_tag",
                 "arguments": {"model": "nltk"}}]
        OLD_TOKENIZE = nltk.word_tokenize
        nltk.word_tokenize = None
        try:
            r = pipeline(doc, pipe, store_intermediate=True)
            # compare json to ignore the tuple/list difference
            assert_equal(json.dumps(r), json.dumps(expected_pos))
        finally:
            nltk.word_tokenize = OLD_TOKENIZE

        # The whole pipeline should now be skipped.
        r = pipeline(doc, pipe, store_intermediate=True, block=False)
        assert_equal(json.dumps(r), json.dumps(expected_pos))
def test_pipeline_cache():
    "Does the cache work correctly?"
    import json
    import nltk
    from elasticsearch import client
    from xtas.tasks.single import tokenize
    from xtas.tasks.pipeline import pipeline
    from xtas.tasks.es import es_document

    text = "The cat is happy"
    expected_tokens = [u"The", u"cat", u"is", u"happy"]
    expected_pos = [[u"The", u"DT"], [u"cat", u"NN"],
                    [u"is", u"VBZ"], [u"happy", u"JJ"]]

    with eager_celery(), clean_es() as es:
        idx, typ = ES_TEST_INDEX, ES_TEST_TYPE
        id = es.index(index=idx, doc_type=typ, body={"text": text})["_id"]
        doc = es_document(idx, typ, id, "text")

        # test a single-task 'pipeline'
        pipe = [{"module": tokenize}]
        r = pipeline(doc, pipe, store_intermediate=True)
        assert_equal(r, expected_tokens)

        # The second time, the result should come from the cache.
        # Test with block=False, which returns an async object if not cached.
        client.indices.IndicesClient(es).flush()
        r = pipeline(doc, pipe, store_intermediate=True, block=False)
        assert_equal(r, expected_tokens)

        # Add pos_tag to the pipeline and check that tokenize is not called
        # (does anyone have a more elegant way to check that?)
        pipe = [
            {"module": "xtas.tasks.single.tokenize"},
            {"module": "xtas.tasks.single.pos_tag",
             "arguments": {"model": "nltk"}},
        ]
        OLD_TOKENIZE = nltk.word_tokenize
        nltk.word_tokenize = None
        try:
            client.indices.IndicesClient(es).flush()
            r = pipeline(doc, pipe, store_intermediate=True)
            # compare json to ignore the tuple/list difference
            assert_equal(json.dumps(r), json.dumps(expected_pos))
        finally:
            nltk.word_tokenize = OLD_TOKENIZE

        # The whole pipeline should now be skipped.
        client.indices.IndicesClient(es).flush()
        r = pipeline(doc, pipe, store_intermediate=True, block=False)
        assert_equal(json.dumps(r), json.dumps(expected_pos))
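# Hedged sketch (not the actual xtas implementation) of the caching idea the
# test above exercises: with store_intermediate=True, each prefix of the
# pipeline maps to a derived doctype (parent doctype plus the chain of module
# names, as in the "__".join convention used by the script further below), and
# a cached child document for the longest matching prefix lets the remaining
# tasks be skipped. The helper names here are hypothetical.
def _cache_doctype(parent_doctype, pipe):
    # e.g. "article__xtas.tasks.single.tokenize__xtas.tasks.single.pos_tag"
    return "__".join([parent_doctype] + [step["module"] for step in pipe])


def _get_cached(es, index, parent_doctype, doc_id, pipe):
    """Return (steps already done, cached payload) for the longest cached
    prefix of `pipe`, or (0, None) if nothing is cached."""
    for i in range(len(pipe), 0, -1):
        doctype = _cache_doctype(parent_doctype, pipe[:i])
        if es.exists(index=index, doc_type=doctype, id=doc_id):
            return i, es.get_source(index=index, doc_type=doctype, id=doc_id)
    return 0, None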
# Simple example of how to construct a pipeline of operations from Python
# and apply them to an ES document, storing the result back into ES.

from celery import chain

from xtas.tasks.es import es_document, store_single
from xtas.tasks.single import pos_tag, tokenize

doc = es_document('blog', 'post', 1, 'body')

# The following is Celery syntax for a pipeline of operations.
ch = chain(tokenize.s(doc)
           | pos_tag.s('nltk')
           | store_single.s('pipeline', 'blog', 'post', 1))
r = ch.delay()
print(r.get())
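# The same pipeline can also be expressed as a list of module steps and run
# through xtas.tasks.pipeline.pipeline, as the cache tests above do; with
# store_intermediate=True the intermediate results are cached in ES rather
# than stored via an explicit store_single step. A short sketch:
from xtas.tasks.pipeline import pipeline

pipe = [{"module": "xtas.tasks.single.tokenize"},
        {"module": "xtas.tasks.single.pos_tag",
         "arguments": {"model": "nltk"}}]
result = pipeline(doc, pipe, store_intermediate=True)
print(result)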
import logging
import time

from elasticsearch import Elasticsearch

from xtas.celery import app
from xtas.tasks.es import es_document

# Run all tasks in-process rather than on a Celery worker.
app.conf['CELERY_ALWAYS_EAGER'] = True

# args, get_articles, check_mapping and cache_many are defined elsewhere
# in this script.
pipe = [{"module": x} for x in args.modules]
doctype = "__".join([args.parent_doctype] + [m['module'] for m in pipe])

es = Elasticsearch(hosts=[{"host": args.host, "port": 9200}], timeout=600)
check_mapping(es, args.index, doctype, args.parent_doctype)

while True:
    if args.single:
        n, aids = 1, [args.set]
    else:
        logging.warn("Retrieving {args.n} articles".format(**locals()))
        try:
            n, aids = list(get_articles(es, args.index, doctype,
                                        args.parent_doctype, args.set,
                                        size=args.n))
        except Exception:
            logging.exception("Error on get_articles, retrying in 10 seconds")
            time.sleep(10)
            continue
    if not aids:
        logging.warn("DONE")
        break
    docs = [es_document(args.index, args.parent_doctype, aid, "text")
            for aid in aids]
    cache_many(pipe, docs)
    if args.norepeat or args.single:
        break
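# The args object used above is not defined in this excerpt. A plausible
# argparse setup is sketched below; the attribute names match the ones the
# loop uses (modules, parent_doctype, host, index, set, n, single, norepeat),
# but the flags, help texts and defaults are illustrative assumptions.
import argparse

parser = argparse.ArgumentParser(
    description="Cache xtas pipeline results for documents in an ES index")
parser.add_argument("index", help="Elasticsearch index to read from")
parser.add_argument("parent_doctype", help="doctype of the source documents")
parser.add_argument("set", help="document set (or a single id with --single)")
parser.add_argument("modules", nargs="+", help="xtas task modules to run")
parser.add_argument("--host", default="localhost",
                    help="Elasticsearch host name")
parser.add_argument("--n", type=int, default=100,
                    help="number of articles to retrieve per batch")
parser.add_argument("--single", action="store_true",
                    help="process a single document id instead of a set")
parser.add_argument("--norepeat", action="store_true",
                    help="stop after one batch")
args = parser.parse_args()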