import logging

from xtas.tasks.pipeline import pipeline


def cache_many(pipe, docs):
    for n, doc in enumerate(docs):
        try:
            logging.warning("Processing {doc}, approx left: {left}"
                            .format(left=len(docs) - n, **locals()))
            pipeline(doc, pipe)
        except Exception:
            logging.exception("Error on processing {doc}".format(**locals()))
    logging.info("Done!")
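# A minimal usage sketch for cache_many; the index name, document type and ids
# are hypothetical, and a configured xtas/Elasticsearch setup is assumed.
from xtas.tasks.es import es_document

pipe = [{"module": "xtas.tasks.single.tokenize"}]
docs = [es_document("my_index", "article", doc_id, "text")
        for doc_id in (1, 2, 3)]
cache_many(pipe, docs)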
def test_pipeline_cache():
    "Does the cache work correctly?"
    import json
    import nltk
    from xtas.tasks.single import tokenize
    from xtas.tasks.pipeline import pipeline
    from xtas.tasks.es import es_document
    text = "The cat is happy"
    expected_tokens = [{u'token': u'The'}, {u'token': u'cat'},
                       {u'token': u'is'}, {u'token': u'happy'}]
    expected_pos = [[u'The', u'DT'], [u'cat', u'NN'],
                    [u'is', u'VBZ'], [u'happy', u'JJ']]
    with eager_celery(), clean_es() as es:
        idx, typ = ES_TEST_INDEX, ES_TEST_TYPE
        id = es.index(index=idx, doc_type=typ, body={"text": text})['_id']
        doc = es_document(idx, typ, id, "text")
        # test a pipeline consisting of a single task
        pipe = [{"module": tokenize}]
        r = pipeline(doc, pipe, store_intermediate=True)
        assert_equal(r, expected_tokens)
        # The second time, the result should come from the cache.
        # Test with block=False, which returns an async object if not cached.
        r = pipeline(doc, pipe, store_intermediate=True, block=False)
        assert_equal(r, expected_tokens)
        # Add pos_tag to the pipeline and check that tokenize is not called
        # (anyone have a more elegant way to check that?)
        pipe = [{"module": "xtas.tasks.single.tokenize"},
                {"module": "xtas.tasks.single.pos_tag",
                 "arguments": {"model": "nltk"}}]
        OLD_TOKENIZE = nltk.word_tokenize
        nltk.word_tokenize = None
        try:
            r = pipeline(doc, pipe, store_intermediate=True)
            # compare as JSON to ignore the tuple/list difference
            assert_equal(json.dumps(r), json.dumps(expected_pos))
        finally:
            nltk.word_tokenize = OLD_TOKENIZE
        # The whole pipeline should now be skipped.
        r = pipeline(doc, pipe, store_intermediate=True, block=False)
        assert_equal(json.dumps(r), json.dumps(expected_pos))
def test_pipeline():
    from xtas.tasks.single import tokenize, pos_tag
    from xtas.tasks.pipeline import pipeline
    s = "cats are furry"
    expected = [('cats', 'NNS'), ('are', 'VBP'), ('furry', 'JJ')]
    result = pos_tag(tokenize(s), 'nltk')
    assert_equal(result, expected)
    with eager_celery():
        # do we get the correct result from the pipeline?
        r = pipeline(s, [{"module": tokenize},
                         {"module": pos_tag, "arguments": {"model": "nltk"}}])
        assert_equal(r, expected)
        # can we specify modules by name?
        r = pipeline(s, [{"module": "xtas.tasks.single.tokenize"},
                         {"module": "xtas.tasks.single.pos_tag",
                          "arguments": {"model": "nltk"}}])
        assert_equal(r, expected)
def test_pipeline_cache():
    "Does the cache work correctly?"
    import json
    import nltk
    from elasticsearch import client
    from xtas.tasks.single import tokenize
    from xtas.tasks.pipeline import pipeline
    from xtas.tasks.es import es_document
    text = "The cat is happy"
    expected_tokens = [u"The", u"cat", u"is", u"happy"]
    expected_pos = [[u"The", u"DT"], [u"cat", u"NN"],
                    [u"is", u"VBZ"], [u"happy", u"JJ"]]
    with eager_celery(), clean_es() as es:
        idx, typ = ES_TEST_INDEX, ES_TEST_TYPE
        id = es.index(index=idx, doc_type=typ, body={"text": text})["_id"]
        doc = es_document(idx, typ, id, "text")
        # test a pipeline consisting of a single task
        pipe = [{"module": tokenize}]
        r = pipeline(doc, pipe, store_intermediate=True)
        assert_equal(r, expected_tokens)
        # The second time, the result should come from the cache.
        # Test with block=False, which returns an async object if not cached.
        client.indices.IndicesClient(es).flush()
        r = pipeline(doc, pipe, store_intermediate=True, block=False)
        assert_equal(r, expected_tokens)
        # Add pos_tag to the pipeline and check that tokenize is not called
        # (anyone have a more elegant way to check that?)
        pipe = [{"module": "xtas.tasks.single.tokenize"},
                {"module": "xtas.tasks.single.pos_tag",
                 "arguments": {"model": "nltk"}}]
        OLD_TOKENIZE = nltk.word_tokenize
        nltk.word_tokenize = None
        try:
            client.indices.IndicesClient(es).flush()
            r = pipeline(doc, pipe, store_intermediate=True)
            # compare as JSON to ignore the tuple/list difference
            assert_equal(json.dumps(r), json.dumps(expected_pos))
        finally:
            nltk.word_tokenize = OLD_TOKENIZE
        # The whole pipeline should now be skipped.
        client.indices.IndicesClient(es).flush()
        r = pipeline(doc, pipe, store_intermediate=True, block=False)
        assert_equal(json.dumps(r), json.dumps(expected_pos))
def get_adhoc_result(analysis, text, store_intermediate=True):
    from amcat.tools import amcates
    from xtas.tasks.es import adhoc_document
    from xtas.tasks.pipeline import pipeline
    analysis = _get_analysis(analysis)
    es = amcates.ES()
    doc = adhoc_document('adhoc', es.doc_type, 'text', text=text)
    print("Pipelining...")
    return pipeline(doc, analysis, store_intermediate=store_intermediate)
def get_result(article, analysis, store_intermediate=True, block=True):
    from amcat.tools import amcates
    from xtas.tasks.pipeline import pipeline
    if not isinstance(article, int):
        article = article.id
    analysis = _get_analysis(analysis)
    es = amcates.ES()
    doc = {'index': es.index, 'type': es.doc_type,
           'id': article, 'field': 'text'}
    r = pipeline(doc, analysis, store_intermediate=store_intermediate)
    return r
def get_result(article, analysis, store_intermediate=True, block=True):
    from amcat.tools import amcates
    from xtas.tasks.pipeline import pipeline
    if not isinstance(article, int):
        article = article.id
    if not isinstance(analysis, list):
        if hasattr(ANALYSES, analysis):
            analysis = getattr(ANALYSES, analysis)
        elif "." in analysis:
            analysis = [{"module": m} for m in analysis.split('__')]
        else:
            raise ValueError("Unknown analysis: {analysis}".format(**locals()))
    es = amcates.ES()
    doc = {'index': es.index, 'type': es.doc_type,
           'id': article, 'field': 'text'}
    r = pipeline(doc, analysis, store_intermediate=store_intermediate)
    return r
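# Hypothetical calls illustrating the module-path string form accepted above:
# a single fully qualified task, or several tasks joined by '__'. The article
# id 12345 is made up, and a working amcates/xtas setup is assumed.
r1 = get_result(12345, "xtas.tasks.single.tokenize")
r2 = get_result(12345, "xtas.tasks.single.tokenize__xtas.tasks.single.pos_tag")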
import json
import sys
from os import path

import syntaxrules
from syntaxrules.sources import get_all_sources_nl
from xtas.tasks import app
from xtas.tasks.pipeline import pipeline

# Dutch example sentence: "This goes easily, according to Wouter"
s = "Dit gaat makkelijk volgens Wouter"

# (1) Parse the sentence.
# We don't want to use multiprocessing, so run Celery tasks eagerly.
app.conf['CELERY_ALWAYS_EAGER'] = True
saf = pipeline(s, [{"module": "xtas.tasks.single.alpino"}])

# (2) Run the ruleset.
# Option 1: shortcut using the existing rules in sources_nl
saf['quotes'] = list(get_all_sources_nl(saf))

# Output: print the quotes as strings.
print("\nQuotes according to get_all_sources_nl:")
lemmata = {t['id']: t['lemma'] for t in saf['tokens']}
for q in saf['quotes']:
    source = ",".join(lemmata[t] for t in q['source'])
    quote = ",".join(lemmata[t] for t in q['quote'])
    print(source, ":", quote)

# Output: write the 'enriched' SAF to file.
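# A minimal sketch of the final step announced above, which the original
# script leaves unfinished. The output file name "enriched_saf.json" is an
# assumption; only the json/sys/path imports above suggest this shape.
outfile = path.join(path.dirname(path.abspath(sys.argv[0])),
                    "enriched_saf.json")
with open(outfile, "w") as f:
    json.dump(saf, f, indent=2)
print("Wrote enriched SAF to", outfile)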