Example #1
def test_fetch():
    "Test whether tasks.fetch works as documented"
    from xtas.tasks.es import fetch, es_document
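    # assert_equal (from nose.tools), clean_es, ES_TEST_INDEX and ES_TEST_TYPE
    # are assumed to be helpers defined in the surrounding test module.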
    # if doc is a string, fetching should return the string
    assert_equal(fetch("Literal string"), "Literal string")
    # index a document and fetch it with an es_document
    with clean_es() as es:
        d = es.index(index=ES_TEST_INDEX, doc_type=ES_TEST_TYPE,
                     body={"text": "test"})
        doc = es_document(ES_TEST_INDEX, ES_TEST_TYPE, d['_id'], "text")
        assert_equal(fetch(doc), "test")
Example #2
def test_pipeline_cache():
    "Does the cache work correctly?"
    import json
    import nltk
    from xtas.tasks.single import tokenize
    from xtas.tasks.pipeline import pipeline
    from xtas.tasks.es import es_document
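    # eager_celery and clean_es are assumed to be context-manager fixtures from
    # the surrounding test module; assert_equal comes from nose.tools, and
    # ES_TEST_INDEX/ES_TEST_TYPE are test constants defined alongside them.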
    text = "The cat is happy"
    expected_tokens = [{u'token': u'The'}, {u'token': u'cat'},
                       {u'token': u'is'}, {u'token': u'happy'}]
    expected_pos = [[u'The', u'DT'], [u'cat', u'NN'], [u'is', u'VBZ'],
                    [u'happy', u'JJ']]
    with eager_celery(), clean_es() as es:
        idx, typ = ES_TEST_INDEX, ES_TEST_TYPE
        id = es.index(index=idx, doc_type=typ, body={"text": text})['_id']
        doc = es_document(idx, typ, id, "text")
        # test a single task 'pipeline'
        pipe = [{"module": tokenize}]
        r = pipeline(doc, pipe, store_intermediate=True)
        assert_equal(r, expected_tokens)
        # second time result should come from cache.
        # Test with block=False which returns async object if not cached
        r = pipeline(doc, pipe, store_intermediate=True, block=False)
        assert_equal(r, expected_tokens)
        # add pos_tag to pipeline. Check that tokenize is not called
        # (anyone has a more elegant way to check that?)
        pipe = [{"module": "xtas.tasks.single.tokenize"},
                {"module": "xtas.tasks.single.pos_tag",
                 "arguments": {"model": "nltk"}}]
        OLD_TOKENIZE = nltk.word_tokenize
        nltk.word_tokenize = None
        try:
            r = pipeline(doc, pipe, store_intermediate=True)
            # compare json to ignore tuple/list difference
            assert_equal(json.dumps(r), json.dumps(expected_pos))
        finally:
            nltk.word_tokenize = OLD_TOKENIZE
        # whole pipeline should now be skipped
        r = pipeline(doc, pipe, store_intermediate=True, block=False)
        assert_equal(json.dumps(r), json.dumps(expected_pos))
Example #3
def test_pipeline_cache():
    "Does the cache work correctly?"
    import json
    import nltk
    from elasticsearch import client  # assumed import for the IndicesClient flushes below
    from xtas.tasks.single import tokenize
    from xtas.tasks.pipeline import pipeline
    from xtas.tasks.es import es_document

    text = "The cat is happy"
    expected_tokens = [u"The", u"cat", u"is", u"happy"]
    expected_pos = [[u"The", u"DT"], [u"cat", u"NN"], [u"is", u"VBZ"], [u"happy", u"JJ"]]
    with eager_celery(), clean_es() as es:
        idx, typ = ES_TEST_INDEX, ES_TEST_TYPE
        id = es.index(index=idx, doc_type=typ, body={"text": text})["_id"]
        doc = es_document(idx, typ, id, "text")
        # test a single task 'pipeline'
        pipe = [{"module": tokenize}]
        r = pipeline(doc, pipe, store_intermediate=True)
        assert_equal(r, expected_tokens)
        # second time result should come from cache.
        # Test with block=False which returns async object if not cached
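        # (the flush presumably commits the intermediate result stored by the
        # previous call so that the cache lookup below can find it)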
        client.indices.IndicesClient(es).flush()
        r = pipeline(doc, pipe, store_intermediate=True, block=False)
        assert_equal(r, expected_tokens)
        # add pos_tag to pipeline. Check that tokenize is not called
        # (anyone has a more elegant way to check that?)
        pipe = [
            {"module": "xtas.tasks.single.tokenize"},
            {"module": "xtas.tasks.single.pos_tag", "arguments": {"model": "nltk"}},
        ]
        OLD_TOKENIZE = nltk.word_tokenize
        nltk.word_tokenize = None
        try:
            client.indices.IndicesClient(es).flush()
            r = pipeline(doc, pipe, store_intermediate=True)
            # compare json to ignore tuple/list difference
            assert_equal(json.dumps(r), json.dumps(expected_pos))
        finally:
            nltk.word_tokenize = OLD_TOKENIZE
        # whole pipeline should now be skipped
        client.indices.IndicesClient(es).flush()
        r = pipeline(doc, pipe, store_intermediate=True, block=False)
        assert_equal(json.dumps(r), json.dumps(expected_pos))
Example #4
# Simple example of how to construct a pipeline of operations from Python
# and apply them to an ES document, storing the result back into ES.

from celery import chain
from xtas.tasks.es import es_document, store_single
from xtas.tasks.single import pos_tag, tokenize

doc = es_document('blog', 'post', 1, 'body')

# The following is Celery syntax for a pipeline of operations.
ch = chain(tokenize.s(doc)
           | pos_tag.s('nltk')
           | store_single.s('pipeline', 'blog', 'post', 1)
           )

r = ch.delay()
print(r.get())
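
For comparison, the same tokenize-then-pos_tag pipeline can be run through xtas.tasks.pipeline.pipeline, as exercised in the cache tests above. The sketch below assumes the same 'blog'/'post' document; store_intermediate=True caches each step's result in Elasticsearch.

from xtas.tasks.es import es_document
from xtas.tasks.pipeline import pipeline

doc = es_document('blog', 'post', 1, 'body')
pipe = [{"module": "xtas.tasks.single.tokenize"},
        {"module": "xtas.tasks.single.pos_tag", "arguments": {"model": "nltk"}}]
# store_intermediate=True writes each step's output back to Elasticsearch,
# so a repeated call can be served from the cache.
print(pipeline(doc, pipe, store_intermediate=True))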
Example #5
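    # Fragment of a command-line driver (presumably the body of its main
    # function): `args` comes from argparse, and Elasticsearch, es_document,
    # cache_many, get_articles and check_mapping are assumed to be imported
    # or defined elsewhere in the script.
    # Setting CELERY_ALWAYS_EAGER makes Celery tasks run synchronously
    # in-process instead of being sent to a worker.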
    from xtas.celery import app
    app.conf['CELERY_ALWAYS_EAGER'] = True

    pipe = [{"module" : x} for x in args.modules]

    doctype = "__".join([args.parent_doctype] + [m['module'] for m in pipe])
    es = Elasticsearch(hosts=[{"host":args.host, "port": 9200}], timeout=600)
    check_mapping(es, args.index, doctype, args.parent_doctype)

    while True:
        if args.single:
            n, aids = 1, [args.set]
        else:
            logging.warn("Retrieving {args.n} articles".format(**locals()))
            try:
                n, aids = list(get_articles(es, args.index, doctype, args.parent_doctype, args.set, size=args.n))
            except Exception:
                logging.exception("Error on get_articles, retrying in 10 seconds")
                time.sleep(10)
                continue
        if not aids:
            logging.warn("DONE")
            break
        docs = [es_document(args.index, args.parent_doctype, aid, "text")
                for aid in aids]
        cache_many(pipe, docs)

        if args.norepeat or args.single:
            break