Example #1
import logging

from xtas.tasks.pipeline import pipeline


def cache_many(pipe, docs):
    """Run the pipeline `pipe` over every document, logging and skipping failures."""
    for i, doc in enumerate(docs):
        try:
            logging.warning("Processing {doc}, approx left: {n}"
                            .format(n=len(docs) - i, **locals()))
            pipeline(doc, pipe)
        except Exception:
            logging.exception("Error on processing {doc}".format(**locals()))

    logging.info("Done!")
Example #2
def test_pipeline_cache():
    "Does the cache work correctly?"
    import json
    import nltk
    from xtas.tasks.single import tokenize
    from xtas.tasks.pipeline import pipeline
    from xtas.tasks.es import es_document
    text = "The cat is happy"
    expected_tokens = [{
        u'token': u'The'
    }, {
        u'token': u'cat'
    }, {
        u'token': u'is'
    }, {
        u'token': u'happy'
    }]
    expected_pos = [[u'The', u'DT'], [u'cat', u'NN'], [u'is', u'VBZ'],
                    [u'happy', u'JJ']]
    with eager_celery(), clean_es() as es:
        idx, typ = ES_TEST_INDEX, ES_TEST_TYPE
        id = es.index(index=idx, doc_type=typ, body={"text": text})['_id']
        doc = es_document(idx, typ, id, "text")
        # test a single task 'pipeline'
        pipe = [{"module": tokenize}]
        r = pipeline(doc, pipe, store_intermediate=True)
        assert_equal(r, expected_tokens)
        # second time result should come from cache.
        # Test with block=False which returns async object if not cached
        r = pipeline(doc, pipe, store_intermediate=True, block=False)
        assert_equal(r, expected_tokens)
        # Add pos_tag to the pipeline and check that tokenize is not called.
        # (Does anyone have a more elegant way to check this?)
        pipe = [{
            "module": "xtas.tasks.single.tokenize"
        }, {
            "module": "xtas.tasks.single.pos_tag",
            "arguments": {
                "model": "nltk"
            }
        }]
        OLD_TOKENIZE = nltk.word_tokenize
        nltk.word_tokenize = None
        try:
            r = pipeline(doc, pipe, store_intermediate=True)
            # compare json to ignore tuple/list difference
            assert_equal(json.dumps(r), json.dumps(expected_pos))
        finally:
            nltk.word_tokenize = OLD_TOKENIZE
        # whole pipeline should now be skipped
        r = pipeline(doc, pipe, store_intermediate=True, block=False)
        assert_equal(json.dumps(r), json.dumps(expected_pos))
Example #3
def test_pipeline():
    from xtas.tasks.single import tokenize, pos_tag
    from xtas.tasks.pipeline import pipeline
    s = "cats are furry"
    expected = [('cats', 'NNS'), ('are', 'VBP'), ('furry', 'JJ')]
    result = pos_tag(tokenize(s), 'nltk')
    assert_equal(result, expected)
    with eager_celery():
        # do we get correct result from pipeline?
        r = pipeline(s, [{"module": tokenize},
                         {"module": pos_tag, "arguments": {"model": "nltk"}}])
        assert_equal(r, expected)
        # can we specify modules by name?
        r = pipeline(s, [{"module": "xtas.tasks.single.tokenize"},
                         {"module": "xtas.tasks.single.pos_tag",
                          "arguments": {"model": "nltk"}}])
        assert_equal(r, expected)
Example #4
def test_pipeline_cache():
    "Does the cache work correctly?"
    import json
    import nltk
    from xtas.tasks.single import tokenize
    from xtas.tasks.pipeline import pipeline
    from xtas.tasks.es import es_document
    from elasticsearch import client  # assumed: elasticsearch-py, for IndicesClient below

    text = "The cat is happy"
    expected_tokens = [u"The", u"cat", u"is", u"happy"]
    expected_pos = [[u"The", u"DT"], [u"cat", u"NN"], [u"is", u"VBZ"], [u"happy", u"JJ"]]
    with eager_celery(), clean_es() as es:
        idx, typ = ES_TEST_INDEX, ES_TEST_TYPE
        id = es.index(index=idx, doc_type=typ, body={"text": text})["_id"]
        doc = es_document(idx, typ, id, "text")
        # test a single task 'pipeline'
        pipe = [{"module": tokenize}]
        r = pipeline(doc, pipe, store_intermediate=True)
        assert_equal(r, expected_tokens)
        # second time result should come from cache.
        # Test with block=False which returns async object if not cached
        client.indices.IndicesClient(es).flush()
        r = pipeline(doc, pipe, store_intermediate=True, block=False)
        assert_equal(r, expected_tokens)
        # Add pos_tag to the pipeline and check that tokenize is not called.
        # (Does anyone have a more elegant way to check this?)
        pipe = [
            {"module": "xtas.tasks.single.tokenize"},
            {"module": "xtas.tasks.single.pos_tag", "arguments": {"model": "nltk"}},
        ]
        OLD_TOKENIZE = nltk.word_tokenize
        nltk.word_tokenize = None
        try:
            client.indices.IndicesClient(es).flush()
            r = pipeline(doc, pipe, store_intermediate=True)
            # compare json to ignore tuple/list difference
            assert_equal(json.dumps(r), json.dumps(expected_pos))
        finally:
            nltk.word_tokenize = OLD_TOKENIZE
        # whole pipeline should now be skipped
        client.indices.IndicesClient(es).flush()
        r = pipeline(doc, pipe, store_intermediate=True, block=False)
        assert_equal(json.dumps(r), json.dumps(expected_pos))
Example #5
def get_adhoc_result(analysis, text, store_intermediate=True):
    from xtas.tasks.es import adhoc_document
    from xtas.tasks.pipeline import pipeline

    analysis = _get_analysis(analysis)
    es = amcates.ES()
    doc = adhoc_document('adhoc', es.doc_type, 'text', text=text)

    print "Pipelining..."
    return pipeline(doc, analysis, store_intermediate=store_intermediate)
Example #6
def get_result(article, analysis, store_intermediate=True, block=True):
    from xtas.tasks.pipeline import pipeline
    if not isinstance(article, int): article = article.id
    analysis = _get_analysis(analysis)

    es = amcates.ES()
    doc = {'index': es.index, 'type': es.doc_type,
           'id': article, 'field': 'text'}
    r = pipeline(doc, analysis,
                 store_intermediate=store_intermediate, block=block)
    return r
Example #7
def test_pipeline():
    from xtas.tasks.single import tokenize, pos_tag
    from xtas.tasks.pipeline import pipeline

    s = "cats are furry"
    expected = [("cats", "NNS"), ("are", "VBP"), ("furry", "JJ")]
    result = pos_tag(tokenize(s), "nltk")
    assert_equal(result, expected)
    with eager_celery():
        # do we get correct result from pipeline?
        r = pipeline(s, [{"module": tokenize}, {"module": pos_tag, "arguments": {"model": "nltk"}}])
        assert_equal(r, expected)
        # can we specify modules by name?
        r = pipeline(
            s,
            [
                {"module": "xtas.tasks.single.tokenize"},
                {"module": "xtas.tasks.single.pos_tag", "arguments": {"model": "nltk"}},
            ],
        )
        assert_equal(r, expected)
Example #8
def get_result(article, analysis, store_intermediate=True, block=True):
    from xtas.tasks.pipeline import pipeline
    if not isinstance(article, int): article = article.id
    analysis = _get_analysis(analysis)

    es = amcates.ES()
    doc = {
        'index': es.index,
        'type': es.doc_type,
        'id': article,
        'field': 'text'
    }
    r = pipeline(doc, analysis, store_intermediate=store_intermediate, block=block)
    return r
Example #9
def get_result(article, analysis, store_intermediate=True, block=True):
    from xtas.tasks.pipeline import pipeline
    if not isinstance(article, int): article = article.id
    if not isinstance(analysis, list):
        if hasattr(ANALYSES, analysis):
            analysis = getattr(ANALYSES, analysis)
        elif "." in analysis:
            analysis = [{"module" : m} for m in analysis.split('__')]
        else:
            raise ValueError("Unknown analysis: {analysis}".format(**locals()))

    es = amcates.ES()
    doc = {'index': es.index, 'type': es.doc_type,
           'id': article, 'field': 'text'}
    r = pipeline(doc, analysis,
                 store_intermediate=store_intermediate, block=block)
    return r
Example #10
def get_result(article, analysis, store_intermediate=True, block=True):
    from xtas.tasks.pipeline import pipeline
    if not isinstance(article, int): article = article.id
    if not isinstance(analysis, list):
        if hasattr(ANALYSES, analysis):
            analysis = getattr(ANALYSES, analysis)
        elif "." in analysis:
            analysis = [{"module": m} for m in analysis.split('__')]
        else:
            raise ValueError("Unknown analysis: {analysis}".format(**locals()))

    es = amcates.ES()
    doc = {
        'index': es.index,
        'type': es.doc_type,
        'id': article,
        'field': 'text'
    }
    r = pipeline(doc, analysis, store_intermediate=store_intermediate, block=block)
    return r
Example #11
import sys
import json
from os import path

from xtas.tasks.pipeline import pipeline
from xtas.tasks import app

import syntaxrules
from syntaxrules.sources import get_all_sources_nl

s = "Dit gaat makkelijk volgens Wouter"

# (1) Parse the sentence
# run tasks eagerly in-process; we don't want to use a Celery worker here
app.conf['CELERY_ALWAYS_EAGER'] = True
saf = pipeline(s, [{"module": "xtas.tasks.single.alpino"}])

# (2) Run ruleset

# Option 1: shortcut with existing rules in sources_nl
saf['quotes'] = list(get_all_sources_nl(saf))

# Output: Print quotes as string
print "\nQuotes according to get_all_sources_nl:"
lemmata = {t['id']: t['lemma'] for t in saf['tokens']}
for quote in saf['quotes']:
    source = ",".join(lemmata[tid] for tid in quote['source'])
    quote_text = ",".join(lemmata[tid] for tid in quote['quote'])
    print source, ":", quote_text

# Output: Write 'enriched' SAF to file
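# A minimal sketch of this final step (not part of the original snippet): the
# filename is an assumption, and the SAF dict is assumed to be JSON-serializable.
with open("enriched_saf.json", "w") as f:
    json.dump(saf, f, indent=2)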