Example #1
def section_texts(texts, size):
    # Split every text into sections of at most `size` space-separated tokens,
    # keeping the character offsets of each section inside its source text
    for source, text in enumerate(texts):
        tokens = space_tokenize(text)
        chunks = group_chunks(tokens, size)
        for chunk in chunks:
            start, stop = chunk[0].start, chunk[-1].stop
            yield Section(source, start, stop, text[start:stop])
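
section_texts relies on several helpers that are not part of the snippet. A minimal sketch of what they might look like, inferred from how they are used above (the names match the calls in the snippet, but the implementations are assumptions, not the library's actual code):

import re
from collections import namedtuple

Token = namedtuple('Token', 'start stop text')
Section = namedtuple('Section', 'source start stop text')


def space_tokenize(text):
    # Yield whitespace-separated tokens together with their character offsets
    for match in re.finditer(r'\S+', text):
        yield Token(match.start(), match.end(), match.group())


def group_chunks(items, size):
    # Group an iterable into lists of at most `size` items
    chunk = []
    for item in items:
        chunk.append(item)
        if len(chunk) >= size:
            yield chunk
            chunk = []
    if chunk:
        yield chunk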
Example #2
def map(cls,
        texts,
        host=DEEPPAVLOV_HOST,
        port=DEEPPAVLOV_PORT,
        section_size=DEEPPAVLOV_SECTION,
        batch_size=DEEPPAVLOV_BATCH):
    # Pre-process the texts, cut them into sections of `section_size` tokens
    # and group the sections into batches of `batch_size`
    texts = patch_texts(texts)
    sections = section_texts(texts, section_size)
    batches = group_chunks(sections, batch_size)
    # Send the batches to the service at host:port, then regroup the annotated
    # sections per source text and merge them into one markup per text
    sections = map_(cls, batches, host, port)
    groups = group_sections(sections)
    for group in groups:
        yield merge_markups(cls, group)
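
A hedged usage sketch for the method above, assuming it is defined as a classmethod on an annotator class; the class name, the sample texts and the printing of results are placeholders for illustration only:

# Hypothetical call; DeeppavlovAnnotator stands in for whatever class defines map()
texts = ['Первый пример текста.', 'Второй пример текста.']
for markup in DeeppavlovAnnotator.map(texts):
    print(markup)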
Example #3
def queue_retry_(chunk):
    log('Retrying')
    connection = get_connection(host=WORKER_HOST)
    queue = get_queue(FAILED, connection=connection)

    # Collect the ids of failed jobs grouped per annotator, then clear the failed queue
    ids = annotators_ids(queue.jobs)
    queue.empty()

    # Re-enqueue the ids for every annotator in chunks of `chunk` ids
    for annotator in ids:
        annotator_ids = ids[annotator]
        annotator_ids = log_progress(annotator_ids, prefix=annotator)
        chunks = group_chunks(annotator_ids, size=chunk)
        queue = get_queue(annotator, connection=connection)
        for chunk_ in chunks:
            enqueue(queue, task, chunk_)
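
The queue helpers used above are not shown; they read like thin wrappers over a Redis-backed job queue. A minimal sketch assuming rq (both the use of rq and the wrapper signatures are assumptions inferred from the snippet):

from redis import Redis
from rq import Queue


def get_connection(host):
    # One Redis connection shared by all queues
    return Redis(host=host)


def get_queue(name, connection):
    return Queue(name, connection=connection)


def enqueue(queue, function, args):
    # Submit one job that applies `function` to a chunk of ids
    return queue.enqueue(function, args)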
Example #4
def queue_insert_(annotators, offset, count, chunk):
    log('Annotators: %s; offset: %d, count: %d, chunk: %d',
        ', '.join(annotators), offset, count or -1, chunk)

    # Read document ids from the source collection starting at `offset`
    db = get_db(host=WORKER_HOST)
    ids = read_index(db[SOURCE], offset)
    ids = log_progress(ids, total=count)

    # Keep at most `count` ids and split them into chunks of `chunk` ids
    ids = head(ids, count)
    chunks = group_chunks(ids, size=chunk)

    # Enqueue every chunk for every requested annotator
    connection = get_connection(host=WORKER_HOST)
    queues = dict(get_queues(annotators, connection))
    for chunk_ in chunks:  # `chunk_` avoids shadowing the `chunk` size argument
        for annotator in annotators:
            queue = queues[annotator]
            enqueue(queue, task, chunk_)
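
A hypothetical invocation of the loader above; the annotator names and the sizes are placeholders chosen for illustration:

# Queue the first 10000 source documents for two annotator queues,
# in chunks of 100 ids per job (names and numbers are placeholders)
queue_insert_(['deeppavlov', 'deeppavlov_ner'], offset=0, count=10000, chunk=100)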