def split_sections(texts, size):
    """Split each text into token-aligned sections and yield them as Section objects.

    Each text is tokenized, the token stream is chopped into runs (presumably of at
    most *size* tokens — confirm against chop()), and every run becomes a Section
    carrying the index of its source text, its character span, and the raw slice.
    """
    for index, text in enumerate(texts):
        for run in chop(tokenize(text), size):
            # Section span is taken from the first and last token of the run.
            begin = run[0].start
            end = run[-1].stop
            yield Section(index, begin, end, text[begin:end])
def map_deepavlov(items, host, port, batch_size=DEEPPAVLOV_BATCH, mode=DEEPPAVLOV):
    """Run *items* through the DeepPavlov service in batches, yielding one markup per item.

    Items are grouped into batches of *batch_size* before each service call.
    """
    # NOTE(review): a second `map_deepavlov` with a different signature appears in
    # this source view; if both live in the same module the later definition shadows
    # this one — confirm they belong to separate modules.
    for batch in chop(items, batch_size):
        yield from call_deeppavlov(batch, host, port, mode)
def map_deepavlov(texts, host, port,
                  section_size=DEEPPAVLOV_SECTION,
                  batch_size=DEEPPAVLOV_BATCH,
                  mode=DEEPPAVLOV):
    """Annotate *texts* with the DeepPavlov service and yield one markup per text.

    Pipeline: patch the texts, split them into sections of at most *section_size*
    tokens, send sections in batches of *batch_size* (batching amortizes the
    per-request overhead), then regroup the annotated sections by source text
    and assemble a markup for each.
    """
    patched = patch_texts(texts)
    raw_sections = split_sections(patched, section_size)
    # Same sections come back carrying the service annotation.
    annotated = map_batches(chop(raw_sections, batch_size), host, port, mode)
    for group in group_sections(annotated):
        yield sections_markup(group)
def map_slovnet(items, host, port):
    """Feed *items* to the Slovnet service in chunks of SLOVNET_CHUNK, yielding each markup."""
    for piece in chop(items, SLOVNET_CHUNK):
        for markup in call_slovnet(piece, host, port):
            yield markup
def map_stanza(items, host, port, batch_size=STANZA_BATCH):
    """Send *items* to the Stanza service in batches of *batch_size*, yielding each markup."""
    for batch in chop(items, batch_size):
        yield from call_stanza(batch, host, port)