def _get_bg_conc(corp: manatee.Corpus, user_id: int, q: Tuple[str, ...], subchash: Optional[str], samplesize: int,
                 calc_from: int, minsize: int) -> Union[PyConc, EmptyConc]:
    """
    Hand the concordance calculation over to the calculation backend and
    return whatever is available once wait_for_conc() comes back.

    arguments:
    corp -- corpus the concordance is calculated on
    user_id -- forwarded to the backend task as is
    q -- query operations; q[:i + 1] identifies the i-th intermediate result
    subchash -- part of the cache key (may be None)
    samplesize -- forwarded to the backend task as is
    calc_from -- from which operation idx (inclusive) we have to calculate
                 respective results
    minsize -- forwarded to wait_for_conc()

    returns:
    PyConc bound to the cache file of (subchash, q) if wait_for_conc() reports
    the data as available, otherwise EmptyConc pointing to the same cache file
    """
    conc_cache = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)
    num_ops = len(q)
    # the backend task is sent only if at least one operation is still missing
    if calc_from < num_ops:
        # let's create cache records of the operations we'll have to perform
        for op_idx in range(calc_from, num_ops):
            cachefile, _ = conc_cache.add_to_map(subchash, q[:op_idx + 1], 0, calc_status=CalcStatus())
            if not os.path.isfile(cachefile):
                continue
            # a file already sitting at the path of a fresh record gets removed
            del_silent(cachefile)
            logging.getLogger(__name__).warning(f'Removed unbound conc. cache file {cachefile}')
        backend = bgcalc.calc_backend_client(settings)
        task_args = (user_id, corp.corpname, getattr(corp, 'subcname', None), subchash, q, samplesize)
        backend.send_task('conc_sync_calculate', task_args, time_limit=TASK_TIME_LIMIT)
    # for smaller concordances/corpora there is a chance the data
    # is ready in a few seconds - let's try this:
    is_ready = wait_for_conc(cache_map=conc_cache, subchash=subchash, q=q, minsize=minsize)
    target_path = conc_cache.cache_file_path(subchash, q)
    if is_ready:
        return PyConc(corp, 'l', target_path)
    # return empty yet unfinished concordance to make the client watch the calculation
    return EmptyConc(corp, target_path)
def _get_async_conc(corp, user_id, q, subchash, samplesize, minsize):
    """
    Register the concordance calculation through the 'worker.conc_register'
    backend task, wait for the task's answer and then for the concordance
    data itself.

    arguments:
    corp -- corpus the concordance is calculated on
    user_id -- forwarded to the backend task as is
    q -- query operations (also used as a part of the cache key)
    subchash -- part of the cache key
    samplesize -- forwarded to the backend task as is
    minsize -- forwarded to wait_for_conc()

    returns:
    PyConc if wait_for_conc() reports the data as available, EmptyConc
    otherwise; both are bound to the cache file path of (subchash, q)
    """
    backend = bgcalc.calc_backend_client(settings)
    task_args = (user_id, corp.corpname, getattr(corp, 'subcname', None), subchash, q, samplesize, TASK_TIME_LIMIT)
    registration = backend.send_task('worker.conc_register', task_args, time_limit=CONC_REGISTER_TASK_LIMIT)
    # wait for the registration task to answer; the answer itself is not used
    registration.get(timeout=CONC_REGISTER_WAIT_LIMIT)
    conc_cache = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)
    is_ready = wait_for_conc(cache_map=conc_cache, subchash=subchash, q=q, minsize=minsize)
    target_path = conc_cache.cache_file_path(subchash, q)
    if is_ready:
        return PyConc(corp, 'l', target_path)
    return EmptyConc(corp, target_path)
def _get_bg_conc(corp: AbstractKCorpus, user_id: int, q: Tuple[str, ...], subchash: Optional[str], samplesize: int,
                 calc_from: int, minsize: int) -> Union[PyConc, InitialConc]:
    """
    Hand the concordance calculation over to the calculation backend unless
    it is already recorded as running, and return whatever is available once
    wait_for_conc() comes back.

    arguments:
    corp -- corpus the concordance is calculated on
    user_id -- forwarded to the backend task as is
    q -- query operations; q[:i + 1] identifies the i-th intermediate result
    subchash -- part of the cache key (may be None)
    samplesize -- forwarded to the backend task as is
    calc_from -- from which operation idx (inclusive) we have to calculate
                 respective results
    minsize -- forwarded to wait_for_conc()

    returns:
    InitialConc bound to the recorded cache file if an unfinished calculation
    of (subchash, q) is found; otherwise PyConc if wait_for_conc() reports
    the data as available, or InitialConc bound to the readable cache path
    """
    conc_cache = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)
    current_status = conc_cache.get_calc_status(subchash, q)
    if current_status and not current_status.finished:
        # the calc is already running, the client has to wait and check regularly
        return InitialConc(corp, current_status.cachefile)

    num_ops = len(q)
    # the backend task is sent only if at least one operation is still missing
    if calc_from < num_ops:
        # let's create cache records of the operations we'll have to perform
        for op_idx in range(calc_from, num_ops):
            status = conc_cache.add_to_map(subchash, q[:op_idx + 1], ConcCacheStatus(), overwrite=True)
            # the file cannot be valid as otherwise, calc_from would be higher
            if not os.path.isfile(status.cachefile):
                continue
            del_silent(status.cachefile)
            logging.getLogger(__name__).warning(f'Removed unbound conc. cache file {status.cachefile}')
        backend = bgcalc.calc_backend_client(settings)
        task_args = (user_id, corp.corpname, getattr(corp, 'subcname', None), subchash, q, samplesize)
        backend.send_task('conc_sync_calculate', object.__class__, task_args, time_limit=TASK_TIME_LIMIT)
    # for smaller concordances/corpora there is a chance the data
    # is ready in a few seconds - let's try this:
    is_ready = wait_for_conc(cache_map=conc_cache, subchash=subchash, q=q, minsize=minsize)
    target_path = conc_cache.readable_cache_path(subchash, q)
    if is_ready:
        return PyConc(corp, 'l', target_path)
    # return empty yet unfinished concordance to make the client watch the calculation
    return InitialConc(corp, target_path)
def _get_async_conc(corp, user_id, q, subchash, samplesize, minsize):
    """
    Make sure the concordance calculation is registered (via the
    'conc_register' backend task) and wait for the concordance data.

    The registration task is sent only if the cache holds no calculation
    status for (subchash, q) or the stored status carries an error.

    arguments:
    corp -- corpus the concordance is calculated on
    user_id -- forwarded to the backend task as is
    q -- query operations (also used as a part of the cache key)
    subchash -- part of the cache key
    samplesize -- forwarded to the backend task as is
    minsize -- forwarded to wait_for_conc()

    returns:
    PyConc if wait_for_conc() reports the data as available, InitialConc
    otherwise; both are bound to the readable cache path of (subchash, q)
    """
    conc_cache = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)
    known_status = conc_cache.get_calc_status(subchash, q)
    needs_registration = not known_status or known_status.error
    if needs_registration:
        backend = bgcalc.calc_backend_client(settings)
        task_args = (
            user_id, corp.corpname, getattr(corp, 'subcname', None), subchash, q, samplesize, TASK_TIME_LIMIT)
        registration = backend.send_task(
            'conc_register', object.__class__, task_args, time_limit=CONC_REGISTER_TASK_LIMIT)
        # wait for the registration task to answer; the answer itself is not used
        registration.get(timeout=CONC_REGISTER_WAIT_LIMIT)
    is_ready = wait_for_conc(cache_map=conc_cache, subchash=subchash, q=q, minsize=minsize)
    target_path = conc_cache.readable_cache_path(subchash, q)
    if is_ready:
        return PyConc(corp, 'l', target_path)
    return InitialConc(corp, target_path)
def get_conc(corp, user_id, q: Tuple[str, ...] = None, fromp=0, pagesize=0, asnc=0, save=0, samplesize=0
             ) -> Union[manatee.Concordance, EmptyConc]:
    """
    Get/calculate a concordance. The function always tries to fetch as complete
    result as possible (related to the 'q' tuple) from cache. The rest is calculated
    in different ways depending on contents of 'q' and also on the 'asnc' argument
    (if 0 then the conc is always calculated synchronously and within the same process,
    if 1 then the calculation can involve a) background calculation based on Manatee's
    asynchronous/continuous concordance fetching or b) background calculation with
    no continuous data fetching (i.e. user waits and then the whole result is avail.).

    arguments:
    corp -- a respective manatee.Corpus object
    user_id -- database user ID
    q -- a tuple/list containing an extended query representation
         (e.g. ['aword,[] within <doc id="foo" />', 'p0 ...']); an empty/None
         value produces a finished EmptyConc
    fromp -- a page offset
    pagesize -- a page size (in lines, related to 'fromp')
    asnc -- if 1 then KonText spawns an asynchronous process to calculate the concordance
            and will provide results as they are ready; it is forced to 0 when
            'save' is falsy
    save -- specifies whether to use a caching mechanism
    samplesize -- forwarded unchanged to the calculation helpers
                  (its meaning is not visible in this function)

    returns:
    the concordance object produced by the cache lookup or by one of the
    calculation helpers

    raises:
    ConcCalculationStatusException -- if waiting for an unfinished cached
                                      operation result fails
    """
    if not q:
        return EmptyConc(corp=corp, finished=True)
    # complete bg calc. without continuous data fetching => must accept 0
    if _should_be_bg_query(corp, q, asnc):
        minsize = 0
    elif len(q) > 1 or asnc == 0:  # conc with additional ops. needs whole concordance
        minsize = -1
    else:
        minsize = fromp * pagesize  # happy case for a user
    subchash = getattr(corp, 'subchash', None)
    # default result; it is replaced below once something is found or calculated
    conc = EmptyConc(corp=corp, finished=True)
    # try to locate concordance in cache
    if save:
        # calc_from is the index of the first operation in 'q' not covered by the cached result
        calc_from, conc = find_cached_conc_base(corp, subchash, q, minsize)
        if calc_from == len(q):
            # everything was found in the cache, nothing new to store
            save = 0
        if not conc and q[0][0] == 'R':  # online sample
            # retry the lookup with the leading 'R' stripped from the first operation
            q_copy = list(q)
            q_copy[0] = q[0][1:]
            q_copy = tuple(q_copy)
            # NOTE(review): neither 't' nor 'fullsize' is read anywhere else in this function
            t, c = find_cached_conc_base(corp, subchash, q_copy, -1)
            if c:
                fullsize = c.fullsize()  # TODO fullsize ???
    else:
        # without caching everything but the first operation is applied here
        # and the calculation is never asynchronous
        calc_from = 1
        asnc = 0
    # move mid-sized aligned corpora or large non-aligned corpora to background
    if _should_be_bg_query(corp, q, asnc):
        minsize = fromp * pagesize
        conc = _get_bg_conc(corp=corp, user_id=user_id, q=q, subchash=subchash, samplesize=samplesize,
                            calc_from=calc_from, minsize=minsize)
    else:
        worker = GeneralWorker()
        if isinstance(conc, EmptyConc):
            # nothing usable came from the cache => the first operation is calculated below
            calc_from = 1
            # use Manatee asynchronous conc. calculation (= show 1st page once it's avail.)
            if asnc and len(q) == 1:
                conc = _get_async_conc(corp=corp, user_id=user_id, q=q, subchash=subchash,
                                       samplesize=samplesize, minsize=minsize)
            # do the calc here and return (OK for small to mid sized corpora without alignments)
            else:
                conc = _get_sync_conc(worker=worker, corp=corp, q=q, save=save, subchash=subchash,
                                      samplesize=samplesize)
        # save additional concordance actions to cache (e.g. sample)
        for act in range(calc_from, len(q)):
            # the first character of an operation is the command, the rest are its arguments
            command, args = q[act][0], q[act][1:]
            conc.exec_command(command, args)
            if command in 'gae':  # user specific/volatile actions, cannot save
                # once switched off, 'save' stays off for all the following operations too
                save = 0
            if save:
                cache_map = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)
                cachefile, stored_status = cache_map.add_to_map(subchash, q[:act + 1], conc.size(),
                                                                calc_status=worker.create_new_calc_status())
                if stored_status and not stored_status.finished:
                    # a status was returned for this operation and it is not finished yet => wait for it
                    ready = wait_for_conc(cache_map=cache_map, subchash=subchash,
                                          q=q[:act + 1], minsize=-1)
                    if not ready:
                        raise ConcCalculationStatusException(
                            'Wait for concordance operation failed')
                elif not stored_status:
                    # no status was returned => store our result and mark the record as finished
                    conc.save(cachefile)
                    cache_map.update_calc_status(
                        subchash, q[:act + 1], finished=True, concsize=conc.size())
    return conc