def _get_freq_dispersion(self, conc: PyConc, resolution: int) -> List[FreqDispersionBin]:
    conc_begs, values = conc.xdistribution([0] * resolution, 101)
    abs_freq = []
    last_valid_item = None
    for beg in reversed(conc_begs):
        # if beg is 0, it means there are no concordance lines in the bin
        if beg > 0:
            if last_valid_item is None:
                abs_freq.append(int(conc.size()) - beg)
            else:
                # `last_valid_item - beg` is the number of concordance lines
                # between the beginning of the last non-empty bin and the
                # beginning of the current bin (the loop iterates backwards)
                abs_freq.append(last_valid_item - beg)
            last_valid_item = beg
        else:
            abs_freq.append(0)

    freq_dispersion = [
        FreqDispersionBin(
            100 * i / len(conc_begs),
            100 * (i + 0.5) / len(conc_begs),
            100 * (i + 1) / len(conc_begs),
            freq,
        )
        for i, freq in enumerate(reversed(abs_freq))
    ]
    return freq_dispersion

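# A minimal, self-contained sketch of the binning logic above, using
# illustrative values only (it assumes `conc_begs` holds the concordance-line
# index at which each bin starts, with 0 marking an empty bin, which is how
# the comments in _get_freq_dispersion read):
def _dispersion_example() -> None:
    conc_size = 100
    conc_begs = [1, 30, 0, 70]
    abs_freq, last_valid_item = [], None
    for beg in reversed(conc_begs):
        if beg > 0:
            # last bin runs to the end of the concordance; otherwise the
            # frequency is the distance to the next non-empty bin's start
            abs_freq.append((conc_size if last_valid_item is None else last_valid_item) - beg)
            last_valid_item = beg
        else:
            abs_freq.append(0)
    # per-bin frequencies in document order
    assert list(reversed(abs_freq)) == [29, 40, 0, 30]
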
def require_existing_conc(corp: AbstractKCorpus, q: Union[Tuple[str, ...], List[str]]) -> PyConc:
    """
    Load a cached concordance based on a provided corpus and query.
    If nothing is found, ConcNotFoundException is thrown.
    """
    corpus_manager = CorpusManager(subcpath=[])
    cache_map = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)
    subchash = getattr(corp, 'subchash', None)
    status = cache_map.get_calc_status(subchash, q)
    if status is None:
        raise ConcNotFoundException('Concordance not found: {}'.format(', '.join(q)))
    if status.finished and status.readable:
        mcorp = corp
        for qq in reversed(q):  # find the right main corp, if aligned
            if qq.startswith('x-'):
                mcorp = corpus_manager.get_corpus(qq[2:])
                break
        try:
            return PyConc(mcorp, 'l', status.cachefile, orig_corp=corp)
        except manatee.FileAccessError as ex:
            raise ConcNotFoundException(ex)
    raise BrokenConcordanceException(
        'Concordance broken. File: {}, error: {}'.format(status.cachefile, status.error))

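# A hedged usage sketch for require_existing_conc(): the wrapper name
# `_load_or_recalc` and its return convention are hypothetical, but the two
# exception types are the ones the function above documents, and a caller
# has to handle both explicitly.
def _load_or_recalc(corp: AbstractKCorpus, q: Tuple[str, ...]) -> Optional[PyConc]:
    try:
        return require_existing_conc(corp, q)
    except ConcNotFoundException:
        # nothing in the cache - the concordance must be (re)calculated first
        return None
    except BrokenConcordanceException:
        # a cache record exists but is unusable; trigger recalculation as well
        return None
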
def _get_bg_conc(corp: manatee.Corpus, user_id: int, q: Tuple[str, ...], subchash: Optional[str],
                 samplesize: int, calc_from: int, minsize: int) -> Union[PyConc, EmptyConc]:
    """
    arguments:
    calc_from -- from which operation idx (inclusive) we have to calculate respective results
    """
    cache_map = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)
    # let's create cache records of the operations we'll have to perform
    if calc_from < len(q):
        for i in range(calc_from, len(q)):
            cachefile, _ = cache_map.add_to_map(subchash, q[:i + 1], 0, calc_status=CalcStatus())
            if os.path.isfile(cachefile):
                del_silent(cachefile)
                logging.getLogger(__name__).warning(
                    f'Removed unbound conc. cache file {cachefile}')
    app = bgcalc.calc_backend_client(settings)
    app.send_task(
        'conc_sync_calculate',
        (user_id, corp.corpname, getattr(corp, 'subcname', None), subchash, q, samplesize),
        time_limit=TASK_TIME_LIMIT)
    # for smaller concordances/corpora there is a chance the data
    # is ready in a few seconds - let's try this:
    conc_avail = wait_for_conc(cache_map=cache_map, subchash=subchash, q=q, minsize=minsize)
    if conc_avail:
        return PyConc(corp, 'l', cache_map.cache_file_path(subchash, q))
    else:
        # return an empty, yet unfinished concordance to make the client watch the calculation
        return EmptyConc(corp, cache_map.cache_file_path(subchash, q))

def compute_conc(self, corp: manatee.Corpus, q: Tuple[str, ...], samplesize: int) -> PyConc:
    start_time = time.time()
    q = tuple(q)
    if q[0][0] != 'R':
        ans_conc = PyConc(corp, q[0][0], q[0][1:], samplesize)
    else:
        raise NotImplementedError('Function "online sample" is not supported')
    logging.getLogger(__name__).debug(f'compute_conc({corp.corpname}, [{", ".join(q)}]) '
                                      f'-> {(time.time() - start_time):.4f}')
    return ans_conc

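# A hedged illustration of the `q` tuple consumed by compute_conc(): each
# element encodes one operation, where the first character is the operation
# type and the rest its arguments - this is exactly how q[0][0] / q[0][1:]
# are split above. The concrete strings below are illustrative guesses, not
# values taken from this module; 'x-...' (switch to an aligned corpus) is
# the only prefix documented elsewhere in this code.
example_q = (
    'q[word="dog"]',  # assumed: 'q' = initial query, followed by a CQL expression
    'x-corp2',        # switch the main corpus to the aligned corpus 'corp2'
)
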
def _get_async_conc(corp, user_id, q, subchash, samplesize, minsize):
    """
    Register an asynchronous concordance calculation and wait (up to the
    configured limits) for a usable result; if the data is not ready in
    time, an EmptyConc is returned so the client can watch the calculation.
    """
    app = bgcalc.calc_backend_client(settings)
    ans = app.send_task(
        'worker.conc_register',
        (user_id, corp.corpname, getattr(corp, 'subcname', None), subchash, q, samplesize,
         TASK_TIME_LIMIT),
        time_limit=CONC_REGISTER_TASK_LIMIT)
    ans.get(timeout=CONC_REGISTER_WAIT_LIMIT)
    cache_map = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)
    conc_avail = wait_for_conc(cache_map=cache_map, subchash=subchash, q=q, minsize=minsize)
    if conc_avail:
        return PyConc(corp, 'l', cache_map.cache_file_path(subchash, q))
    else:
        return EmptyConc(corp, cache_map.cache_file_path(subchash, q))

def get_existing_conc(corp: manatee.Corpus, q: Tuple[str, ...]) -> manatee.Concordance:
    cache_map = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)
    subchash = getattr(corp, 'subchash', None)
    status = cache_map.get_calc_status(subchash, q)
    if status is None:
        raise ConcNotFoundException('Concordance not found.')
    if status.finished and status.readable:
        mcorp = corp
        for qq in reversed(q):  # find the right main corp, if aligned
            if qq.startswith('x-'):
                mcorp = manatee.Corpus(qq[2:])
                break
        return PyConc(mcorp, 'l', status.cachefile, orig_corp=corp)
    raise BrokenConcordanceException(
        'Concordance broken. File: {}, error: {}'.format(status.cachefile, status.error))

def _get_bg_conc(corp: AbstractKCorpus, user_id: int, q: Tuple[str, ...], subchash: Optional[str],
                 samplesize: int, calc_from: int, minsize: int) -> Union[PyConc, InitialConc]:
    """
    arguments:
    calc_from -- from which operation idx (inclusive) we have to calculate respective results
    """
    cache_map = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)
    status = cache_map.get_calc_status(subchash, q)
    if status and not status.finished:
        # the calc is already running, the client has to wait and check regularly
        return InitialConc(corp, status.cachefile)
    # let's create cache records of the operations we'll have to perform
    if calc_from < len(q):
        for i in range(calc_from, len(q)):
            status = cache_map.add_to_map(subchash, q[:i + 1], ConcCacheStatus(), overwrite=True)
            # the file cannot be valid as otherwise, calc_from would be higher
            if os.path.isfile(status.cachefile):
                del_silent(status.cachefile)
                logging.getLogger(__name__).warning(
                    f'Removed unbound conc. cache file {status.cachefile}')
    worker = bgcalc.calc_backend_client(settings)
    worker.send_task(
        'conc_sync_calculate', object.__class__,
        (user_id, corp.corpname, getattr(corp, 'subcname', None), subchash, q, samplesize),
        time_limit=TASK_TIME_LIMIT)
    # for smaller concordances/corpora there is a chance the data
    # is ready in a few seconds - let's try this:
    conc_avail = wait_for_conc(cache_map=cache_map, subchash=subchash, q=q, minsize=minsize)
    if conc_avail:
        return PyConc(corp, 'l', cache_map.readable_cache_path(subchash, q))
    else:
        # return an empty, yet unfinished concordance to make the client watch the calculation
        return InitialConc(corp, cache_map.readable_cache_path(subchash, q))

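# A minimal caller sketch for _get_bg_conc(); the wrapper name
# `_fetch_conc_sketch` is hypothetical and the isinstance check is an
# assumption about how InitialConc is meant to be consumed. The point is
# the two-way contract documented above: a finished PyConc can be used
# directly, while an InitialConc tells the client to keep watching the
# cache until the worker finishes.
def _fetch_conc_sketch(corp, user_id, q, subchash, samplesize, minsize):
    conc = _get_bg_conc(corp, user_id, q, subchash, samplesize, calc_from=1, minsize=minsize)
    if isinstance(conc, InitialConc):
        pass  # not ready yet: the client should re-check the cache status periodically
    return conc
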
def _freq_dist(self, corp: KCorpus, conc: PyConc, fcrit: str, user_id: int):
    args = freq_calc.FreqCalcArgs(
        corpname=corp.corpname,
        subcname=corp.subcname,
        subcpath=[],
        user_id=user_id,
        pagesize=100,
        samplesize=0,
        flimit=1,
        fcrit=[fcrit],
        ftt_include_empty=0,
        rel_mode=1,
        freq_sort='freq',
        collator_locale='en_US',  # TODO use data provided by corparch plg
        fmaxitems=1,
        fpage=1,
        force_cache=False)
    freqs = [
        conc.xfreq_dist(
            cr, args.flimit, args.freq_sort, args.ftt_include_empty, args.rel_mode,
            args.collator_locale)
        for cr in args.fcrit]
    return freqs[0].get('Items', [])

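# A hedged usage sketch for _freq_dist(). The fcrit string below is an
# illustrative guess at a manatee frequency criterion (roughly
# "<attr>/<flags> <position>"), not a value defined in this module, and
# `self`, `corp`, `conc` and `user_id` would come from the surrounding
# handler:
#
#   items = self._freq_dist(corp, conc, 'word/ie 0~0>0', user_id)
#   for item in items:
#       ...  # each item describes one frequency row
#
# Note that the function returns only the 'Items' part of the first (and
# only) criterion's result.
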
def _get_async_conc(corp, user_id, q, subchash, samplesize, minsize):
    """
    Register an asynchronous concordance calculation (unless a healthy
    calculation record already exists) and wait (up to the configured
    limits) for a usable result; if the data is not ready in time, an
    InitialConc is returned so the client can watch the calculation.
    """
    cache_map = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)
    status = cache_map.get_calc_status(subchash, q)
    if not status or status.error:
        worker = bgcalc.calc_backend_client(settings)
        ans = worker.send_task(
            'conc_register', object.__class__,
            (user_id, corp.corpname, getattr(corp, 'subcname', None), subchash, q, samplesize,
             TASK_TIME_LIMIT),
            time_limit=CONC_REGISTER_TASK_LIMIT)
        ans.get(timeout=CONC_REGISTER_WAIT_LIMIT)
    conc_avail = wait_for_conc(cache_map=cache_map, subchash=subchash, q=q, minsize=minsize)
    if conc_avail:
        return PyConc(corp, 'l', cache_map.readable_cache_path(subchash, q))
    else:
        return InitialConc(corp, cache_map.readable_cache_path(subchash, q))

def find_cached_conc_base(corp: manatee.Corpus, subchash: Optional[str], q: Tuple[str, ...],
                          minsize: int) -> Tuple[Optional[int], manatee.Concordance]:
    """
    Load a concordance from cache starting from the complete operation chain q[:],
    then trying q[:-1], q[:-2], ..., q[:-i] etc. A concordance found this way can
    be used to skip calculation of the already available operations q[:-i].

    arguments:
    minsize -- a minimum concordance size to return immediately (synchronously);
               please note that unlike wait_for_conc, a value of 0 is also accepted here

    returns:
    a 2-tuple (an index within 'q' where to start with non-cached results, a concordance instance)
    """
    start_time = time.time()
    cache_map = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)
    cache_map.refresh_map()
    calc_status = cache_map.get_calc_status(subchash, q)
    if calc_status:
        if calc_status.error is None:
            corp_mtime = corplib_corp_mtime(corp)
            if calc_status.created - corp_mtime < 0:
                logging.getLogger(__name__).warning(
                    'Removed outdated cache file (older than corpus indices)')
                cache_map.del_full_entry(subchash, q)
        else:
            logging.getLogger(__name__).warning(
                'Removed failed calculation cache record (error: {0})'.format(calc_status.error))
            cache_map.del_full_entry(subchash, q)
    if _contains_shuffle_seq(q):
        srch_from = 1
    else:
        srch_from = len(q)

    conc = EmptyConc(corp=corp)
    ans = (0, conc)
    # try to find the most complete cached operation
    # (e.g. query + filter + sample)
    for i in range(srch_from, 0, -1):
        cache_path = cache_map.cache_file_path(subchash, q[:i])
        # now we know that someone already calculated the conc (but it might not be finished yet)
        if cache_path:
            try:
                ready = wait_for_conc(cache_map=cache_map, subchash=subchash, q=q[:i],
                                      minsize=minsize)
                if not ready:
                    if minsize != 0:
                        cancel_async_task(cache_map, subchash, q[:i])
                        logging.getLogger(__name__).warning(
                            'Removed unfinished concordance cache record due to exceeded time limit')
                    continue
                _, finished = _check_result(cache_map=cache_map, subchash=subchash, q=q[:i],
                                            minsize=minsize)
                if finished:
                    mcorp = corp
                    for qq in reversed(q[:i]):  # find the right main corp, if aligned
                        if qq.startswith('x-'):
                            mcorp = manatee.Corpus(qq[2:])
                            break
                    conc = PyConc(mcorp, 'l', cache_path, orig_corp=corp)
            except (ConcCalculationStatusException, manatee.FileAccessError) as ex:
                logging.getLogger(__name__).error(
                    f'Failed to use cached concordance for {q[:i]}: {ex}')
                cancel_async_task(cache_map, subchash, q[:i])
                continue
            ans = (i, conc)
            break
    logging.getLogger(__name__).debug(
        f'get_cached_conc({corp.get_conffile()}, [{", ".join(q)}]), '
        f'conc: {conc.__class__.__name__}, '
        f'missing ops start idx: {i if i < len(q) else "none"}, '
        f'time: {(time.time() - start_time):.4f}')
    return ans
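
# A hedged sketch of how the (calc_from, conc) pair returned above is
# typically consumed; the wrapper name `_resolve_conc_sketch` is
# hypothetical, but the pattern follows the `calc_from` argument documented
# in _get_bg_conc(): a fully cached chain is returned as-is, otherwise the
# missing operations q[calc_from:] are scheduled for background calculation.
def _resolve_conc_sketch(corp, user_id, q, subchash, samplesize, minsize):
    calc_from, conc = find_cached_conc_base(corp, subchash, q, minsize)
    if calc_from < len(q):
        # some operations are missing - let a worker compute them
        conc = _get_bg_conc(corp, user_id, q, subchash, samplesize, calc_from, minsize)
    return conc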