def __call__(self, corpus_name, subc_name, subchash, query, samplesize):
    """
    Register a new concordance calculation: create a pidfile and add an
    initial (size 0) record to the cache map.

    Returns a dict with the cache file path, the new pidfile and a possibly
    pre-existing pidfile of an already running calculation.
    """
    manager = CorpusManager()
    corp = manager.get_Corpus(corpus_name)
    cmap = self._cache_factory.get_mapping(corp)
    pid_file = self._create_pid_file()
    cache_file, stored = cmap.add_to_map(subchash, query, 0, pid_file)
    return dict(cachefile=cache_file, pidfile=pid_file, stored_pidfile=stored)
def __call__(self, initial_args, subc_dir, corpus_name, subc_name, subchash, query, samplesize):
    """
    Run the background concordance calculation, flushing partial results to the
    cache file and reporting progress through the pidfile.

    initial_args -- a dict(cachefile=..., pidfile=..., stored_pidfile=...)
    subc_dir -- a directory where user's subcorpora are stored
    corpus_name -- a corpus identifier
    subc_name -- subcorpus name (should be None if not present)
    subchash -- an identifier of current subcorpus (None if no subcorpus is in use)
    query -- a tuple/list containing current query
    samplesize -- row limit
    """
    sleeptime = None
    try:
        manager = CorpusManager(subcpath=(subc_dir,))
        corp = manager.get_Corpus(corpus_name, subc_name)
        cmap = self._cache_factory.get_mapping(corp)
        if not initial_args.get('stored_pidfile'):
            # compute_conc() returns immediately; completion must be polled
            # via the conc object's finished() method.
            conc = self.compute_conc(corp, query, samplesize)
            sleeptime = 0.1
            time.sleep(sleeptime)
            conc.save(initial_args['cachefile'], False, True, False)  # partial
            while not conc.finished():
                # TODO it looks like append=True does not work with Manatee 2.121.1 properly
                tmp = initial_args['cachefile'] + '.tmp'
                conc.save(tmp, False, True, False)
                os.rename(tmp, initial_args['cachefile'])
                time.sleep(sleeptime)
                sleeptime += 0.1
                sizes = self.get_cached_conc_sizes(corp, query, initial_args['cachefile'])
                self._update_pidfile(
                    initial_args['pidfile'],
                    last_check=int(time.time()),
                    curr_wait=sleeptime,
                    finished=sizes['finished'],
                    concsize=sizes['concsize'],
                    fullsize=sizes['fullsize'],
                    relconcsize=sizes['relconcsize'])
            tmp = initial_args['cachefile'] + '.tmp'
            conc.save(tmp)  # whole
            os.rename(tmp, initial_args['cachefile'])
            # update size in map file
            cmap.add_to_map(subchash, query, conc.size())
            os.remove(initial_args['pidfile'])
    except Exception as e:
        # No cleanup is required here (pidfile of a failed calculation,
        # an unfinished cached concordance etc.) - _get_cached_conc()
        # performs it once a problem is detected.
        import traceback
        logging.getLogger(__name__).error('Background calculation error: %s' % e)
        logging.getLogger(__name__).error(''.join(traceback.format_exception(*sys.exc_info())))
        self._update_pidfile(
            initial_args['pidfile'],
            last_check=int(time.time()),
            curr_wait=sleeptime,
            error=str(e))
def require_existing_conc(corp: AbstractKCorpus, q: Union[Tuple[str, ...], List[str]]) -> PyConc:
    """
    Load a cached concordance based on a provided corpus and query.
    If nothing is found, ConcNotFoundException is thrown.
    """
    cm = CorpusManager(subcpath=[])
    cache_map = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)
    subchash = getattr(corp, 'subchash', None)
    status = cache_map.get_calc_status(subchash, q)
    if status is None:
        raise ConcNotFoundException('Concordance not found: {}'.format(', '.join(q)))
    if status.finished and status.readable:
        # for aligned corpora, the last 'x-...' operation determines the main corpus
        mcorp = corp
        for qq in reversed(q):
            if qq.startswith('x-'):
                mcorp = cm.get_corpus(qq[2:])
                break
        try:
            return PyConc(mcorp, 'l', status.cachefile, orig_corp=corp)
        except manatee.FileAccessError as ex:
            raise ConcNotFoundException(ex)
    raise BrokenConcordanceException(
        'Concordance broken. File: {}, error: {}'.format(status.cachefile, status.error))
def __init__(self, task_id, cache_factory, subc_dirs, corpus_name, subc_name: str, conc_dir: str):
    """
    Prepare the worker: resolve the (sub)corpus, attach the concordance
    directory to it and obtain the matching cache map.
    """
    super().__init__(task_id, cache_factory)
    self.corpus_manager = CorpusManager(subcpath=subc_dirs)
    self.corpus_obj = self.corpus_manager.get_corpus(corpus_name, subcname=subc_name)
    # the corpus object carries its conc directory as a private attribute
    self.corpus_obj._conc_dir = conc_dir
    self.cache_map = self._cache_factory.get_mapping(self.corpus_obj)
def __call__(self, corpus_name, subc_name, subchash, subcpaths, query, samplesize):
    """
    Register a calculation: create a fresh calc-status record and attach it
    to the cache map. Reports whether another calculation already runs.
    """
    manager = CorpusManager(subcpath=subcpaths)
    corp = manager.get_Corpus(corpus_name, subcname=subc_name)
    cmap = self._cache_factory.get_mapping(corp)
    status = self.create_new_calc_status()
    cachefile, prev = cmap.add_to_map(subchash, query, 0, status)
    return {'cachefile': cachefile, 'already_running': prev is not None}
def __call__(self, corpus_name: str, subc_name: str, subchash: Optional[str],
             subcpaths: Tuple[str, ...], query: Tuple[str, ...],
             samplesize: int) -> Dict[str, Any]:
    """
    Register a calculation: create a fresh calc-status record and attach it
    to the cache map. Reports whether another calculation already runs.
    """
    manager = CorpusManager(subcpath=subcpaths)
    corp = manager.get_Corpus(corpus_name, subcname=subc_name)
    cmap = self._cache_factory.get_mapping(corp)
    status = self.create_new_calc_status()
    cachefile, prev = cmap.add_to_map(subchash, query, 0, status)
    return {'cachefile': cachefile, 'already_running': prev is not None}
def __call__(self, corpus_name, subc_name, subchash, subcpath, query, samplesize):
    """
    Register a calculation using a single subcorpus directory; returns the
    cache file path plus a flag telling whether the task already runs.
    """
    manager = CorpusManager(subcpath=(subcpath, ))
    corp = manager.get_Corpus(corpus_name, subc_name)
    cmap = self._cache_factory.get_mapping(corp)
    status = self.create_new_calc_status()
    cachefile, prev = cmap.add_to_map(subchash, query, 0, status)
    return {'cachefile': cachefile, 'already_running': prev is not None}
def __call__(self, corpus_name, subc_name, subchash, query, samplesize):
    """
    Register a new calculation via a pidfile-based cache map record.

    Returns a dict with the cache file, the created pidfile and any pidfile
    stored earlier by a concurrent calculation.
    """
    cm = CorpusManager()
    corp = cm.get_Corpus(corpus_name)
    cache_map = self._cache_factory.get_mapping(corp)
    pid_file = self._create_pid_file()
    cache_file, stored = cache_map.add_to_map(subchash, query, 0, pid_file)
    return {'cachefile': cache_file, 'pidfile': pid_file, 'stored_pidfile': stored}
def __call__(self, initial_args, subc_dir, corpus_name, subc_name, subchash, query, samplesize):
    """
    Perform the background calculation, writing partial results to the cache
    file and progress information to the pidfile.

    initial_args -- a dict(cachefile=..., pidfile=..., stored_pidfile=...)
    subc_dir -- a directory where user's subcorpora are stored
    corpus_name -- a corpus identifier
    subc_name -- subcorpus name (should be None if not present)
    subchash -- an identifier of current subcorpus (None if no subcorpus is in use)
    query -- a tuple/list containing current query
    samplesize -- row limit
    """
    wait = None
    try:
        cm = CorpusManager(subcpath=(subc_dir,))
        corp = cm.get_Corpus(corpus_name, subc_name)
        cmap = self._cache_factory.get_mapping(corp)
        cachefile = initial_args['cachefile']
        pidfile = initial_args['pidfile']
        if not initial_args.get('stored_pidfile'):
            # compute_conc() is asynchronous - the object is available at once
            # but readiness must be polled via finished()
            conc = self.compute_conc(corp, query, samplesize)
            wait = 0.1
            time.sleep(wait)
            conc.save(cachefile, False, True, False)  # partial
            while not conc.finished():
                # TODO it looks like append=True does not work with Manatee 2.121.1 properly
                conc.save(cachefile + '.tmp', False, True, False)
                os.rename(cachefile + '.tmp', cachefile)
                time.sleep(wait)
                wait += 0.1
                sizes = self.get_cached_conc_sizes(corp, query, cachefile)
                self._update_pidfile(
                    pidfile,
                    last_check=int(time.time()),
                    curr_wait=wait,
                    finished=sizes['finished'],
                    concsize=sizes['concsize'],
                    fullsize=sizes['fullsize'],
                    relconcsize=sizes['relconcsize'])
            conc.save(cachefile + '.tmp')  # whole
            os.rename(cachefile + '.tmp', cachefile)
            # update size in map file
            cmap.add_to_map(subchash, query, conc.size())
            os.remove(pidfile)
    except Exception as e:
        # No cleanup needed here (stale pidfile, unfinished cached concordance
        # etc.) - _get_cached_conc() removes such leftovers once it detects
        # a problem.
        import traceback
        logging.getLogger(__name__).error('Background calculation error: %s' % e)
        logging.getLogger(__name__).error(''.join(traceback.format_exception(*sys.exc_info())))
        self._update_pidfile(
            initial_args['pidfile'],
            last_check=int(time.time()),
            curr_wait=wait,
            error=str(e))
def __call__(self, corpus_name: str, subc_name: str, subchash: Optional[str],
             subcpaths: Tuple[str, ...], query: Tuple[str, ...],
             samplesize: int) -> Dict[str, Any]:
    """
    Register a calculation, reusing an existing healthy status record when
    available; otherwise a fresh record overwrites the old one.
    """
    manager = CorpusManager(subcpath=subcpaths)
    corp = manager.get_Corpus(corpus_name, subcname=subc_name)
    cmap = self._cache_factory.get_mapping(corp)
    status = cmap.get_calc_status(subchash, query)
    # a missing or failed record means we must (re)start the calculation
    if status is None or status.error:
        status = cmap.add_to_map(subchash, query, self.create_new_calc_status(), overwrite=True)
        running = False
    else:
        running = True
    return {'cachefile': status.cachefile, 'already_running': running}
def _load_corp(corp_id, subc: str, user_id):
    """
    Instantiate a manatee.Corpus (or manatee.SubCorpus) instance

    arguments:
    corp_id -- a corpus identifier
    subc -- a subcorpus identifier (None if not defined)
    user_id -- numeric user identifier (None => only published subcorpora are searched)
    """
    base_path = settings.get('corpora', 'users_subcpath')
    subc_paths = [os.path.join(base_path, 'published')]
    if user_id is not None:
        # a user's private directory takes precedence over published subcorpora
        subc_paths.insert(0, os.path.join(base_path, str(user_id)))
    cm = CorpusManager(subc_paths)
    return cm.get_corpus(corp_id, '', subc)
async def conc_cache_status_ws_handler(request: web.Request) -> web.WebSocketResponse:
    """
    Serve a websocket that repeatedly reports the status of a concordance
    calculation until it finishes or the client disconnects.
    """
    ws = web.WebSocketResponse()
    await ws.prepare(request)
    logging.debug('Client connected to conc cache status')
    # wait for concordance parameters
    msg = await ws.receive()
    params = json.loads(msg.data)
    logging.debug('Received conc parameters: %s', params)
    subc_paths = [os.path.join(settings.get('corpora', 'users_subcpath'), 'published')]
    with plugins.runtime.AUTH as auth:
        if not auth.is_anonymous(params['user_id']):
            # the user's own subcorpus directory is searched first
            subc_paths.insert(
                0, os.path.join(settings.get('corpora', 'users_subcpath'),
                                str(params['user_id'])))
    manager = CorpusManager(subc_paths)
    corp = manager.get_corpus(corpname=params['corp_id'], subcname=params.get('subc_path', None))
    # poll the cache status until the calculation is done or the socket closes
    while not ws.closed:
        try:
            response = get_conc_cache_status(corp, params['conc_id'])
        except Exception as e:
            response = {'error': str(e), 'finished': True}
        await ws.send_json(response)
        if response['finished']:
            await ws.close()
        else:
            await asyncio.sleep(CONC_CACHE_STATUS_REFRESH_PERIOD)
    logging.debug('Client disconnected from conc cache status')
    return ws
class ConcSyncCalculation(GeneralWorker):
    """
    A worker for calculating a concordance synchronously (from Manatee API
    point of view) but still in background.

    Please note that the worker expects you to create required concordance
    cache mapping records.
    """

    def __init__(self, task_id, cache_factory, subc_dirs, corpus_name, subc_name: str, conc_dir: str):
        super().__init__(task_id, cache_factory)
        self.corpus_manager = CorpusManager(subcpath=subc_dirs)
        self.corpus_obj = self.corpus_manager.get_corpus(corpus_name, subcname=subc_name)
        # attach the concordance directory to the corpus instance
        setattr(self.corpus_obj, '_conc_dir', conc_dir)
        self.cache_map = self._cache_factory.get_mapping(self.corpus_obj)

    def _mark_calc_states_err(self, subchash: Optional[str], query: Tuple[str, ...],
                              from_idx: int, err: BaseException):
        # flag every operation from from_idx up to the last one as failed
        for idx in range(from_idx, len(query)):
            self.cache_map.update_calc_status(subchash, query[:idx + 1], error=err, finished=True)

    def __call__(self, subchash, query: Tuple[str, ...], samplesize: int):
        try:
            calc_from, conc = find_cached_conc_base(self.corpus_obj, subchash, query, minsize=0)
            if isinstance(conc, InitialConc):
                # we have nothing, let's start with the 1st operation only
                for idx in range(len(query)):
                    self.cache_map.add_to_map(
                        subchash, query[:idx + 1], ConcCacheStatus(task_id=self._task_id),
                        overwrite=True)
                calc_status = self.cache_map.get_calc_status(subchash, query[:1])
                conc = self.compute_conc(self.corpus_obj, query[:1], samplesize)
                conc.sync()
                conc.save(calc_status.cachefile)
                os.chmod(calc_status.cachefile, 0o664)
                self.cache_map.update_calc_status(
                    subchash, query[:1], readable=True, finished=True, concsize=conc.size())
                calc_from = 1
            else:
                # a partial result was found - register the remaining operations
                for idx in range(calc_from, len(query)):
                    self.cache_map.add_to_map(
                        subchash, query[:idx + 1], ConcCacheStatus(task_id=self._task_id),
                        overwrite=True)
        except Exception as ex:
            logging.getLogger(__name__).error(ex)
            manatee_err = extract_manatee_error(ex)
            self._mark_calc_states_err(subchash, query, 0, manatee_err if manatee_err else ex)
            return
        # save additional concordance actions to cache (e.g. sample, aligned corpus without a query,...)
        for act in range(calc_from, len(query)):
            try:
                command, args = query[act][0], query[act][1:]
                conc.exec_command(command, args)
                if command in 'gae':  # user specific/volatile actions, cannot save
                    raise NotImplementedError(f'Cannot run command {command} in background')  # TODO
                calc_status = self.cache_map.get_calc_status(subchash, query[:act + 1])
                conc.save(calc_status.cachefile)
                os.chmod(calc_status.cachefile, 0o664)
                self.cache_map.update_calc_status(
                    subchash, query[:act + 1], readable=True, finished=True, concsize=conc.size())
            except Exception as ex:
                self._mark_calc_states_err(subchash, query, act, ex)
                logging.getLogger(__name__).error(ex)
                return
def __call__(self, initial_args, subc_dirs, corpus_name, subc_name, subchash, query, samplesize):
    """
    Run a background concordance calculation, periodically flushing partial
    results into the cache file and updating the cache-map status record.

    initial_args -- a dict(cachefile=..., already_running=...)
    subc_dirs -- a list of directories where to look for subcorpora
    corpus_name -- a corpus identifier
    subc_name -- subcorpus name (should be None if not present)
    subchash -- an identifier of current subcorpus (None if no subcorpus is in use)
    query -- a tuple/list containing current query
    samplesize -- row limit
    """
    cache_map = None
    try:
        corpus_manager = CorpusManager(subcpath=subc_dirs)
        corpus_obj = corpus_manager.get_corpus(corpus_name, subcname=subc_name)
        cache_map = self._cache_factory.get_mapping(corpus_obj)
        if not initial_args['already_running']:
            # The conc object below is asynchronous; i.e. you obtain it immediately but it may
            # not be ready yet (this is checked by the 'finished()' method).
            conc = self.compute_conc(corpus_obj, query, samplesize)
            sleeptime = 0.1
            time.sleep(sleeptime)
            cachefile = initial_args['cachefile']
            conc.save(cachefile, False, True)  # partial
            os.chmod(cachefile, 0o664)
            cache_map.update_calc_status(subchash, query, readable=True, task_id=self._task_id)
            while not conc.finished():
                conc.save(cachefile + '.tmp', False, True)
                os.rename(cachefile + '.tmp', cachefile)
                sizes = self.get_cached_conc_sizes(corpus_obj, query)
                cache_map.update_calc_status(
                    subchash, query,
                    finished=sizes['finished'],
                    concsize=sizes['concsize'],
                    fullsize=sizes['fullsize'],
                    relconcsize=sizes['relconcsize'],
                    arf=None,
                    task_id=self._task_id)
                time.sleep(sleeptime)
                sleeptime += 0.1
            conc.save(cachefile + '.tmp')  # whole
            os.rename(cachefile + '.tmp', cachefile)
            os.chmod(cachefile, 0o664)
            sizes = self.get_cached_conc_sizes(corpus_obj, query)
            cache_map.update_calc_status(
                subchash, query,
                finished=sizes['finished'],
                concsize=conc.size(),
                fullsize=sizes['fullsize'],
                relconcsize=sizes['relconcsize'],
                arf=round(conc.compute_ARF(), 2) if not corpus_obj.is_subcorpus else None,
                task_id=self._task_id)
    except Exception as e:
        # Please note that there is no need to clean any mess (unfinished cached concordance etc.)
        # here as this is performed by _get_cached_conc() in case it detects a problem.
        # Fix: log the error incl. traceback (consistent with the other background workers);
        # without this, a failure occurring before cache_map is created left no trace at all.
        import traceback
        logging.getLogger(__name__).error('Background calculation error: %s' % e)
        logging.getLogger(__name__).error(''.join(traceback.format_exception(*sys.exc_info())))
        manatee_err = extract_manatee_error(e)
        norm_err = manatee_err if manatee_err else e
        if cache_map is not None:
            cache_map.update_calc_status(subchash, query, finished=True, error=norm_err)
def __call__(self, initial_args, subc_dirs, corpus_name, subc_name, subchash, query, samplesize):
    """
    Run a background concordance calculation, flushing partial results into
    the cache file and updating the status record stored in the cache map.

    initial_args -- a dict(cachefile=..., already_running=...)
    subc_dirs -- a list of directories where to look for subcorpora
    corpus_name -- a corpus identifier
    subc_name -- subcorpus name (should be None if not present)
    subchash -- an identifier of current subcorpus (None if no subcorpus is in use)
    query -- a tuple/list containing current query
    samplesize -- row limit
    """
    sleeptime = None
    cache_map = None
    try:
        corpus_manager = CorpusManager(subcpath=subc_dirs)
        corpus_obj = corpus_manager.get_Corpus(corpus_name, subcname=subc_name)
        cache_map = self._cache_factory.get_mapping(corpus_obj)
        if not initial_args['already_running']:
            # The conc object below is asynchronous; i.e. you obtain it immediately but it may
            # not be ready yet (this is checked by the 'finished()' method).
            conc = self.compute_conc(corpus_obj, query, samplesize)
            sleeptime = 0.1
            time.sleep(sleeptime)
            conc.save(initial_args['cachefile'], False, True, False)  # partial
            while not conc.finished():
                # TODO it looks like append=True does not work with Manatee 2.121.1 properly
                tmp_cachefile = initial_args['cachefile'] + '.tmp'
                conc.save(tmp_cachefile, False, True, False)
                os.rename(tmp_cachefile, initial_args['cachefile'])
                time.sleep(sleeptime)
                sleeptime += 0.1
                sizes = self.get_cached_conc_sizes(corpus_obj, query, initial_args['cachefile'])
                cache_map.update_calc_status(subchash, query, dict(
                    curr_wait=sleeptime,
                    finished=sizes['finished'],
                    concsize=sizes['concsize'],
                    fullsize=sizes['fullsize'],
                    relconcsize=sizes['relconcsize'],
                    arf=None,
                    task_id=self._task_id))
            tmp_cachefile = initial_args['cachefile'] + '.tmp'
            conc.save(tmp_cachefile)  # whole
            os.rename(tmp_cachefile, initial_args['cachefile'])
            sizes = self.get_cached_conc_sizes(corpus_obj, query, initial_args['cachefile'])
            cache_map.update_calc_status(subchash, query, dict(
                curr_wait=sleeptime,
                finished=sizes['finished'],
                concsize=sizes['concsize'],
                fullsize=sizes['fullsize'],
                relconcsize=sizes['relconcsize'],
                arf=round(conc.compute_ARF(), 2) if not is_subcorpus(corpus_obj) else None,
                task_id=self._task_id))
            # update size in map file
            cache_map.add_to_map(subchash, query, conc.size())
    except Exception as e:
        # Please note that there is no need to clean any mess (unfinished cached concordance etc.)
        # here as this is performed by _get_cached_conc() in case it detects a problem.
        import traceback
        logging.getLogger(__name__).error('Background calculation error: %s' % e)
        logging.getLogger(__name__).error(''.join(traceback.format_exception(*sys.exc_info())))
        if cache_map is not None:
            # Fix: BaseException has no 'message' attribute in Python 3, so the original
            # "e.message if getattr(e, 'message', None) else ..." almost always stored just
            # the exception class name, losing the actual error text. str(e) preserves it;
            # the class name remains as a fallback for message-less exceptions.
            cache_map.update_calc_status(
                subchash, query, dict(
                    finished=True,
                    curr_wait=sleeptime,
                    error=str(e) if str(e) else e.__class__.__name__))
def __init__(self, conf, ident):
    """
    Store the provider configuration; when the config pins a fixed corpus
    ('corpus' key), instantiate it eagerly, otherwise keep None.
    """
    super().__init__(ident)
    self._conf = conf
    fixed = conf.get('corpus')
    self._preset_corp = CorpusManager().get_corpus(fixed) if fixed else None
def __call__(self, initial_args, subc_dirs, corpus_name, subc_name, subchash, query, samplesize):
    """
    Run the background calculation and keep the cache-map status record in
    sync with the partial/final results.

    initial_args -- a dict(cachefile=..., already_running=...)
    subc_dirs -- a list of directories where to look for subcorpora
    corpus_name -- a corpus identifier
    subc_name -- subcorpus name (should be None if not present)
    subchash -- an identifier of current subcorpus (None if no subcorpus is in use)
    query -- a tuple/list containing current query
    samplesize -- row limit
    """
    cache_map = None
    try:
        cm = CorpusManager(subcpath=subc_dirs)
        corp = cm.get_Corpus(corpus_name, subcname=subc_name)
        cache_map = self._cache_factory.get_mapping(corp)
        if not initial_args['already_running']:
            # the concordance object is asynchronous: it is obtained at once
            # but readiness must be polled via finished()
            conc = self.compute_conc(corp, query, samplesize)
            wait = 0.1
            time.sleep(wait)
            cachefile = initial_args['cachefile']
            conc.save(cachefile, False, True, False)  # partial
            while not conc.finished():
                # TODO it looks like append=True does not work with Manatee 2.121.1 properly
                conc.save(cachefile + '.tmp', False, True, False)
                os.rename(cachefile + '.tmp', cachefile)
                time.sleep(wait)
                wait += 0.1
                sizes = self.get_cached_conc_sizes(corp, query, cachefile)
                cache_map.update_calc_status(
                    subchash, query,
                    finished=sizes['finished'],
                    concsize=sizes['concsize'],
                    fullsize=sizes['fullsize'],
                    relconcsize=sizes['relconcsize'],
                    arf=None,
                    task_id=self._task_id)
            conc.save(cachefile + '.tmp')  # whole
            os.rename(cachefile + '.tmp', cachefile)
            sizes = self.get_cached_conc_sizes(corp, query, cachefile)
            cache_map.update_calc_status(
                subchash, query,
                finished=sizes['finished'],
                concsize=sizes['concsize'],
                fullsize=sizes['fullsize'],
                relconcsize=sizes['relconcsize'],
                arf=round(conc.compute_ARF(), 2) if not is_subcorpus(corp) else None,
                task_id=self._task_id)
            # update size in map file
            cache_map.add_to_map(subchash, query, conc.size())
    except Exception as e:
        # No cleanup is needed here (unfinished cached concordance etc.):
        # _get_cached_conc() takes care of it once a problem is detected.
        import traceback
        logging.getLogger(__name__).error('Background calculation error: %s' % e)
        logging.getLogger(__name__).error(''.join(traceback.format_exception(*sys.exc_info())))
        if cache_map is not None:
            cache_map.update_calc_status(subchash, query, finished=True, error=e)
def find_cached_conc_base(
        corp: AbstractKCorpus, subchash: Optional[str], q: Tuple[str, ...],
        minsize: int) -> Tuple[Optional[int], Union[PyConc, InitialConc]]:
    """
    Load a concordance from cache starting from a complete operation q[:],
    then trying q[:-1], q[:-2], q:[:-i] etc.

    A possible found concordance can be used to skip calculation of already
    available operations q[:-i].

    arguments:
    minsize -- a minimum concordance size to return immediately (synchronously); please
               note that unlike wait_for_conc here we accept also 0

    returns:
    a 2-tuple [an index within 'q' where to start with non-cached results], [a concordance instance]
    """
    corpus_manager = CorpusManager(subcpath=[])
    start_time = time.time()
    cache_map = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)
    cache_map.refresh_map()
    calc_status = cache_map.get_calc_status(subchash, q)
    if calc_status:
        if calc_status.error is None:
            # a record older than the corpus data files cannot be trusted
            if calc_status.created - corp.corp_mtime < 0:
                logging.getLogger(__name__).warning(
                    'Removed outdated cache file (older than corpus indices)')
                cache_map.del_full_entry(subchash, q)
        else:
            # Fix: the original format string was missing its closing parenthesis
            logging.getLogger(__name__).warning(
                'Removed failed calculation cache record (error: {0})'.format(calc_status.error))
            cache_map.del_full_entry(subchash, q)
            raise calc_status.normalized_error
    if _contains_shuffle_seq(q):
        # NOTE(review): presumably repeated shuffles prevent slicing from a longer
        # cached chain, hence searching only the first operation - confirm
        srch_from = 1
    else:
        srch_from = len(q)
    conc = InitialConc(corp=corp)
    ans = (0, conc)
    # try to find the most complete cached operation (e.g. query + filter + sample)
    for i in range(srch_from, 0, -1):
        cache_path = cache_map.readable_cache_path(subchash, q[:i])
        # now we know that someone already calculated the conc (but it might not be finished yet)
        if cache_path:
            try:
                ready = wait_for_conc(
                    cache_map=cache_map, subchash=subchash, q=q[:i], minsize=minsize)
                if not ready:
                    if minsize != 0:
                        cancel_conc_task(cache_map, subchash, q[:i])
                        logging.getLogger(__name__).warning(
                            'Removed unfinished concordance cache record due to exceeded time limit')
                    continue
                _, finished = _check_result(
                    cache_map=cache_map, subchash=subchash, q=q[:i], minsize=minsize)
                if finished:
                    mcorp = corp
                    for qq in reversed(q[:i]):  # find the right main corp, if aligned
                        if qq.startswith('x-'):
                            mcorp = corpus_manager.get_corpus(qq[2:])
                            break
                    conc = PyConc(mcorp, 'l', cache_path, orig_corp=corp)
            except (ConcCalculationStatusException, manatee.FileAccessError) as ex:
                logging.getLogger(__name__).error(
                    f'Failed to use cached concordance for {q[:i]}: {ex}')
                cancel_conc_task(cache_map, subchash, q[:i])
                continue
            ans = (i, conc)
            break
    logging.getLogger(__name__).debug(
        f'find_cached_conc_base({corp.get_conffile()}, [{", ".join(q)}]), '
        f'conc: {conc.__class__.__name__}, '
        f'must calc ops from {i} to {len(q)}, '
        f'time: {(time.time() - start_time):.4f}')
    return ans