Example #1
def _get_bg_conc(corp: manatee.Corpus, user_id: int, q: Tuple[str, ...],
                 subchash: Optional[str], samplesize: int, calc_from: int,
                 minsize: int) -> Union[PyConc, EmptyConc]:
    """
    arguments:
    calc_from - the operation index (inclusive) from which the respective results must be calculated
    """
    cache_map = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)
    # let's create cache records of the operations we'll have to perform
    if calc_from < len(q):
        for i in range(calc_from, len(q)):
            cachefile, _ = cache_map.add_to_map(subchash,
                                                q[:i + 1],
                                                0,
                                                calc_status=CalcStatus())
            if os.path.isfile(cachefile):
                del_silent(cachefile)
                logging.getLogger(__name__).warning(
                    f'Removed unbound conc. cache file {cachefile}')
        app = bgcalc.calc_backend_client(settings)
        app.send_task('conc_sync_calculate',
                      (user_id, corp.corpname, getattr(
                          corp, 'subcname', None), subchash, q, samplesize),
                      time_limit=TASK_TIME_LIMIT)
    # for smaller concordances/corpora there is a chance the data
    # is ready in a few seconds - let's try this:
    conc_avail = wait_for_conc(cache_map=cache_map,
                               subchash=subchash,
                               q=q,
                               minsize=minsize)
    if conc_avail:
        return PyConc(corp, 'l', cache_map.cache_file_path(subchash, q))
    else:
        # return an empty, not-yet-finished concordance so the client keeps polling the calculation
        return EmptyConc(corp, cache_map.cache_file_path(subchash, q))
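
Note: wait_for_conc() is not shown in these examples. A minimal sketch of such a bounded polling helper (the name, sizing and timing below are assumptions, not KonText's actual implementation) might look like this:

import os
import time


def wait_for_cache_file(path: str, min_size: int, timeout: float = 5.0,
                        poll_interval: float = 0.2) -> bool:
    # poll until the cache file exists and reaches min_size bytes,
    # or give up once the timeout elapses
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if os.path.isfile(path) and os.path.getsize(path) >= min_size:
            return True
        time.sleep(poll_interval)
    return False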
Example #2
def _get_async_conc(corp, user_id, q, save, subchash, samplesize, fullsize, minsize):
    """
    Note: the 'save' argument is kept for bonito-open-3.45.11 compatibility but is
    currently unused (TODO: remove it)
    """
    backend = settings.get('calc_backend', 'type')
    if backend == 'multiprocessing':
        from concworker import mp
        mp.create_task(user_id, corp, subchash, q, samplesize).start()
    elif backend in ('celery', 'konserver'):
        import bgcalc
        app = bgcalc.calc_backend_client(settings)
        ans = app.send_task('worker.conc_register', (user_id, corp.corpname, getattr(corp, 'subcname', None),
                                                     subchash, q, samplesize, TASK_TIME_LIMIT), time_limit=10)  # register should be fast
        ans.get()  # = wait for task registration
    else:
        raise ValueError('Unknown concordance calculation backend: %s' % (backend,))

    cache_map = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)
    try:
        _wait_for_conc(cache_map=cache_map, subchash=subchash, q=q, minsize=minsize)
    except Exception as e:
        _cancel_async_task(cache_map, subchash, q)
        raise e
    return PyConc(corp, 'l', cache_map.cache_file_path(subchash, q))
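
Note: the concworker.mp module used above is not included here. Under the multiprocessing backend, create_task() presumably wraps something like the following sketch (the job body and the use of corpname instead of the corpus object are hypothetical):

import multiprocessing


def _calc_conc(user_id, corpname, subchash, q, samplesize):
    # hypothetical placeholder for the actual Manatee-based calculation
    pass


def create_task(user_id, corpname, subchash, q, samplesize):
    # mirrors mp.create_task(...).start(): the caller starts the returned process
    return multiprocessing.Process(
        target=_calc_conc, args=(user_id, corpname, subchash, q, samplesize))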
Example #3
    def submit(self, request):
        form_args = WordlistFormArgs()
        form_args.update_by_user_query(request.json)
        worker = calc_backend_client(settings)
        ans = dict(corpname=self.args.corpname, usesubcorp=self.args.usesubcorp,
                   freq_files_avail=True, subtasks=[])
        async_res = worker.send_task(
            'get_wordlist', object.__class__,
            args=(form_args.to_dict(), self.corp.size, self.session_get('user', 'id')))
        bg_result = async_res.get()
        if isinstance(bg_result, MissingSubCorpFreqFile):
            data_calc = freq_calc.build_arf_db(self.session_get('user', 'id'), self.corp, form_args.wlattr)
            if type(data_calc) is list:
                for subtask in data_calc:
                    self._store_async_task(subtask)
                    ans['subtasks'].append(subtask.to_dict())
                ans['freq_files_avail'] = False
            else:
                # TODO we should join the current calculation here instead of throwing an error
                raise WordlistError('The data calculation is already running')
        elif isinstance(bg_result, Exception):
            raise bg_result
        self._curr_wlform_args = form_args

        def on_conc_store(query_ids, history_ts, result):
            result['wl_query_id'] = query_ids[0]
            if history_ts:
                self._store_last_search('wlist', query_ids[0])

        self.on_conc_store = on_conc_store
        return ans
Example #4
def get_conc_cache_status(corp: KCorpus, conc_id: str):
    cache_map = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)
    q = []
    try:
        with plugins.runtime.QUERY_PERSISTENCE as qp:
            data = qp.open(conc_id)
            q = data.get('q', [])
        cache_status = cache_map.get_calc_status(corp.subchash, q)
        if cache_status is None:  # conc is not cached nor calculated
            return Exception('Concordance calculation is lost')
        elif not cache_status.finished and cache_status.task_id:
            # we must also test the respective task directly, as it might have been
            # killed and thus failed to store its info in the cache metadata
            worker = calc_backend_client(settings)
            err = worker.get_task_error(cache_status.task_id)
            if err is not None:
                raise err
        return {
            'finished': cache_status.finished,
            'concsize': cache_status.concsize,
            'fullsize': cache_status.fullsize,
            'relconcsize': cache_status.relconcsize,
            'arf': cache_status.arf
        }
    except CalcTaskNotFoundError as ex:
        cancel_conc_task(cache_map, corp.subchash, q)
        raise Exception(f'Concordance calculation is lost: {ex}')
    except Exception as ex:
        cancel_conc_task(cache_map, corp.subchash, q)
        raise ex
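
Note: a caller of get_conc_cache_status() would typically poll until 'finished' is set. A minimal sketch (the interval and retry limit are arbitrary choices):

import time


def wait_until_finished(corp, conc_id, poll_interval=1.0, max_tries=30):
    # repeatedly ask for the cache status until the calculation finishes
    for _ in range(max_tries):
        status = get_conc_cache_status(corp, conc_id)
        if status['finished']:
            return status
        time.sleep(poll_interval)
    raise TimeoutError('Concordance calculation did not finish in time')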
Example #5
def calculate_colls(coll_args):
    """
    Calculates the required collocations based on the passed arguments.
    The function can reuse cached values and runs uncached calculations
    on the configured backend (either Celery or multiprocessing).

    returns:
    a dictionary ready to be used in a respective template (collx.tmpl)
    (keys: Head, Items, cmaxitems, attrname, processing, collstart, lastpage)
    """
    if coll_args.num_lines > 0:
        collstart = 0
        collend = coll_args.num_lines
    else:
        collstart = (int(coll_args.collpage) - 1) * \
            int(coll_args.citemsperpage) + int(coll_args.line_offset)
        collend = collstart + int(coll_args.citemsperpage) + 1

    cache = CollCalcCache(corpname=coll_args.corpname, subcname=coll_args.subcname, subcpath=coll_args.subcpath,
                          user_id=coll_args.user_id, q=coll_args.q, minsize=coll_args.minsize, save=coll_args.save,
                          samplesize=coll_args.samplesize)
    collocs, cache_path = cache.get(cattr=coll_args.cattr, csortfn=coll_args.csortfn, cbgrfns=coll_args.cbgrfns,
                                    cfromw=coll_args.cfromw, ctow=coll_args.ctow, cminbgr=coll_args.cminbgr,
                                    cminfreq=coll_args.cminfreq)
    if collocs is None:
        num_fetch_items = CollCalcCache.MANATEE_DEFAULT_NUM_FETCH_LINES
    else:
        num_fetch_items = len(collocs['Items'])

    if collocs is None or collend > num_fetch_items:
        if os.path.isfile(cache_path):  # cache avail. but not enough items
            os.unlink(cache_path)
        if collend >= num_fetch_items:
            num_fetch_items += (collend - num_fetch_items) + 10 * \
                int(coll_args.citemsperpage)  # TODO heuristics :)

        coll_args.cache_path = cache_path
        coll_args.num_fetch_items = num_fetch_items

        backend = settings.get('calc_backend', 'type')
        if backend in ('celery', 'konserver'):
            import bgcalc
            app = bgcalc.calc_backend_client(settings)
            res = app.send_task('worker.calculate_colls', args=(coll_args.to_dict(),),
                                time_limit=TASK_TIME_LIMIT)
            # worker task caches the value AFTER the result is returned (see worker.py)
            ans = res.get()
        elif backend == 'multiprocessing':
            ans = calculate_colls_mp(coll_args)
        else:
            raise ValueError('Unknown calculation backend: %s' % (backend,))
    else:
        ans = dict(data=collocs, processing=0)
    result = dict(
        Head=ans['data']['Head'],
        attrname=coll_args.cattr,
        processing=ans['processing'],
        collstart=collstart,
        lastpage=0 if collstart + coll_args.citemsperpage < len(ans['data']['Items']) else 1,
        Items=ans['data']['Items'][collstart:collend - 1]
    )
    return result
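
Note: to make the paging window above concrete - with collpage=2, citemsperpage=20 and line_offset=0 the code computes:

collpage, citemsperpage, line_offset = 2, 20, 0
collstart = (collpage - 1) * citemsperpage + line_offset  # 20
collend = collstart + citemsperpage + 1                   # 41
# ans['data']['Items'][collstart:collend - 1] == Items[20:40],
# i.e. the second page of 20 rows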
Example #6
    def freq_intersection(self, request):
        """
        Run a paradigmatic query out of existing concordances.

        submitted JSON structure - see models.pquery.common.FreqIntersectionArgs
        """
        worker = bgcalc.calc_backend_client(settings)
        corp_info = self.get_corpus_info(self.args.corpname)

        self._curr_pquery_args = PqueryFormArgs(corpname=self.corp.corpname,
                                                attr=self._get_default_attr(),
                                                position='0<0~0>0')
        self._curr_pquery_args.update_by_user_query(request.json)
        conc_forms, raw_queries = _load_conc_queries(
            self._plugin_ctx, self._curr_pquery_args.conc_ids,
            self.args.corpname, 'query')
        if self._curr_pquery_args.conc_subset_complements:
            conc_forms2, raw_queries2 = _load_conc_queries(
                self._plugin_ctx,
                self._curr_pquery_args.conc_subset_complements.conc_ids,
                self.args.corpname, 'query')
            raw_queries.update(raw_queries2)
        if self._curr_pquery_args.conc_superset:
            conc_forms3, raw_queries3 = _load_conc_queries(
                self._plugin_ctx,
                [self._curr_pquery_args.conc_superset.conc_id],
                self.args.corpname, 'query')
            raw_queries.update(raw_queries3)

        calc_args = (self._curr_pquery_args, raw_queries, self.subcpath,
                     self.session_get('user', 'id'), corp_info.collator_locale
                     if corp_info.collator_locale else 'en_US')
        task_status = worker.send_task('calc_merged_freqs',
                                       object.__class__,
                                       args=calc_args,
                                       time_limit=TASK_TIME_LIMIT)
        sq_items = []
        for conc_id in self._curr_pquery_args.conc_ids:
            sq_items.append(
                conc_forms[conc_id]['curr_queries'][self.args.corpname])
        shortened_q = ' && '.join(f'{{{q}}}' for q in sq_items)
        shortened_q = f'{shortened_q} -> {self._curr_pquery_args.attr}'

        def on_conc_store(query_ids, history_ts, result):
            async_task = AsyncTaskStatus(
                status=task_status.status,
                ident=task_status.id,
                category=AsyncTaskStatus.CATEGORY_PQUERY,
                label=shortened_q,
                args=dict(query_id=query_ids[0], last_update=time.time()),
                url=self.create_url('pquery/result',
                                    dict(q=f'~{query_ids[0]}')))
            self._store_async_task(async_task)
            result['task'] = async_task.to_dict()
            if history_ts:
                self._store_last_search('pquery', query_ids[0])

        self.on_conc_store = on_conc_store
        return {}
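
Note: the task label built above wraps each source query in braces. For two concordance queries and attr='lemma':

sq_items = ['[word="dog"]', '[word="cat"]']
shortened_q = ' && '.join(f'{{{q}}}' for q in sq_items)
shortened_q = f'{shortened_q} -> lemma'
# -> '{[word="dog"]} && {[word="cat"]} -> lemma'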
Example #7
    def process(self, attrname='', worker_tasks=None):
        backend = settings.get('calc_backend', 'type')
        if worker_tasks and backend in ('celery', 'konserver'):
            import bgcalc
            app = bgcalc.calc_backend_client(settings)
            for t in worker_tasks:
                tr = app.AsyncResult(t)
                if tr.status == 'FAILURE':
                    raise bgcalc.ExternalTaskError('Task %s failed' % (t,))
        return {'status': freq_calc.build_arf_db_status(self.corp, attrname)}
Example #8
    def process(self, attrname='', worker_tasks=None):
        backend = settings.get('calc_backend', 'type')
        if worker_tasks and backend in ('celery', 'rq'):
            import bgcalc
            worker = bgcalc.calc_backend_client(settings)
            for t in worker_tasks:
                tr = worker.AsyncResult(t)
                if tr.status == 'FAILURE':
                    raise BgCalcError(f'Task {t} failed')
        return {'status': freq_calc.build_arf_db_status(self.corp, attrname)}
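
Note: the process() variants above share the same failure check. A backend-agnostic sketch of it (assuming only that the worker exposes Celery-style AsyncResult objects with a status attribute):

def first_failed_task(worker, task_ids):
    # return the id of the first failed task, or None if all are healthy
    for t in task_ids:
        if worker.AsyncResult(t).status == 'FAILURE':
            return t
    return None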
Example #9
def calculate_colls(coll_args):
    """
    Calculates the required collocations based on the passed arguments.
    The function can reuse cached values; uncached results are computed
    by the configured bgcalc worker backend.

    returns:
    a dictionary ready to be used in a respective template (collx.tmpl)
    (keys: Head, Items, cmaxitems, attrname, processing, collstart, lastpage)
    """
    if coll_args.num_lines > 0:
        collstart = 0
        collend = coll_args.num_lines
    else:
        collstart = (int(coll_args.collpage) - 1) * \
            int(coll_args.citemsperpage) + int(coll_args.line_offset)
        collend = collstart + int(coll_args.citemsperpage) + 1

    cache = CollCalcCache(corpname=coll_args.corpname, subcname=coll_args.subcname, subcpath=coll_args.subcpath,
                          user_id=coll_args.user_id, q=coll_args.q, save=coll_args.save,
                          samplesize=coll_args.samplesize)
    collocs, cache_path = cache.get(cattr=coll_args.cattr, csortfn=coll_args.csortfn, cbgrfns=coll_args.cbgrfns,
                                    cfromw=coll_args.cfromw, ctow=coll_args.ctow, cminbgr=coll_args.cminbgr,
                                    cminfreq=coll_args.cminfreq)
    if collocs is None:
        num_fetch_items = CollCalcCache.MANATEE_DEFAULT_NUM_FETCH_LINES
    else:
        num_fetch_items = len(collocs['Items'])

    if collocs is None or collend > num_fetch_items:
        if os.path.isfile(cache_path):  # cache avail. but not enough items
            os.unlink(cache_path)
        if collend >= num_fetch_items:
            num_fetch_items += (collend - num_fetch_items) + 10 * \
                int(coll_args.citemsperpage)  # TODO heuristics :)

        coll_args.cache_path = cache_path
        coll_args.num_fetch_items = num_fetch_items
        app = bgcalc.calc_backend_client(settings)
        res = app.send_task('worker.calculate_colls', args=(coll_args.to_dict(),),
                            time_limit=TASK_TIME_LIMIT)
        # worker task caches the value AFTER the result is returned (see worker.py)
        ans = res.get()
    else:
        ans = dict(data=collocs, processing=0)
    result = dict(
        Head=ans['data']['Head'],
        attrname=coll_args.cattr,
        processing=ans['processing'],
        collstart=collstart,
        lastpage=0 if collstart + coll_args.citemsperpage < len(ans['data']['Items']) else 1,
        Items=ans['data']['Items'][collstart:collend - 1]
    )
    return result
Example #10
def calculate_freqs(args):
    """
    Calculates a frequency distribution based on a defined concordance and frequency-related arguments.
    The function caches the data via a background process/task, which prevents KonText
    from recalculating (via Manatee) the full frequency list again and again
    (e.g. when the user moves from page to page).
    """
    cache = FreqCalcCache(corpname=args.corpname, subcname=args.subcname, user_id=args.user_id, subcpath=args.subcpath,
                          minsize=args.minsize, q=args.q, fromp=args.fromp, pagesize=args.pagesize, save=args.save,
                          samplesize=args.samplesize)
    calc_result, cache_path = cache.get(fcrit=args.fcrit, flimit=args.flimit, freq_sort=args.freq_sort, ml=args.ml,
                                        ftt_include_empty=args.ftt_include_empty, rel_mode=args.rel_mode,
                                        collator_locale=args.collator_locale)
    if calc_result is None:
        backend = settings.get('calc_backend', 'type')
        if backend in ('celery', 'konserver'):
            import bgcalc
            args.cache_path = cache_path
            app = bgcalc.calc_backend_client(settings)
            res = app.send_task('worker.calculate_freqs', args=(args.to_dict(),),
                                time_limit=TASK_TIME_LIMIT)
            # worker task caches the value AFTER the result is returned (see worker.py)
            calc_result = res.get()
        elif backend == 'multiprocessing':
            calc_result = calculate_freqs_mp(args)
        else:
            raise ValueError('Unknown calculation backend: %s' % (backend,))

    data = calc_result['freqs']
    conc_size = calc_result['conc_size']
    lastpage = None
    if len(data) == 1:  # a single block => pagination
        total_length = len(data[0]['Items']) if 'Items' in data[0] else 0
        items_per_page = args.fmaxitems
        fstart = (args.fpage - 1) * args.fmaxitems + args.line_offset
        fmaxitems = args.fmaxitems * args.fpage + 1 + args.line_offset
        if total_length < fmaxitems:
            lastpage = 1
        else:
            lastpage = 0
        ans = [dict(Total=total_length,
                    TotalPages=int(math.ceil(total_length / float(items_per_page))),
                    Items=data[0]['Items'][fstart:fmaxitems - 1] if 'Items' in data[0] else [],
                    Head=data[0].get('Head', []))]
    else:
        for item in data:
            if 'Items' not in item:
                item['Items'] = []
            item['Total'] = len(item['Items'])
            item['TotalPages'] = None
        ans = data
        fstart = None
    return dict(lastpage=lastpage, data=ans, fstart=fstart, fmaxitems=args.fmaxitems, conc_size=conc_size)
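
Note: for the single-block case above, with fpage=2, fmaxitems=50 and line_offset=0:

fpage, fmaxitems, line_offset = 2, 50, 0
fstart = (fpage - 1) * fmaxitems + line_offset  # 50
fend = fmaxitems * fpage + 1 + line_offset      # 101 (rebound to 'fmaxitems' in the code above)
# Items[fstart:fend - 1] == Items[50:100], i.e. the second page of 50 rows;
# lastpage == 1 only if fewer than 101 items were fetched in total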
Example #11
def _get_async_conc(corp, user_id, q, subchash, samplesize, minsize):
    """
    """
    app = bgcalc.calc_backend_client(settings)
    ans = app.send_task('worker.conc_register', (user_id, corp.corpname, getattr(corp, 'subcname', None),
                                                 subchash, q, samplesize, TASK_TIME_LIMIT),
                        time_limit=CONC_REGISTER_TASK_LIMIT)
    ans.get(timeout=CONC_REGISTER_WAIT_LIMIT)
    cache_map = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)
    conc_avail = wait_for_conc(cache_map=cache_map, subchash=subchash, q=q, minsize=minsize)
    if conc_avail:
        return PyConc(corp, 'l', cache_map.cache_file_path(subchash, q))
    else:
        return EmptyConc(corp, cache_map.cache_file_path(subchash, q))
Example #12
def cancel_async_task(cache_map: AbstractConcCache, subchash: Optional[str],
                      q: Tuple[str, ...]):
    cachefile = cache_map.cache_file_path(subchash, q)
    status = cache_map.get_calc_status(subchash, q)
    if status:
        try:
            if status.task_id:
                app = bgcalc.calc_backend_client(settings)
                app.control.revoke(status.task_id,
                                   terminate=True,
                                   signal='SIGKILL')
        except IOError:
            pass
    cache_map.del_entry(subchash, q)
    del_silent(cachefile)
Example #13
def _cancel_async_task(cache_map, subchash, q):
    cachefile = cache_map.cache_file_path(subchash, q)
    status = cache_map.get_calc_status(subchash, q)
    backend = settings.get('calc_backend', 'type')
    if backend == 'multiprocessing':
        logging.getLogger(__name__).warning('Unable to cancel async task in multiprocessing mode')
    elif backend in ('celery', 'konserver') and status:
        import bgcalc
        try:
            if status.task_id:
                app = bgcalc.calc_backend_client(settings)
                app.control.revoke(status.task_id, terminate=True, signal='SIGKILL')
        except IOError:
            pass
    cache_map.del_entry(subchash, q)
    _del_silent(cachefile)
Example #14
def calculate_freqs_ct(args):
    """
    note: this is called by webserver
    """
    try:
        app = bgcalc.calc_backend_client(settings)
        res = app.send_task('calculate_freqs_ct',
                            args=(args.to_dict(), ),
                            time_limit=TASK_TIME_LIMIT)
        calc_result = res.get()
    except Exception as ex:
        if is_celery_user_error(ex):
            raise UserActionException(str(ex)) from ex
        else:
            raise ex
    return calc_result
Example #15
def cancel_conc_task(cache_map: AbstractConcCache, subchash: Optional[str],
                     q: Tuple[str, ...]):
    """
    Removes conc. cache entry and also a respective calculation task (silently).
    """
    cachefile = cache_map.readable_cache_path(subchash, q)
    status = cache_map.get_calc_status(subchash, q)
    if status:
        try:
            if status.task_id:
                worker = bgcalc.calc_backend_client(settings)
                worker.control.revoke(status.task_id,
                                      terminate=True,
                                      signal='SIGKILL')
        except (IOError, CalcTaskNotFoundError):
            pass
    cache_map.del_entry(subchash, q)
    del_silent(cachefile)
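
Note: control.revoke(task_id, terminate=True, signal='SIGKILL') follows Celery's control API; terminate=True kills the process currently executing the task instead of merely discarding it from the queue. A defensive wrapper in the spirit of the cancel functions above:

def revoke_silently(worker, task_id):
    # best-effort revocation: transport errors are deliberately swallowed
    try:
        worker.control.revoke(task_id, terminate=True, signal='SIGKILL')
    except IOError:
        pass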
Example #16
def _get_bg_conc(corp: AbstractKCorpus, user_id: int, q: Tuple[str, ...],
                 subchash: Optional[str], samplesize: int, calc_from: int,
                 minsize: int) -> Union[PyConc, InitialConc]:
    """
    arguments:
    calc_from - the operation index (inclusive) from which the respective results must be calculated
    """
    cache_map = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)

    status = cache_map.get_calc_status(subchash, q)
    if status and not status.finished:  # the calc is already running, the client has to wait and check regularly
        return InitialConc(corp, status.cachefile)
    # let's create cache records of the operations we'll have to perform
    if calc_from < len(q):
        for i in range(calc_from, len(q)):
            status = cache_map.add_to_map(subchash,
                                          q[:i + 1],
                                          ConcCacheStatus(),
                                          overwrite=True)
            # the file cannot be valid, as otherwise calc_from would be higher
            if os.path.isfile(status.cachefile):
                del_silent(status.cachefile)
                logging.getLogger(__name__).warning(
                    f'Removed unbound conc. cache file {status.cachefile}')
        worker = bgcalc.calc_backend_client(settings)
        worker.send_task(
            'conc_sync_calculate',
            object.__class__,
            (user_id, corp.corpname, getattr(corp, 'subcname',
                                             None), subchash, q, samplesize),
            time_limit=TASK_TIME_LIMIT)
    # for smaller concordances/corpora there is a chance the data
    # is ready in a few seconds - let's try this:
    conc_avail = wait_for_conc(cache_map=cache_map,
                               subchash=subchash,
                               q=q,
                               minsize=minsize)
    if conc_avail:
        return PyConc(corp, 'l', cache_map.readable_cache_path(subchash, q))
    else:
        # return an empty, not-yet-finished concordance so the client keeps polling the calculation
        return InitialConc(corp, cache_map.readable_cache_path(subchash, q))
Example #17
def calculate_colls(coll_args: CollCalcArgs) -> CalculateCollsResult:
    """
    Calculates required collocations based on passed arguments.
    Result values are cached.

    returns:
    a dictionary ready to be used in a respective template (collx.tmpl)
    (keys: Head, Items, cmaxitems, attrname, processing, collstart, lastpage)
    """
    collstart = (coll_args.collpage - 1) * coll_args.citemsperpage
    collend = collstart + coll_args.citemsperpage
    cache = CollCalcCache(corpname=coll_args.corpname,
                          subcname=coll_args.subcname,
                          subcpath=coll_args.subcpath,
                          user_id=coll_args.user_id,
                          q=coll_args.q,
                          samplesize=coll_args.samplesize)
    collocs, cache_path = cache.get(cattr=coll_args.cattr,
                                    csortfn=coll_args.csortfn,
                                    cbgrfns=coll_args.cbgrfns,
                                    cfromw=coll_args.cfromw,
                                    ctow=coll_args.ctow,
                                    cminbgr=coll_args.cminbgr,
                                    cminfreq=coll_args.cminfreq)
    if collocs is None:
        coll_args.cache_path = cache_path
        worker = bgcalc.calc_backend_client(settings)
        res = worker.send_task('calculate_colls',
                               object.__class__,
                               args=(coll_args, ),
                               time_limit=TASK_TIME_LIMIT)
        # worker task caches the value AFTER the result is returned (see worker.py)
        ans = res.get()
    else:
        ans = dict(data=collocs, processing=0)
    return CalculateCollsResult(
        Head=ans['data']['Head'],
        attrname=coll_args.cattr,
        processing=ans['processing'],
        lastpage=not (collstart + coll_args.citemsperpage < len(
            ans['data']['Items'])),
        Items=ans['data']['Items'][collstart:collend])
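
Note: CalculateCollsResult is not defined in these snippets; judging from the constructor call above it could be a simple dataclass along these lines (a guess, not the actual definition):

from dataclasses import dataclass
from typing import Any, List


@dataclass
class CalculateCollsResult:
    Head: List[Any]
    attrname: str
    processing: int
    lastpage: bool
    Items: List[Any]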
Example #18
def build_arf_db(corp, attrname):
    """
    Provides a higher level wrapper to create_arf_db(). Function creates
    a background process where create_arf_db() is run.
    """
    base_path = corp_freqs_cache_path(corp, attrname)
    if calc_is_running(base_path):
        curr_status = _get_total_calc_status(base_path)
        if curr_status < 100:
            return curr_status

    subc_path = prepare_arf_calc_paths(corp, attrname)
    backend = settings.get('calc_backend', 'type')
    if backend in ('celery', 'konserver'):
        import bgcalc
        app = bgcalc.calc_backend_client(settings)
        task_ids = []
        for m in ('frq', 'arf', 'docf'):
            logfilename_m = create_log_path(base_path, m)
            write_log_header(corp, logfilename_m)
            res = app.send_task('worker.compile_{0}'.format(m),
                                (corp.corpname, subc_path, attrname, logfilename_m),
                                time_limit=TASK_TIME_LIMIT)
            task_ids.append(res.id)
        return task_ids

    elif backend == 'multiprocessing':
        import subprocess

        for m in ('frq', 'arf', 'docf'):
            logfilename_m = create_log_path(base_path, m)
            with open(logfilename_m, 'w') as fw:
                fw.write('%d\n%s\n0 %%' % (os.getpid(), corp.search_size()))
            log = " 2>> '%s'" % logfilename_m
            if subc_path:
                cmd = u"mkstats '%s' '%s' %%s '%s' %s" % (corp.get_confpath(), attrname,
                                                          subc_path.decode('utf-8'), log.decode('utf-8'))
                cmd = cmd.encode('utf-8')
            else:
                cmd = "mkstats '%s' '%s' %%s %s" % (corp.get_confpath(), attrname, log)
            subprocess.call(cmd % 'frq', shell=True)
        return []
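
Note: the multiprocessing branch writes a small progress log (PID, corpus size, then an initial '0 %' line; further 'N %' lines are presumably appended by mkstats through the stderr redirection). A sketch of reading the latest progress back, assuming that format:

def read_calc_progress(log_path):
    # returns (pid, percent_done) parsed from the log written above
    with open(log_path) as fr:
        lines = fr.read().splitlines()
    pid = int(lines[0])
    last = next((ln for ln in reversed(lines) if ln.endswith('%')), '0 %')
    return pid, int(last.split()[0])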
Example #19
def _get_async_conc(corp, user_id, q, subchash, samplesize, minsize):
    """
    """
    cache_map = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)
    status = cache_map.get_calc_status(subchash, q)
    if not status or status.error:
        worker = bgcalc.calc_backend_client(settings)
        ans = worker.send_task(
            'conc_register',
            object.__class__,
            (user_id, corp.corpname, getattr(corp, 'subcname', None), subchash,
             q, samplesize, TASK_TIME_LIMIT),
            time_limit=CONC_REGISTER_TASK_LIMIT)
        ans.get(timeout=CONC_REGISTER_WAIT_LIMIT)
    conc_avail = wait_for_conc(cache_map=cache_map,
                               subchash=subchash,
                               q=q,
                               minsize=minsize)
    if conc_avail:
        return PyConc(corp, 'l', cache_map.readable_cache_path(subchash, q))
    else:
        return InitialConc(corp, cache_map.readable_cache_path(subchash, q))
Example #20
def calculate_freqs_ct(args):
    """
    note: this is called by webserver
    """
    backend = settings.get('calc_backend', 'type')
    if backend in ('celery', 'konserver'):
        import bgcalc
        try:
            app = bgcalc.calc_backend_client(settings)
            res = app.send_task('worker.calculate_freqs_ct', args=(args.to_dict(),),
                                time_limit=TASK_TIME_LIMIT)
            calc_result = res.get()
        except Exception as ex:
            if is_celery_user_error(ex):
                raise UserActionException(str(ex)) from ex
            else:
                raise ex
    elif backend == 'multiprocessing':
        raise NotImplementedError(
            'Multi-processing backend is not yet supported for freq_ct calculation')
    else:
        raise ValueError('Invalid backend')
    return calc_result
Example #21
def build_arf_db(corp, attrname):
    """
    Provides a higher level wrapper to create_arf_db(). Function creates
    a background process where create_arf_db() is run.
    """
    base_path = corp_freqs_cache_path(corp, attrname)
    if calc_is_running(base_path):
        curr_status = _get_total_calc_status(base_path)
        if curr_status < 100:
            return curr_status

    subc_path = prepare_arf_calc_paths(corp, attrname)
    app = bgcalc.calc_backend_client(settings)
    task_ids = []
    for m in ('frq', 'arf', 'docf'):
        logfilename_m = create_log_path(base_path, m)
        write_log_header(corp, logfilename_m)
        res = app.send_task(
            'compile_{0}'.format(m),
            (corp.corpname, subc_path, attrname, logfilename_m),
            time_limit=TASK_TIME_LIMIT)
        task_ids.append(res.id)
    return task_ids
Example #22
    def _create_subcorpus(self, request):
        """
        req. arguments:
        subcname -- name of new subcorpus
        create -- bool, sets whether to create new subcorpus
        cql -- custom within condition
        """
        subcname = request.form['subcname']
        within_json = request.form.get('within_json')
        raw_cql = request.form.get('cql')
        aligned_corpora = request.form.getlist('aligned_corpora')
        publish = bool(int(request.form.get('publish')))
        corpus_info = self.get_corpus_info(self.args.corpname)
        description = request.form.get('description')

        if not subcname:
            raise UserActionException(translate('No subcorpus name specified!'))

        if publish and not description:
            raise UserActionException(translate('No description specified'))

        if raw_cql:
            aligned_corpora = []
            tt_query = ()
            within_cql = raw_cql
            full_cql = 'aword,[] %s' % raw_cql
            imp_cql = (full_cql,)
        elif within_json:  # user entered a subcorpus query manually
            aligned_corpora = []
            tt_query = ()
            within_cql = self._deserialize_custom_within(json.loads(within_json))
            full_cql = 'aword,[] %s' % within_cql
            imp_cql = (full_cql,)
        elif len(aligned_corpora) > 0 and plugins.runtime.LIVE_ATTRIBUTES.exists:
            if corpus_info.metadata.label_attr and corpus_info.metadata.id_attr:
                within_cql = None
                attrs = json.loads(request.form.get('attrs', '{}'))
                sel_match = plugins.runtime.LIVE_ATTRIBUTES.instance.get_attr_values(
                    self._plugin_api, corpus=self.corp,
                    attr_map=attrs,
                    aligned_corpora=aligned_corpora,
                    limit_lists=False)
                values = sel_match['attr_values'][corpus_info.metadata.label_attr]
                args = argmapping.Args()
                setattr(args, 'sca_{0}'.format(
                    corpus_info.metadata.id_attr), [v[1] for v in values])
                tt_query = TextTypeCollector(self.corp, args).get_query()
                tmp = ['<%s %s />' % item for item in tt_query]
                full_cql = ' within '.join(tmp)
                full_cql = 'aword,[] within %s' % full_cql
                full_cql = import_string(full_cql, from_encoding=self.corp_encoding)
                imp_cql = (full_cql,)
            else:
                raise FunctionNotSupported(
                    'Corpus must have a bibliography item defined to support this function')
        else:
            within_cql = None
            tt_query = TextTypeCollector(self.corp, request).get_query()
            tmp = ['<%s %s />' % item for item in tt_query]
            full_cql = ' within '.join(tmp)
            full_cql = 'aword,[] within %s' % full_cql
            full_cql = import_string(full_cql, from_encoding=self.corp_encoding)
            imp_cql = (full_cql,)

        basecorpname = self.args.corpname.split(':')[0]
        path = self.prepare_subc_path(basecorpname, subcname, publish=False)
        publish_path = self.prepare_subc_path(
            basecorpname, subcname, publish=True) if publish else None

        if type(path) == unicode:
            path = path.encode('utf-8')

        if len(tt_query) == 1 and len(aligned_corpora) == 0:
            result = corplib.create_subcorpus(path, self.corp, tt_query[0][0], tt_query[0][1])
            if result and publish_path:
                corplib.mk_publish_links(path, publish_path, self.session_get(
                    'user', 'fullname'), description)
        elif len(tt_query) > 1 or within_cql or len(aligned_corpora) > 0:
            backend = settings.get('calc_backend', 'type')
            if backend in ('celery', 'konserver'):
                import bgcalc
                app = bgcalc.calc_backend_client(settings)
                res = app.send_task('worker.create_subcorpus',
                                    (self.session_get('user', 'id'), self.args.corpname, path, publish_path,
                                     tt_query, imp_cql, self.session_get('user', 'fullname'), description),
                                    time_limit=TASK_TIME_LIMIT)
                self._store_async_task(AsyncTaskStatus(status=res.status, ident=res.id,
                                                       category=AsyncTaskStatus.CATEGORY_SUBCORPUS,
                                                       label=u'%s:%s' % (basecorpname, subcname),
                                                       args=dict(subcname=subcname, corpname=basecorpname)))
                result = {}
            elif backend == 'multiprocessing':
                from bgcalc import subc_calc
                import functools
                import multiprocessing
                worker = subc_calc.CreateSubcorpusTask(user_id=self.session_get('user', 'id'),
                                                       corpus_id=self.args.corpname)
                multiprocessing.Process(target=functools.partial(
                    worker.run, tt_query, imp_cql, path, publish_path, description)).start()
                result = {}
        else:
            raise UserActionException(translate('Nothing specified!'))
        if result is not False:
            with plugins.runtime.SUBC_RESTORE as sr:
                try:
                    sr.store_query(user_id=self.session_get('user', 'id'),
                                   corpname=self.args.corpname,
                                   subcname=subcname,
                                   cql=full_cql.strip().split('[]', 1)[-1])
                except Exception as e:
                    logging.getLogger(__name__).warning('Failed to store subcorpus query: %s' % e)
                    self.add_system_message('warning',
                                            translate('Subcorpus created but there was a problem saving a backup copy.'))
            unfinished_corpora = filter(lambda at: not at.is_finished(),
                                        self.get_async_tasks(category=AsyncTaskStatus.CATEGORY_SUBCORPUS))
            return dict(processed_subc=[uc.to_dict() for uc in unfinished_corpora])
        else:
            raise SubcorpusError(translate('Empty subcorpus!'))
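
Note: the text-type query is turned into a CQL 'within' expression. For instance, for a hypothetical tt_query with two structures:

tt_query = [('doc', 'genre="fiction"'), ('text', 'year="2000"')]
tmp = ['<%s %s />' % item for item in tt_query]
full_cql = 'aword,[] within %s' % ' within '.join(tmp)
# -> 'aword,[] within <doc genre="fiction" /> within <text year="2000" />'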
Example #23
    def _create_subcorpus(self, request: Request) -> Dict[str, Any]:
        """
        req. arguments:
        subcname -- name of new subcorpus
        create -- bool, sets whether to create new subcorpus
        cql -- custom within condition
        """
        within_cql = None
        form_type = request.json['form_type']

        if form_type == 'tt-sel':
            data = CreateSubcorpusArgs(**request.json)
            corpus_info = self.get_corpus_info(data.corpname)
            if (plugins.runtime.LIVE_ATTRIBUTES.exists and
                    plugins.runtime.LIVE_ATTRIBUTES.instance.is_enabled_for(
                        self._plugin_ctx, [data.corpname]
                    )  # TODO here we skip aligned corpora which is debatable
                    and len(data.aligned_corpora) > 0):
                if corpus_info.metadata.label_attr and corpus_info.metadata.id_attr:
                    within_cql = None
                    sel_match = plugins.runtime.LIVE_ATTRIBUTES.instance.get_attr_values(
                        self._plugin_ctx,
                        corpus=self.corp,
                        attr_map=data.text_types,
                        aligned_corpora=data.aligned_corpora,
                        limit_lists=False)
                    sel_attrs = {}
                    for k, vals in sel_match.attr_values.items():
                        if k == corpus_info.metadata.label_attr:
                            k = corpus_info.metadata.id_attr
                        if '.' in k:
                            sel_attrs[k] = [v[1] for v in vals]
                    tt_query = TextTypeCollector(self.corp,
                                                 sel_attrs).get_query()
                    tmp = ['<%s %s />' % item for item in tt_query]
                    full_cql = ' within '.join(tmp)
                    full_cql = 'aword,[] within %s' % full_cql
                    imp_cql = (full_cql, )
                else:
                    raise FunctionNotSupported(
                        'Corpus must have a bibliography item defined to support this function'
                    )
            else:
                tt_query = TextTypeCollector(self.corp,
                                             data.text_types).get_query()
                tmp = ['<%s %s />' % item for item in tt_query]
                full_cql = ' within '.join(tmp)
                full_cql = 'aword,[] within %s' % full_cql
                imp_cql = (full_cql, )
        elif form_type == 'within':
            data = CreateSubcorpusWithinArgs(**request.json)
            tt_query = ()
            within_cql = self._deserialize_custom_within(data.within)
            full_cql = 'aword,[] %s' % within_cql
            imp_cql = (full_cql, )
        elif form_type == 'cql':
            data = CreateSubcorpusRawCQLArgs(**request.json)
            tt_query = ()
            within_cql = data.cql
            full_cql = f'aword,[] {data.cql}'
            imp_cql = (full_cql, )
        else:
            raise UserActionException(
                f'Invalid form type provided - "{form_type}"')

        if not data.subcname:
            raise UserActionException(
                translate('No subcorpus name specified!'))

        if data.publish and not data.description:
            raise UserActionException(translate('No description specified'))

        path = self.prepare_subc_path(self.args.corpname,
                                      data.subcname,
                                      publish=False)
        publish_path = self.prepare_subc_path(
            self.args.corpname, data.subcname,
            publish=True) if data.publish else None

        if len(tt_query) == 1 and not data.has_aligned_corpora():
            result = corplib.create_subcorpus(path, self.corp, tt_query[0][0],
                                              tt_query[0][1])
            if result and publish_path:
                corplib.mk_publish_links(path, publish_path,
                                         self.session_get('user', 'fullname'),
                                         data.description)
        elif len(tt_query) > 1 or within_cql or data.has_aligned_corpora():
            worker = bgcalc.calc_backend_client(settings)
            res = worker.send_task(
                'create_subcorpus',
                object.__class__,
                (self.session_get('user', 'id'), self.args.corpname, path,
                 publish_path, tt_query, imp_cql,
                 self.session_get('user', 'fullname'), data.description),
                time_limit=TASK_TIME_LIMIT)
            self._store_async_task(
                AsyncTaskStatus(status=res.status,
                                ident=res.id,
                                category=AsyncTaskStatus.CATEGORY_SUBCORPUS,
                                label=f'{self.args.corpname}/{data.subcname}',
                                args=dict(subcname=data.subcname,
                                          corpname=self.args.corpname)))
            result = {}
        else:
            raise UserActionException(translate('Nothing specified!'))
        if result is not False:
            with plugins.runtime.SUBC_RESTORE as sr:
                try:
                    sr.store_query(user_id=self.session_get('user', 'id'),
                                   corpname=self.args.corpname,
                                   subcname=data.subcname,
                                   cql=full_cql.strip().split('[]', 1)[-1])
                except Exception as e:
                    logging.getLogger(__name__).warning(
                        'Failed to store subcorpus query: %s' % e)
                    self.add_system_message(
                        'warning',
                        translate(
                            'Subcorpus created but there was a problem saving a backup copy.'
                        ))
            unfinished_corpora = [
                at for at in self.get_async_tasks(
                    category=AsyncTaskStatus.CATEGORY_SUBCORPUS)
                if not at.is_finished()
            ]
            return dict(
                processed_subc=[uc.to_dict() for uc in unfinished_corpora])
        else:
            raise SubcorpusError(translate('Empty subcorpus!'))
Example #24
    def _create_subcorpus(self, request):
        """
        req. arguments:
        subcname -- name of new subcorpus
        create -- bool, sets whether to create new subcorpus
        cql -- custom within condition
        """
        subcname = request.form['subcname']
        within_json = request.form.get('within_json')
        raw_cql = request.form.get('cql')
        aligned_corpora = request.form.getlist('aligned_corpora')
        publish = bool(int(request.form.get('publish')))
        corpus_info = self.get_corpus_info(self.args.corpname)
        description = request.form.get('description')

        if raw_cql:
            aligned_corpora = []
            tt_query = ()
            within_cql = raw_cql
            full_cql = 'aword,[] %s' % raw_cql
            imp_cql = (full_cql, )
        elif within_json:  # user entered a subcorpus query manually
            aligned_corpora = []
            tt_query = ()
            within_cql = self._deserialize_custom_within(
                json.loads(within_json))
            full_cql = 'aword,[] %s' % within_cql
            imp_cql = (full_cql, )
        elif len(aligned_corpora
                 ) > 0 and plugins.runtime.LIVE_ATTRIBUTES.exists:
            if corpus_info.metadata.label_attr and corpus_info.metadata.id_attr:
                within_cql = None
                attrs = json.loads(request.form.get('attrs', '{}'))
                sel_match = plugins.runtime.LIVE_ATTRIBUTES.instance.get_attr_values(
                    self._plugin_api,
                    corpus=self.corp,
                    attr_map=attrs,
                    aligned_corpora=aligned_corpora,
                    limit_lists=False)
                values = sel_match['attr_values'][
                    corpus_info.metadata.label_attr]
                args = argmapping.Args()
                setattr(args, 'sca_{0}'.format(corpus_info.metadata.id_attr),
                        [v[1] for v in values])
                tt_query = TextTypeCollector(self.corp, args).get_query()
                tmp = ['<%s %s />' % item for item in tt_query]
                full_cql = ' within '.join(tmp)
                full_cql = 'aword,[] within %s' % full_cql
                full_cql = import_string(full_cql,
                                         from_encoding=self.corp_encoding)
                imp_cql = (full_cql, )
            else:
                raise FunctionNotSupported(
                    'Corpus must have a bibliography item defined to support this function'
                )
        else:
            within_cql = None
            tt_query = TextTypeCollector(self.corp, request).get_query()
            tmp = ['<%s %s />' % item for item in tt_query]
            full_cql = ' within '.join(tmp)
            full_cql = 'aword,[] within %s' % full_cql
            full_cql = import_string(full_cql,
                                     from_encoding=self.corp_encoding)
            imp_cql = (full_cql, )

        basecorpname = self.args.corpname.split(':')[0]
        if not subcname:
            raise UserActionException(
                translate('No subcorpus name specified!'))
        path = self.prepare_subc_path(basecorpname, subcname, publish=False)
        publish_path = self.prepare_subc_path(
            basecorpname, subcname, publish=True) if publish else None

        if type(path) == unicode:
            path = path.encode('utf-8')

        if len(tt_query) == 1 and len(aligned_corpora) == 0:
            result = corplib.create_subcorpus(path, self.corp, tt_query[0][0],
                                              tt_query[0][1])
            if result and publish_path:
                corplib.mk_publish_links(path, publish_path,
                                         self.session_get('user', 'fullname'),
                                         description)
        elif len(tt_query) > 1 or within_cql or len(aligned_corpora) > 0:
            backend = settings.get('calc_backend', 'type')
            if backend in ('celery', 'konserver'):
                import bgcalc
                app = bgcalc.calc_backend_client(settings)
                res = app.send_task(
                    'worker.create_subcorpus',
                    (self.session_get('user', 'id'), self.args.corpname, path,
                     publish_path, tt_query, imp_cql, description),
                    time_limit=TASK_TIME_LIMIT)
                self._store_async_task(
                    AsyncTaskStatus(
                        status=res.status,
                        ident=res.id,
                        category=AsyncTaskStatus.CATEGORY_SUBCORPUS,
                        label=u'%s:%s' % (basecorpname, subcname),
                        args=dict(subcname=subcname, corpname=basecorpname)))
                result = {}
            elif backend == 'multiprocessing':
                from bgcalc import subc_calc
                import functools
                import multiprocessing
                worker = subc_calc.CreateSubcorpusTask(
                    user_id=self.session_get('user', 'id'),
                    corpus_id=self.args.corpname)
                multiprocessing.Process(target=functools.partial(
                    worker.run, tt_query, imp_cql, path, publish_path,
                    description)).start()
                result = {}
        else:
            raise UserActionException(translate('Nothing specified!'))
        if result is not False:
            with plugins.runtime.SUBC_RESTORE as sr:
                try:
                    sr.store_query(user_id=self.session_get('user', 'id'),
                                   corpname=self.args.corpname,
                                   subcname=subcname,
                                   cql=full_cql.strip().split('[]', 1)[-1])
                except Exception as e:
                    logging.getLogger(__name__).warning(
                        'Failed to store subcorpus query: %s' % e)
                    self.add_system_message(
                        'warning',
                        translate(
                            'Subcorpus created but there was a problem saving a backup copy.'
                        ))
            unfinished_corpora = filter(
                lambda at: not at.is_finished(),
                self.get_async_tasks(
                    category=AsyncTaskStatus.CATEGORY_SUBCORPUS))
            return dict(
                processed_subc=[uc.to_dict() for uc in unfinished_corpora])
        else:
            raise SubcorpusError(translate('Empty subcorpus!'))