def calculate_colls_bg(coll_args):
    """
    Calculate collocations in the background.

    The function is expected to be run either from Celery or from another
    process (via multiprocessing).

    On success it returns a dict with the keys 'data' (the collocations,
    with each item's 'pfilter' and 'nfilter' wrapped as [('q2', value)]),
    'processing' (0) and 'tasks' (an empty list).

    If corplib.MissingSubCorpFreqFile is raised, freq_calc.build_arf_db is
    called for the corpus attached to the exception and a dict with the keys
    'attrname', 'tasks', 'processing' and an empty 'data' is returned.

    Raises UnfinishedConcordanceError if the source concordance is not
    finished yet.
    """
    manager = corplib.CorpusManager(subcpath=coll_args.subcpath)
    corp = manager.get_Corpus(coll_args.corpname, subcname=coll_args.subcname)
    try:
        # try to fetch precalculated data; if none then MissingSubCorpFreqFile
        corplib.frq_db(corp, coll_args.cattr)

        conc = get_conc(
            corp=corp,
            user_id=coll_args.user_id,
            q=coll_args.q,
            fromp=0,
            pagesize=0,
            asnc=0,
            save=coll_args.save,
            samplesize=coll_args.samplesize,
        )
        if not conc.finished():
            raise UnfinishedConcordanceError(
                _('Cannot calculate yet - source concordance not finished. Please try again later.'))

        result = conc.collocs(
            cattr=coll_args.cattr,
            csortfn=coll_args.csortfn,
            cbgrfns=coll_args.cbgrfns,
            cfromw=coll_args.cfromw,
            ctow=coll_args.ctow,
            cminfreq=coll_args.cminfreq,
            cminbgr=coll_args.cminbgr,
            max_lines=coll_args.num_fetch_items,
        )
        for row in result['Items']:
            row['pfilter'] = [('q2', row['pfilter'])]
            row['nfilter'] = [('q2', row['nfilter'])]
        return {'data': result, 'processing': 0, 'tasks': []}
    except corplib.MissingSubCorpFreqFile as err:
        build_out = freq_calc.build_arf_db(err.corpus, coll_args.cattr)
        # only an exact list is treated as a set of running tasks
        has_tasks = type(build_out) is list
        return {
            'attrname': coll_args.cattr,
            'tasks': list(build_out) if has_tasks else [],
            'processing': 1 if has_tasks else 0,
            'data': {'Items': [], 'Head': []},
        }
def calculate_colls_bg(coll_args):
    """
    Background collocations calculation.

    This function is expected to be run either from Celery or from
    other process (via multiprocessing).

    On success it returns a dict with the keys 'data' (the collocations),
    'processing' (0) and 'tasks' (an empty list). Each collocation item has
    its 'pfilter' and 'nfilter' wrapped as [('q2', value)] and its 'str'
    passed through import_string with from_encoding=coll_args.corpus_encoding.

    If corplib.MissingSubCorpFreqFile is raised, freq_calc.build_arf_db is
    called with the exception's first argument and a dict with the keys
    'attrname', 'tasks', 'processing' and an empty 'data' is returned;
    'processing' is 1 and 'tasks' is filled only if build_arf_db returned
    a list.

    Raises UnfinishedConcordanceError if the source concordance is not
    finished yet.
    """
    cm = corplib.CorpusManager(subcpath=coll_args.subcpath)
    corp = cm.get_Corpus(coll_args.corpname, subcname=coll_args.subcname)
    try:
        # try to fetch precalculated data; if none then MissingSubCorpFreqFile
        corplib.frq_db(corp, coll_args.cattr)
        # `async` is a reserved keyword since Python 3.7, so writing it as a
        # literal keyword argument is a SyntaxError there. Unpacking a dict
        # hands the very same keyword to get_conc and keeps the module
        # importable on both old and new interpreters.
        conc = conclib.get_conc(corp=corp, user_id=coll_args.user_id, minsize=coll_args.minsize,
                                q=coll_args.q, fromp=0, pagesize=0, save=coll_args.save,
                                samplesize=coll_args.samplesize, **{'async': 0})
        if not conc.finished():
            raise UnfinishedConcordanceError(
                _('Cannot calculate yet - source concordance not finished. Please try again later.'))
        collocs = conc.collocs(cattr=coll_args.cattr, csortfn=coll_args.csortfn,
                               cbgrfns=coll_args.cbgrfns, cfromw=coll_args.cfromw,
                               ctow=coll_args.ctow, cminfreq=coll_args.cminfreq,
                               cminbgr=coll_args.cminbgr, max_lines=coll_args.num_fetch_items)
        for item in collocs['Items']:
            item['pfilter'] = [('q2', item['pfilter'])]
            item['nfilter'] = [('q2', item['nfilter'])]
            item['str'] = import_string(item['str'], from_encoding=coll_args.corpus_encoding)
        return dict(data=collocs, processing=0, tasks=[])
    except corplib.MissingSubCorpFreqFile as e:
        ans = {'attrname': coll_args.cattr, 'tasks': []}
        out = freq_calc.build_arf_db(e.args[0], coll_args.cattr)
        if type(out) is list:
            processing = 1
            ans['tasks'].extend(out)
        else:
            processing = 0
        ans['processing'] = processing
        ans['data'] = dict(Items=[], Head=[])
        return ans
def calculate_colls_bg(coll_args: CollCalcArgs):
    """
    Compute collocations in the background on a worker server.

    When auxiliary data files are needed but not present yet (signalled by
    the MissingSubCorpFreqFile exception), the function triggers the
    respective calculation via freq_calc.build_arf_db and returns a dict
    with the keys 'attrname', 'tasks', 'processing' and an empty 'data'.

    Otherwise it returns a dict with the keys 'data' (the collocations, with
    each item's 'pfilter' and 'nfilter' wrapped as {'q2': value}),
    'processing' (0) and 'tasks' (an empty list).

    Raises UnfinishedConcordanceError if the source concordance is not
    finished yet.
    """
    corpus_manager = corplib.CorpusManager(subcpath=coll_args.subcpath)
    corp = corpus_manager.get_corpus(coll_args.corpname, subcname=coll_args.subcname)
    try:
        # try to fetch precalculated data; if none then MissingSubCorpFreqFile
        corplib.frq_db(corp, coll_args.cattr)

        conc = require_existing_conc(corp=corp, q=coll_args.q)
        if not conc.finished():
            raise UnfinishedConcordanceError(
                _('Cannot calculate yet - source concordance not finished. Please try again later.'))

        colls = conc.collocs(
            cattr=coll_args.cattr,
            csortfn=coll_args.csortfn,
            cbgrfns=coll_args.cbgrfns,
            cfromw=coll_args.cfromw,
            ctow=coll_args.ctow,
            cminfreq=coll_args.cminfreq,
            cminbgr=coll_args.cminbgr,
            max_lines=conc.size(),
        )
        for entry in colls['Items']:
            entry['pfilter'] = {'q2': entry['pfilter']}
            entry['nfilter'] = {'q2': entry['nfilter']}
        return {'data': colls, 'processing': 0, 'tasks': []}
    except MissingSubCorpFreqFile:
        fallback = {'attrname': coll_args.cattr, 'tasks': []}
        build_out = freq_calc.build_arf_db(corp, coll_args.cattr)
        # only an exact list is treated as a set of running tasks
        if type(build_out) is list:
            fallback['tasks'].extend(build_out)
            fallback['processing'] = 1
        else:
            fallback['processing'] = 0
        fallback['data'] = {'Items': [], 'Head': []}
        return fallback
def _get_attrfreq(corp: KCorpus, attr, wlattr, wlnums):
    """
    Return frequency data for the attribute named by wlattr.

    A name without a dot is handled as a positional attribute and the result
    of frq_db(corp, wlattr, wlnums) is returned as is.

    A name containing a dot is handled as an attribute of a structure (the
    part before the first dot is the structure name). In that case a dict is
    built which maps each id in range(attr.id_range()) to the value of
    doc_sizes() for that id. The normalization values passed to doc_sizes()
    are keyed by structure begin position and hold either the structure
    length (wlnums == 'doc sizes') or 1.
    """
    if '.' not in wlattr:
        # positional attribute
        return frq_db(corp, wlattr, wlnums)

    # attribute of a structure
    struct = corp.get_struct(wlattr.split('.')[0])
    if wlnums == 'doc sizes':
        normvals = {
            struct.beg(pos): struct.end(pos) - struct.beg(pos)
            for pos in range(struct.size())
        }
    else:
        normvals = {struct.beg(pos): 1 for pos in range(struct.size())}
    return {
        value_id: doc_sizes(corp, struct, wlattr, value_id, normvals)
        for value_id in range(attr.id_range())
    }