def calc_freqs_bg(args):
    """
    Calculate actual frequency data (background/worker task).

    arguments:
    args -- a FreqCalsArgs instance

    returns:
    a dict(freqs=..., conc_size=...)

    raises:
    UnfinishedConcordanceError -- if the source concordance is still being calculated
    """
    cm = corplib.CorpusManager(subcpath=args.subcpath)
    corp = cm.get_Corpus(args.corpname, args.subcname)
    # FIX: 'async' is a reserved keyword since Python 3.7 (SyntaxError as a kwarg);
    # the argument is spelled 'asnc' as in calculate_colls_bg elsewhere in this source.
    conc = conclib.get_conc(corp=corp, user_id=args.user_id, minsize=args.minsize,
                            q=args.q, fromp=args.fromp, pagesize=args.pagesize,
                            asnc=0, save=args.save, samplesize=args.samplesize)
    if not conc.finished():
        raise UnfinishedConcordanceError(
            _('Cannot calculate yet - source concordance not finished. Please try again later.'))
    # one frequency distribution per freq. criterion
    freqs = [conc.xfreq_dist(cr, args.flimit, args.freq_sort, args.ml,
                             args.ftt_include_empty, args.rel_mode, args.collator_locale)
             for cr in args.fcrit]
    return dict(freqs=freqs, conc_size=conc.size())
def calculate_colls_bg(coll_args):
    """
    Background collocations calculation. This function is expected to be run either
    from Celery or from other process (via multiprocessing).
    """
    corp_mgr = corplib.CorpusManager(subcpath=coll_args.subcpath)
    corpus = corp_mgr.get_Corpus(coll_args.corpname, subcname=coll_args.subcname)
    try:
        # try to fetch precalculated data; if none then MissingSubCorpFreqFile
        corplib.frq_db(corpus, coll_args.cattr)
        concordance = get_conc(corp=corpus, user_id=coll_args.user_id, q=coll_args.q,
                               fromp=0, pagesize=0, asnc=0, save=coll_args.save,
                               samplesize=coll_args.samplesize)
        if not concordance.finished():
            raise UnfinishedConcordanceError(
                _('Cannot calculate yet - source concordance not finished. Please try again later.'))
        coll_data = concordance.collocs(
            cattr=coll_args.cattr, csortfn=coll_args.csortfn, cbgrfns=coll_args.cbgrfns,
            cfromw=coll_args.cfromw, ctow=coll_args.ctow, cminfreq=coll_args.cminfreq,
            cminbgr=coll_args.cminbgr, max_lines=coll_args.num_fetch_items)
        # wrap pos./neg. filter queries into the (arg-name, value) form expected by the client
        for row in coll_data['Items']:
            row['pfilter'] = [('q2', row['pfilter'])]
            row['nfilter'] = [('q2', row['nfilter'])]
        return dict(data=coll_data, processing=0, tasks=[])
    except corplib.MissingSubCorpFreqFile as e:
        # auxiliary freq. data not calculated yet => trigger the calculation
        # and return an empty result with a 'processing' flag
        sub_tasks = freq_calc.build_arf_db(e.corpus, coll_args.cattr)
        answer = {'attrname': coll_args.cattr, 'tasks': []}
        if type(sub_tasks) is list:
            answer['tasks'].extend(sub_tasks)
            answer['processing'] = 1
        else:
            answer['processing'] = 0
        answer['data'] = dict(Items=[], Head=[])
        return answer
def run(self):
    """
    Calculate a 2D frequency (contingency table) distribution.

    note: this is called by Celery worker

    returns:
    a dict(data=..., full_size=...)
    """
    cm = corplib.CorpusManager(subcpath=self._args.subcpath)
    self._corp = cm.get_Corpus(self._args.corpname, subcname=self._args.subcname)
    # FIX: 'async' is a reserved keyword since Python 3.7 (SyntaxError as a kwarg);
    # the argument is spelled 'asnc' as in calculate_colls_bg elsewhere in this source.
    self._conc = conclib.get_conc(corp=self._corp, user_id=self._args.user_id,
                                  minsize=self._args.minsize, q=self._args.q,
                                  fromp=0, pagesize=0, asnc=0, save=0, samplesize=0)
    result, full_size = self.ct_dist(self._args.fcrit, limit=self._args.ctminfreq,
                                     limit_type=self._args.ctminfreq_type)
    # NOTE(review): x[0] + x[1:] is an identity only if rows are plain sequences;
    # presumably ct_dist yields list/tuple rows — confirm against its implementation
    return dict(data=[x[0] + x[1:] for x in result], full_size=full_size)
def form(self, request):
    """
    Word List Form
    """
    self.disabled_menu_items = (MainMenu.VIEW, MainMenu.FILTER, MainMenu.FREQUENCY,
                                MainMenu.COLLOCATIONS, MainMenu.SAVE, MainMenu.CONCORDANCE)
    # reference corpus defaults to the current one
    ref_corpname = request.args.get('ref_corpname', self.args.corpname)
    ref_corp_mgr = corplib.CorpusManager(self.subcpath)
    out = dict(
        RefSubcorp=ref_corp_mgr.subcorp_names(ref_corpname),
        ref_corpname=ref_corpname,
        freq_figures=self.FREQ_FIGURES)
    self._export_subcorpora_list(self.args.corpname, out)
    return out
def export_tags(self, corpname):
    """
    Extract all the tags found in a corpus.

    Parameters
    ----------
    corpname : str
        a name of the corpus we want to extract tags from

    Returns
    -------
    tuple
        unique list of all found tags
    """
    import corplib
    corpus_manager = corplib.CorpusManager()
    target_corpus = corpus_manager.get_Corpus(corpname)
    # a query matching any tag value => kwic lines cover every tag occurrence
    return self.generate_kwiclines('[tag=".*"]', target_corpus)
def calculate_colls_bg(coll_args: CollCalcArgs):
    """
    Background collocations calculation running on a worker server.
    In case auxiliary data files are needed and not present already
    (MissingSubCorpFreqFile exception), the function triggers
    a respective calculation.
    """
    corp_mgr = corplib.CorpusManager(subcpath=coll_args.subcpath)
    corpus = corp_mgr.get_corpus(coll_args.corpname, subcname=coll_args.subcname)
    try:
        # try to fetch precalculated data; if none then MissingSubCorpFreqFile
        corplib.frq_db(corpus, coll_args.cattr)
        concordance = require_existing_conc(corp=corpus, q=coll_args.q)
        if not concordance.finished():
            raise UnfinishedConcordanceError(
                _('Cannot calculate yet - source concordance not finished. Please try again later.'))
        coll_data = concordance.collocs(
            cattr=coll_args.cattr, csortfn=coll_args.csortfn, cbgrfns=coll_args.cbgrfns,
            cfromw=coll_args.cfromw, ctow=coll_args.ctow, cminfreq=coll_args.cminfreq,
            cminbgr=coll_args.cminbgr, max_lines=concordance.size())
        # wrap pos./neg. filter queries into the {arg-name: value} form expected by the client
        for row in coll_data['Items']:
            row['pfilter'] = {'q2': row['pfilter']}
            row['nfilter'] = {'q2': row['nfilter']}
        return dict(data=coll_data, processing=0, tasks=[])
    except MissingSubCorpFreqFile:
        # auxiliary freq. data not calculated yet => trigger the calculation
        # and return an empty result with a 'processing' flag
        sub_tasks = freq_calc.build_arf_db(corpus, coll_args.cattr)
        answer = {'attrname': coll_args.cattr, 'tasks': []}
        if type(sub_tasks) is list:
            answer['tasks'].extend(sub_tasks)
            answer['processing'] = 1
        else:
            answer['processing'] = 0
        answer['data'] = dict(Items=[], Head=[])
        return answer
def __init__(self, user_id, corpus_id, author, description):
    """
    Bind the instance to a user and a corpus and store publishing metadata.
    """
    self._author = author
    self._description = description
    self._user_id = user_id
    manager = corplib.CorpusManager()
    self._cm = manager
    self._corp = manager.get_Corpus(corpus_id)
def __init__(self, user_id, corpus_id):
    """
    Bind the instance to a user and a corpus.
    """
    manager = corplib.CorpusManager()
    self._user_id = user_id
    self._cm = manager
    self._corp = manager.get_Corpus(corpus_id)
def calc_freqs(self, flimit, freq_sort, ml, rel_mode, fcrit, ftt_include_empty, collator_locale,
               fmaxitems, fpage, line_offset):
    """
    Calculate actual frequency data, using an on-disk pickle cache when available.

    Returns:
    a 2-tuple (freq_data, caching_data) where:
        freq_data = dict(lastpage=..., data=..., fstart=..., fmaxitems=..., conc_size=...)
        caching_data = dict(data=..., cache_path=...); can be also None which means 'do not cache'
    """
    # cache key is derived from all parameters that influence the distribution
    cache_path = self._cache_file_path(
        fcrit, flimit, freq_sort, ml, ftt_include_empty, rel_mode, collator_locale)
    cache_ans = None  # stays None unless the result is big enough to be worth caching
    if os.path.isfile(cache_path):
        # cache hit: load (data, conc_size) previously pickled by the caller
        with open(cache_path, 'rb') as f:
            data, conc_size = cPickle.load(f)
    else:
        # cache miss: recreate the concordance and compute one freq. block per criterion
        cm = corplib.CorpusManager(subcpath=self._subcpath)
        corp = cm.get_Corpus(self._corpname, self._subcname)
        # NOTE(review): 'async' became a reserved keyword in Python 3.7; this chunk
        # is Python 2 era code (cPickle) where the kwarg name was still legal
        conc = conclib.get_conc(corp=corp, user_id=self._user_id, minsize=self._minsize,
                                q=self._q, fromp=self._fromp, pagesize=self._pagesize,
                                async=0, save=self._save, samplesize=self._samplesize)
        conc_size = conc.size()
        data = [
            conc.xfreq_dist(cr, flimit, freq_sort, ml, ftt_include_empty, rel_mode,
                            collator_locale)
            for cr in fcrit
        ]
    lastpage = None
    if len(data) == 1:  # a single block => pagination
        total_length = len(data[0]['Items'])
        # only sufficiently large results are scheduled for caching by the caller
        if total_length >= self.min_cached_data_size:
            cache_ans = dict(data=(data, conc_size), cache_path=cache_path)
        items_per_page = fmaxitems
        # fstart/fmaxitems are turned from (page, page size) into absolute item indices;
        # the '+ 1' extra item is fetched to detect whether a next page exists
        fstart = (fpage - 1) * fmaxitems + line_offset
        fmaxitems = fmaxitems * fpage + 1 + line_offset
        if total_length < fmaxitems:
            lastpage = 1
        else:
            lastpage = 0
        ans = [
            dict(Total=total_length,
                 TotalPages=int(math.ceil(total_length / float(items_per_page))),
                 Items=data[0]['Items'][fstart:fmaxitems - 1],  # drop the look-ahead item
                 Head=data[0]['Head'])
        ]
    else:
        # multiple blocks => no pagination, return everything
        ans = data
        fstart = None
    return dict(lastpage=lastpage, data=ans, fstart=fstart, fmaxitems=fmaxitems,
                conc_size=conc_size), cache_ans