Пример #1
0
def calculate_colls_bg(coll_args):
    """
    Calculate collocations in the background.

    The function is meant to be executed outside of the main process,
    i.e. either by Celery or by a separate process (via multiprocessing).

    arguments:
    coll_args -- an object providing corpus identification (subcpath, corpname,
                 subcname), concordance parameters (user_id, q, save, samplesize)
                 and collocation parameters (cattr, csortfn, cbgrfns, cfromw,
                 ctow, cminfreq, cminbgr, num_fetch_items) as attributes

    returns:
    a dict with keys 'data', 'processing' and 'tasks'; if the frequency
    data are missing, the dict also contains 'attrname', 'data' holds
    empty 'Items' and 'Head' lists and 'tasks' is filled from the list
    returned by freq_calc.build_arf_db (if a list was returned)

    raises:
    UnfinishedConcordanceError if the source concordance is not finished yet
    """
    corpus_manager = corplib.CorpusManager(subcpath=coll_args.subcpath)
    corp = corpus_manager.get_Corpus(coll_args.corpname, subcname=coll_args.subcname)
    try:
        # a lookup of the precalculated data; MissingSubCorpFreqFile means
        # there are none and the handler below takes over
        corplib.frq_db(corp, coll_args.cattr)
        conc = get_conc(
            corp=corp,
            user_id=coll_args.user_id,
            q=coll_args.q,
            fromp=0,
            pagesize=0,
            asnc=0,
            save=coll_args.save,
            samplesize=coll_args.samplesize)
        if not conc.finished():
            raise UnfinishedConcordanceError(
                _('Cannot calculate yet - source concordance not finished. Please try again later.'))
        colloc_params = {
            name: getattr(coll_args, name)
            for name in ('cattr', 'csortfn', 'cbgrfns', 'cfromw', 'ctow', 'cminfreq', 'cminbgr')
        }
        collocs = conc.collocs(max_lines=coll_args.num_fetch_items, **colloc_params)
        # wrap both filter values into a list of (key, value) pairs
        for item in collocs['Items']:
            for filter_key in ('pfilter', 'nfilter'):
                item[filter_key] = [('q2', item[filter_key])]
        return {'data': collocs, 'processing': 0, 'tasks': []}
    except corplib.MissingSubCorpFreqFile as e:
        out = freq_calc.build_arf_db(e.corpus, coll_args.cattr)
        # a list return value is treated as a list of started tasks
        has_tasks = type(out) is list
        tasks = []
        if has_tasks:
            tasks.extend(out)
        return {
            'attrname': coll_args.cattr,
            'tasks': tasks,
            'processing': 1 if has_tasks else 0,
            'data': {'Items': [], 'Head': []},
        }
Пример #2
0
def calculate_colls_bg(coll_args):
    """
    Background collocations calculation.

    This function is expected to be run either from Celery or from
    other process (via multiprocessing).

    arguments:
    coll_args -- an object providing corpus identification (subcpath, corpname,
                 subcname), concordance parameters (user_id, minsize, q, save,
                 samplesize), collocation parameters (cattr, csortfn, cbgrfns,
                 cfromw, ctow, cminfreq, cminbgr, num_fetch_items) and
                 corpus_encoding as attributes

    returns:
    a dict with keys 'data', 'processing' and 'tasks'; if the frequency
    data are missing, the dict also contains 'attrname', 'data' holds
    empty 'Items' and 'Head' lists and 'tasks' is filled from the list
    returned by freq_calc.build_arf_db (if a list was returned)

    raises:
    UnfinishedConcordanceError if the source concordance is not finished yet
    """
    cm = corplib.CorpusManager(subcpath=coll_args.subcpath)
    corp = cm.get_Corpus(coll_args.corpname, subcname=coll_args.subcname)
    try:
        # try to fetch precalculated data; if none then MissingSubCorpFreqFile
        corplib.frq_db(corp, coll_args.cattr)
        # 'async' is a reserved word since Python 3.7 so it cannot be written
        # as a literal keyword argument there (the whole module would fail to
        # parse). Unpacking a dict passes the very same keyword to get_conc
        # and works identically on older interpreters.
        conc = conclib.get_conc(corp=corp, user_id=coll_args.user_id, minsize=coll_args.minsize, q=coll_args.q,
                                fromp=0, pagesize=0, save=coll_args.save, samplesize=coll_args.samplesize,
                                **{'async': 0})
        if not conc.finished():
            raise UnfinishedConcordanceError(
                _('Cannot calculate yet - source concordance not finished. Please try again later.'))
        collocs = conc.collocs(cattr=coll_args.cattr, csortfn=coll_args.csortfn, cbgrfns=coll_args.cbgrfns,
                               cfromw=coll_args.cfromw, ctow=coll_args.ctow, cminfreq=coll_args.cminfreq,
                               cminbgr=coll_args.cminbgr, max_lines=coll_args.num_fetch_items)
        for item in collocs['Items']:
            # wrap both filter values into a list of (key, value) pairs
            item['pfilter'] = [('q2', item['pfilter'])]
            item['nfilter'] = [('q2', item['nfilter'])]
            # convert the item's string using the corpus encoding
            item['str'] = import_string(item['str'], from_encoding=coll_args.corpus_encoding)
        return dict(data=collocs, processing=0, tasks=[])
    except corplib.MissingSubCorpFreqFile as e:
        ans = {'attrname': coll_args.cattr, 'tasks': []}
        # the first exception argument is handed over to build_arf_db
        # in the position of a corpus
        out = freq_calc.build_arf_db(e.args[0], coll_args.cattr)
        # a list return value is treated as a list of started tasks
        if type(out) is list:
            processing = 1
            ans['tasks'].extend(out)
        else:
            processing = 0
        ans['processing'] = processing
        ans['data'] = dict(Items=[], Head=[])
        return ans
Пример #3
0
    def submit(self, request):
        """
        Submit a word list query to the calculation backend.

        The form arguments are filled in from the request's JSON, the
        'get_wordlist' task is sent to the backend and its result is
        waited for. If the result is a MissingSubCorpFreqFile instance,
        freq_calc.build_arf_db is called and the subtasks it returns
        are stored and reported back to the caller.

        arguments:
        request -- an object providing the user query via its 'json' attribute

        returns:
        a dict with keys 'corpname', 'usesubcorp', 'freq_files_avail'
        and 'subtasks'

        raises:
        WordlistError if build_arf_db does not return a list
        any other exception instance returned by the backend task
        """
        wl_args = WordlistFormArgs()
        wl_args.update_by_user_query(request.json)
        backend = calc_backend_client(settings)
        response = {
            'corpname': self.args.corpname,
            'usesubcorp': self.args.usesubcorp,
            'freq_files_avail': True,
            'subtasks': [],
        }
        task_args = (wl_args.to_dict(), self.corp.size, self.session_get('user', 'id'))
        pending = backend.send_task('get_wordlist', object.__class__, args=task_args)
        bg_result = pending.get()

        # the task hands a possible error over as a value; the missing
        # frequency file case must be tested before the general one
        if isinstance(bg_result, MissingSubCorpFreqFile):
            data_calc = freq_calc.build_arf_db(
                self.session_get('user', 'id'), self.corp, wl_args.wlattr)
            if type(data_calc) is not list:
                # TODO we should join the current calculation here instead of throwing an error
                raise WordlistError('The data calculation is already running')
            exported_subtasks = response['subtasks']
            for subtask in data_calc:
                self._store_async_task(subtask)
                exported_subtasks.append(subtask.to_dict())
            response['freq_files_avail'] = False
        elif isinstance(bg_result, Exception):
            raise bg_result
        self._curr_wlform_args = wl_args

        def on_conc_store(query_ids, history_ts, result):
            # export the first query ID and (if history_ts is set)
            # store it as the last search
            wl_query_id = query_ids[0]
            result['wl_query_id'] = wl_query_id
            if history_ts:
                self._store_last_search('wlist', wl_query_id)

        self.on_conc_store = on_conc_store
        return response
Пример #4
0
def calculate_colls_bg(coll_args: CollCalcArgs):
    """
    Calculate collocations in the background (on a worker server).

    If the auxiliary frequency data are not available
    (MissingSubCorpFreqFile is raised), the function starts
    their calculation via freq_calc.build_arf_db instead.

    arguments:
    coll_args -- corpus identification (subcpath, corpname, subcname),
                 the concordance query (q) and collocation parameters
                 (cattr, csortfn, cbgrfns, cfromw, ctow, cminfreq, cminbgr)

    returns:
    a dict with keys 'data', 'processing' and 'tasks'; if the frequency
    data are missing, the dict also contains 'attrname', 'data' holds
    empty 'Items' and 'Head' lists and 'tasks' is filled from the list
    returned by freq_calc.build_arf_db (if a list was returned)

    raises:
    UnfinishedConcordanceError if the source concordance is not finished yet
    """
    corpus_manager = corplib.CorpusManager(subcpath=coll_args.subcpath)
    corp = corpus_manager.get_corpus(coll_args.corpname, subcname=coll_args.subcname)
    try:
        # a lookup of the precalculated data; MissingSubCorpFreqFile means
        # there are none and the handler below takes over
        corplib.frq_db(corp, coll_args.cattr)
        conc = require_existing_conc(corp=corp, q=coll_args.q)
        if not conc.finished():
            message = _(
                'Cannot calculate yet - source concordance not finished. Please try again later.')
            raise UnfinishedConcordanceError(message)
        colloc_params = dict(
            cattr=coll_args.cattr,
            csortfn=coll_args.csortfn,
            cbgrfns=coll_args.cbgrfns,
            cfromw=coll_args.cfromw,
            ctow=coll_args.ctow,
            cminfreq=coll_args.cminfreq,
            cminbgr=coll_args.cminbgr,
            # the number of lines is limited by the concordance size
            max_lines=conc.size())
        collocs = conc.collocs(**colloc_params)
        # wrap both filter values into a dict with a single 'q2' key
        for item in collocs['Items']:
            for filter_key in ('pfilter', 'nfilter'):
                item[filter_key] = {'q2': item[filter_key]}
        return {'data': collocs, 'processing': 0, 'tasks': []}
    except MissingSubCorpFreqFile:
        out = freq_calc.build_arf_db(corp, coll_args.cattr)
        # a list return value is treated as a list of started tasks
        has_tasks = type(out) is list
        tasks = []
        if has_tasks:
            tasks.extend(out)
        return {
            'attrname': coll_args.cattr,
            'tasks': tasks,
            'processing': 1 if has_tasks else 0,
            'data': {'Items': [], 'Head': []},
        }
Пример #5
0
    def result(self, wlpat='', paginate=True, wlhash='', blhash=''):
        """
        Produce word list data for the current self.args values.

        arguments:
        wlpat -- only tested for emptiness here; if empty then
                 self.args.wlpat is set to '.*'
        paginate -- if True then only the items of the page self.args.wlpage
                    (plus one look-ahead item used to detect the last page)
                    are requested, otherwise the limit is sys.maxsize
        wlhash -- if non-empty, the white list is loaded via load_bw_file;
                  if empty and self.args.wlwords is non-empty, the words
                  are stored via save_bw_file
        blhash -- the same as wlhash but for the black list

        returns:
        a dict with the result; in case of MissingSubCorpFreqFile the
        dict carries values set by the exception handler ('processing',
        'tasks', empty labels etc.)
        """
        self.disabled_menu_items = (
            MainMenu.VIEW('kwic-sentence', 'structs-attrs'),
            MainMenu.FILTER,
            MainMenu.FREQUENCY,
            MainMenu.COLLOCATIONS,
            MainMenu.CONCORDANCE,
        )
        if not wlpat:
            self.args.wlpat = '.*'
        if '.' in self.args.wlattr:
            # wlnums is rewritten temporarily; the original value is put
            # back once the word list is fetched
            orig_wlnums = self.args.wlnums
            self.args.wlnums = self._wlnums2structattr(self.args.wlnums)

        wlmaxitems = self.args.wlpagesize * self.args.wlpage + 1 if paginate else sys.maxsize
        wlstart = (self.args.wlpage - 1) * self.args.wlpagesize

        def export_reload_args(**extra):
            # self.args is always read anew here as wlsort may get
            # changed between the two calls of this function
            exported = {
                'corpname': self.args.corpname,
                'usesubcorp': self.args.usesubcorp,
                'wlattr': self.args.wlattr,
                'wlpat': self.args.wlpat,
                'wlminfreq': self.args.wlminfreq,
                'include_nonwords': self.args.include_nonwords,
                'wlsort': self.args.wlsort,
                'wlnums': self.args.wlnums,
            }
            exported.update(extra)
            return list(exported.items())

        result = {
            'reload_args': export_reload_args(),
            'form_args': {
                'wlattr': self.args.wlattr,
                'wlpat': self.args.wlpat,
                'wlsort': self.args.wlsort,
                'subcnorm': self.args.subcnorm,
                'wltype': self.args.wltype,
                'wlnums': self.args.wlnums,
                'wlminfreq': self.args.wlminfreq,
                'wlwords': self.args.wlwords,
                'blacklist': self.args.blacklist,
                'wlFileName': '',
                'blFileName': '',
                'includeNonwords': self.args.include_nonwords,
            },
        }
        try:
            if hasattr(self, 'wlfile') and self.args.wlpat == '.*':
                self.args.wlsort = ''

            # words stored under a hash take precedence over the form values
            white_words = self.load_bw_file(wlhash) if wlhash != '' else self.args.wlwords
            black_words = self.load_bw_file(blhash) if blhash != '' else self.args.blacklist

            whitelist = list(filter(None, re.split(r'\s+', white_words.strip())))
            blacklist = list(filter(None, re.split(r'\s+', black_words.strip())))

            if wlhash == '' and len(self.args.wlwords) > 0:
                wlhash = self.save_bw_file(self.args.wlwords)

            if blhash == '' and len(self.args.blacklist) > 0:
                blhash = self.save_bw_file(self.args.blacklist)

            result['reload_args'] = export_reload_args(wlhash=wlhash, blhash=blhash)

            all_items = corplib.wordlist(
                corp=self.corp,
                words=whitelist,
                wlattr=self.args.wlattr,
                wlpat=self.args.wlpat,
                wlminfreq=self.args.wlminfreq,
                wlmaxitems=wlmaxitems,
                wlsort=self.args.wlsort,
                blacklist=blacklist,
                wlnums=self.args.wlnums,
                include_nonwords=self.args.include_nonwords)
            result_list = all_items[wlstart:]
            # less than (page size + 1) items means there is no next page;
            # otherwise the look-ahead item is dropped when paginating
            is_last_page = len(result_list) < self.args.wlpagesize + 1
            if paginate and not is_last_page:
                result_list = result_list[:-1]
            result['Items'] = result_list
            result['lastpage'] = 1 if is_last_page else 0

            if '.' in self.args.wlattr:
                self.args.wlnums = orig_wlnums

            try:
                label = self.corp.get_conf(self.args.wlattr + '.LABEL')
                result['wlattr_label'] = label or self.args.wlattr
            except Exception as e:
                result['wlattr_label'] = self.args.wlattr
                logging.getLogger(__name__).warning(
                    'wlattr_label set failed: %s' % e)

            result['freq_figure'] = translate(
                self.FREQ_FIGURES.get(self.args.wlnums, '?'))
            result['processing'] = None

            quick_save_formats = (
                ('CSV', 'csv'),
                ('XLSX', 'xlsx'),
                ('XML', 'xml'),
                ('TXT', 'text'),
            )
            for label, save_format in quick_save_formats:
                self._add_save_menu_item(
                    label,
                    save_format=save_format,
                    hint=translate(
                        'Saves at most {0} items. Use "Custom" for more options.'.format(
                            self.WORDLIST_QUICK_SAVE_MAX_LINES)))
            # custom save is solved in templates because of compatibility issues
            self._add_save_menu_item(translate('Custom'))
            result['tasks'] = []
            result['SubcorpList'] = []
            result['quick_save_row_limit'] = self.WORDLIST_QUICK_SAVE_MAX_LINES
            self._export_subcorpora_list(self.args.corpname,
                                         self.args.usesubcorp, result)
            return result

        except corplib.MissingSubCorpFreqFile as e:
            # NOTE(review): 'attrname' is taken from self.args.cattr while the
            # calculation below uses self.args.wlattr - confirm this is intended
            # NOTE(review): if the exception is raised before wlnums is restored
            # above, self.args.wlnums keeps the rewritten value - confirm
            result.update({'attrname': self.args.cattr, 'tasks': []})
            out = freq_calc.build_arf_db(e.corpus, self.args.wlattr)
            if type(out) is list:
                result['tasks'].extend(out)
                processing = 0
            else:
                # any other truthy value is passed through as is
                processing = out if out else 0
            result.update({
                'quick_save_row_limit': self.WORDLIST_QUICK_SAVE_MAX_LINES,
                'wlattr': self.args.wlattr,
                'wlattr_label': '',
                'processing': processing,
                'SubcorpList': [],
                'freq_figure': '',
                'lastpage': None,
            })
            return result
Пример #6
0
    def result(self, wlpat='', paginate=True, wlhash='', blhash=''):
        """
        Produce word list data for the current self.args values.

        arguments:
        wlpat -- only tested for emptiness here; if empty then
                 self.args.wlpat is set to '.*'
        paginate -- if True then only the items of the page self.args.wlpage
                    (plus one look-ahead item used to detect the last page)
                    are requested, otherwise the limit is sys.maxint
        wlhash -- if non-empty, the white list is loaded via load_bw_file;
                  if empty and self.args.wlwords is non-empty, the words
                  are stored via save_bw_file
        blhash -- the same as wlhash but for the black list

        returns:
        a dict with the result; in case of MissingSubCorpFreqFile the
        dict carries values set by the exception handler ('processing',
        'tasks', empty labels etc.)
        """
        self.disabled_menu_items = (MainMenu.VIEW('kwic-sentence', 'structs-attrs'),
                                    MainMenu.FILTER, MainMenu.FREQUENCY,
                                    MainMenu.COLLOCATIONS, MainMenu.CONCORDANCE)
        if not wlpat:
            self.args.wlpat = '.*'
        if '.' in self.args.wlattr:
            orig_wlnums = self.args.wlnums
            # TODO get rid of this rewriting of self.args.wlnums; it exists
            # because of hidden dependencies of self.call_function (presumably
            # it reads its arguments from self.args - confirm)
            self.args.wlnums = self._wlnums2structattr(self.args.wlnums)

        if paginate:
            # one extra item is requested to find out whether a next page exists
            wlmaxitems = self.args.wlpagesize * self.args.wlpage + 1
        else:
            # NOTE: sys.maxint is available in Python 2 only
            wlmaxitems = sys.maxint
        wlstart = (self.args.wlpage - 1) * self.args.wlpagesize
        result = {
            'reload_args': {
                'corpname': self.args.corpname, 'usesubcorp': self.args.usesubcorp,
                'wlattr': self.args.wlattr, 'wlpat': self.args.wlpat,
                'wlminfreq': self.args.wlminfreq, 'include_nonwords': self.args.include_nonwords,
                'wlsort': self.args.wlsort, 'wlnums': self.args.wlnums}.items(),
            'form_args': dict(
                wlattr=self.args.wlattr, wlpat=self.args.wlpat, wlsort=self.args.wlsort,
                subcnorm=self.args.subcnorm, wltype=self.args.wltype, wlnums=self.args.wlnums,
                wlminfreq=self.args.wlminfreq, wlwords=self.args.wlwords, blacklist=self.args.blacklist,
                wlFileName='', blFileName='', includeNonwords=self.args.include_nonwords)
        }
        try:
            if hasattr(self, 'wlfile') and self.args.wlpat == '.*':
                self.args.wlsort = ''

            white_words = self.args.wlwords
            black_words = self.args.blacklist

            # words stored under a hash take precedence over the form values
            if wlhash != '':
                white_words = self.load_bw_file(wlhash)

            if blhash != '':
                black_words = self.load_bw_file(blhash)

            # split on whitespace and drop empty strings
            whitelist = [w for w in re.split('\s+', white_words.strip()) if w]
            blacklist = [w for w in re.split('\s+', black_words.strip()) if w]

            # store the form values in case no hash has been provided
            if wlhash == '' and len(self.args.wlwords) > 0:
                wlhash = self.save_bw_file(self.args.wlwords)

            if blhash == '' and len(self.args.blacklist) > 0:
                blhash = self.save_bw_file(self.args.blacklist)

            # reload_args are exported again, now with both hashes (and with
            # the wlsort value possibly changed above)
            result['reload_args'] = {
                'corpname': self.args.corpname, 'usesubcorp': self.args.usesubcorp,
                'wlattr': self.args.wlattr, 'wlpat': self.args.wlpat,
                'wlminfreq': self.args.wlminfreq, 'include_nonwords': self.args.include_nonwords,
                'wlsort': self.args.wlsort, 'wlnums': self.args.wlnums,
                'wlhash': wlhash, 'blhash': blhash
            }.items()

            # NOTE(review): the arguments of corplib.wordlist not listed here are
            # presumably supplied by call_function from self.args - confirm
            result_list = self.call_function(corplib.wordlist,
                                             (self.corp,),
                                             wlmaxitems=wlmaxitems,
                                             words=whitelist,
                                             blacklist=blacklist)[wlstart:]
            result['Items'] = result_list
            # less than (page size + 1) items means there is no next page
            if len(result_list) < self.args.wlpagesize + 1:
                result['lastpage'] = 1
            else:
                result['lastpage'] = 0
                if paginate:
                    # drop the look-ahead item
                    result_list = result_list[:-1]
            result['Items'] = result_list

            # put the original wlnums value back (see the rewriting above)
            if '.' in self.args.wlattr:
                self.args.wlnums = orig_wlnums

            try:
                result['wlattr_label'] = (self.corp.get_conf(self.args.wlattr + '.LABEL') or
                                          self.args.wlattr)
            except Exception as e:
                # the attribute name itself serves as a fallback label
                result['wlattr_label'] = self.args.wlattr
                logging.getLogger(__name__).warning('wlattr_label set failed: %s' % e)

            result['freq_figure'] = translate(self.FREQ_FIGURES.get(self.args.wlnums, '?'))
            result['processing'] = None

            # quick save menu items, one per supported format
            self._add_save_menu_item('CSV', save_format='csv',
                                     hint=translate('Saves at most {0} items. Use "Custom" for more options.'.format(
                                         self.WORDLIST_QUICK_SAVE_MAX_LINES)))
            self._add_save_menu_item('XLSX', save_format='xlsx',
                                     hint=translate('Saves at most {0} items. Use "Custom" for more options.'.format(
                                         self.WORDLIST_QUICK_SAVE_MAX_LINES)))
            self._add_save_menu_item('XML', save_format='xml',
                                     hint=translate('Saves at most {0} items. Use "Custom" for more options.'.format(
                                         self.WORDLIST_QUICK_SAVE_MAX_LINES)))
            self._add_save_menu_item('TXT', save_format='text',
                                     hint=translate('Saves at most {0} items. Use "Custom" for more options.'.format(
                                         self.WORDLIST_QUICK_SAVE_MAX_LINES)))
            self._add_save_menu_item(translate('Custom'))
            # custom save is solved in templates because of compatibility issues
            result['tasks'] = []
            result['SubcorpList'] = []
            result['quick_save_row_limit'] = self.WORDLIST_QUICK_SAVE_MAX_LINES
            self._export_subcorpora_list(self.args.corpname, self.args.usesubcorp, result)
            return result

        except corplib.MissingSubCorpFreqFile as e:
            # NOTE(review): 'attrname' is taken from self.args.cattr while the
            # calculation below uses self.args.wlattr - confirm this is intended
            # NOTE(review): if the exception is raised before wlnums is restored
            # above, self.args.wlnums keeps the rewritten value - confirm
            result.update({'attrname': self.args.cattr, 'tasks': []})
            out = freq_calc.build_arf_db(e.corpus, self.args.wlattr)
            if type(out) is list:
                # a list return value is exported via 'tasks'
                processing = 0
                result['tasks'].extend(out)
            elif out:
                # any other truthy value is passed through as is
                processing = out
            else:
                processing = 0
            result['quick_save_row_limit'] = self.WORDLIST_QUICK_SAVE_MAX_LINES
            result['wlattr'] = self.args.wlattr
            result['wlattr_label'] = ''
            result['processing'] = processing
            result['SubcorpList'] = []
            result['freq_figure'] = ''
            result['lastpage'] = None
            return result