def calculate_freqs(args):
    args = freq_calc.FreqCalsArgs(**args)
    calculate_freqs.cache_path = args.cache_path
    ans = freq_calc.calc_freqs_bg(args)
    trigger_cache_limit = settings.get_int('corpora', 'freqs_cache_min_lines', 10)
    if args.force_cache or max(len(d.get('Items', ())) for d in ans['freqs']) >= trigger_cache_limit:
        calculate_freqs.cache_data = ans
    else:
        calculate_freqs.cache_data = None
    return ans

def export_with_norms(self, subcorpattrs='', format_num=True, ret_nums=True, subcnorm='tokens'):
    """
    Returns a text types table also containing information about
    the total occurrences of respective attribute values.

    See corplib.texttype_values for arguments and returned value
    """
    ans = {}
    if not subcorpattrs:
        subcorpattrs = self._corp.get_conf('SUBCORPATTRS')
        if not subcorpattrs:
            subcorpattrs = self._corp.get_conf('FULLREF')
    if not subcorpattrs or subcorpattrs == '#':
        raise TextTypesException(
            _('Missing display configuration of structural attributes (SUBCORPATTRS or FULLREF).'))
    corpus_info = plugins.get('corparch').get_corpus_info(self._corpname)
    maxlistsize = settings.get_int('global', 'max_attr_list_size')
    # if 'live_attributes' is installed then always shrink bibliographical
    # entries even if their count is < maxlistsize
    subcorp_attr_list = re.split(r'\s*[,|]\s*', subcorpattrs)
    if plugins.has_plugin('live_attributes'):
        ans['bib_attr'] = corpus_info['metadata']['label_attr']
        list_none = (ans['bib_attr'], )
        tmp = [s for s in subcorp_attr_list]  # making a copy here
        if ans['bib_attr'] and ans['bib_attr'] not in tmp:  # if the bib attr is not in subcorpattrs,
            tmp.append(ans['bib_attr'])                     # we add it there
            subcorpattrs = '|'.join(tmp)  # we ignore NoSkE '|' vs. ',' stuff deliberately here
    else:
        ans['bib_attr'] = None
        list_none = ()
    tt = self._tt_cache.get_values(corp=self._corp, subcorpattrs=subcorpattrs, maxlistsize=maxlistsize,
                                   shrink_list=list_none, collator_locale=corpus_info.collator_locale)
    self._add_tt_custom_metadata(tt)
    if ret_nums:
        struct_calc = collections.OrderedDict()
        for item in subcorp_attr_list:
            k = item.split('.')[0]
            struct_calc[k] = CachedStructNormsCalc(self._corp, k, subcnorm, db=plugins.get('db'))
        for col in reduce(lambda p, c: p + c['Line'], tt, []):
            if 'textboxlength' not in col:
                structname, attrname = col['name'].split('.')
                for val in col['Values']:
                    v = struct_calc[structname].compute_norm(attrname, val['v'])
                    val['xcnt'] = l10n.format_number(v) if format_num else v
        ans['Blocks'] = tt
        ans['Normslist'] = self._get_normslist(list(struct_calc.keys())[0])
    else:
        ans['Blocks'] = tt
        ans['Normslist'] = []
    return ans

def text_types(self):
    ans = {}
    maxlistsize = settings.get_int('global', 'max_attr_list_size')
    subcorpattrs = self.current_corpus.get_conf('SUBCORPATTRS')
    if not subcorpattrs:
        subcorpattrs = self.current_corpus.get_conf('FULLREF')
    tt = get_tt(self.current_corpus, self).export(subcorpattrs, maxlistsize)
    for item in tt:
        for tt2 in item['Line']:
            ans[tt2['name']] = {'type': 'default',
                                'values': [x['v'] for x in tt2.get('Values', [])]}
    return ans

def calculate_colls(coll_args):
    """
    arguments:
    coll_args -- dict-serialized coll_calc.CollCalcArgs
    """
    coll_args = coll_calc.CollCalcArgs(**coll_args)
    calculate_colls.cache_path = coll_args.cache_path
    ans = coll_calc.calculate_colls_bg(coll_args)
    trigger_cache_limit = settings.get_int('corpora', 'colls_cache_min_lines', 10)
    if not ans['processing'] and len(ans['data']['Items']) >= trigger_cache_limit:
        calculate_colls.cache_data = ans['data']
    else:
        calculate_colls.cache_data = None
    return ans

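# Note: calculate_freqs and calculate_colls stash `cache_path` and `cache_data` on the
# function object itself, so a wrapper can presumably pick them up after the call and
# persist the data. The decorator below is a hypothetical sketch of that pattern, not
# the actual KonText implementation.
def cache_results_of(fn):
    def wrapper(*args, **kwargs):
        ans = fn(*args, **kwargs)
        # persist whatever the wrapped function flagged as cacheable
        if getattr(fn, 'cache_data', None) is not None:
            with open(fn.cache_path, 'wb') as fw:
                pickle.dump(fn.cache_data, fw)
        return ans
    return wrapper
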
def smtp_factory():
    """
    Create a new SMTP instance with some predefined stuff
    :return:
    """
    username = settings.get('mailing', 'auth_username')
    password = settings.get('mailing', 'auth_password')
    port = settings.get_int('mailing', 'smtp_port', 25)
    use_tls = settings.get_bool('mailing', 'use_tls', False)
    server = smtplib.SMTP(settings.get('mailing', 'smtp_server'), port=port)
    if use_tls:
        server.starttls()
    if username and password:
        server.login(username, password)
    return server

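# A minimal usage sketch for smtp_factory(). The helper name, the message fields and the
# 'mailing'/'sender' configuration key are assumptions for illustration, not part of the
# original code.
from email.mime.text import MIMEText

def send_notification(recipients, subject, body):
    msg = MIMEText(body, 'plain', 'utf-8')
    msg['Subject'] = subject
    msg['From'] = settings.get('mailing', 'sender')  # assumed config key
    msg['To'] = ', '.join(recipients)
    server = smtp_factory()
    try:
        server.sendmail(msg['From'], recipients, msg.as_string())
    finally:
        server.quit()  # close the connection even if sending fails
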
def calculate_colls_mp(coll_args):
    """
    Background calculation of collocations using the 'multiprocessing' package.
    """
    import multiprocessing

    def cache_results(cache_path, data):
        with open(cache_path, 'wb') as f:
            pickle.dump(data, f)

    ans = calculate_colls_bg(coll_args)
    if len(ans['Items']) >= settings.get_int('corpora', 'colls_cache_min_lines', 10):
        # cache only if it's worth it
        multiprocessing.Process(target=cache_results, args=(coll_args.cache_path, ans)).start()
    return ans

def clean_colls_cache():
    root_dir = settings.get('corpora', 'colls_cache_dir')
    cache_ttl = settings.get_int('corpora', 'colls_cache_ttl', 3600)
    test_time = time.time()
    all_files = os.listdir(root_dir)
    num_removed = 0
    num_error = 0
    for item in all_files:
        file_path = os.path.join(root_dir, item)
        if test_time - os.path.getmtime(file_path) >= cache_ttl:
            try:
                os.unlink(file_path)
                num_removed += 1
            except OSError:
                num_error += 1
    return dict(total_files=len(all_files), num_removed=num_removed, num_error=num_error)

def clean_freqs_cache():
    root_dir = settings.get('corpora', 'freqs_cache_dir')
    cache_ttl = settings.get_int('corpora', 'freqs_cache_ttl', 3600)
    test_time = time.time()
    all_files = os.listdir(root_dir)
    num_removed = 0
    num_error = 0
    for item in all_files:
        file_path = os.path.join(root_dir, item)
        if test_time - os.path.getmtime(file_path) >= cache_ttl:
            try:
                os.unlink(file_path)
                num_removed += 1
            except OSError:
                num_error += 1
    return dict(total_files=len(all_files), num_removed=num_removed, num_error=num_error)

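# clean_colls_cache and clean_freqs_cache differ only in the configuration keys they read.
# A shared helper along these lines (the name _clean_cache_dir is hypothetical) would remove
# the duplication:
def _clean_cache_dir(root_dir, cache_ttl):
    test_time = time.time()
    all_files = os.listdir(root_dir)
    num_removed = 0
    num_error = 0
    for item in all_files:
        file_path = os.path.join(root_dir, item)
        if test_time - os.path.getmtime(file_path) >= cache_ttl:
            try:
                os.unlink(file_path)
                num_removed += 1
            except OSError:
                num_error += 1
    return dict(total_files=len(all_files), num_removed=num_removed, num_error=num_error)

def clean_colls_cache():
    return _clean_cache_dir(settings.get('corpora', 'colls_cache_dir'),
                            settings.get_int('corpora', 'colls_cache_ttl', 3600))

def clean_freqs_cache():
    return _clean_cache_dir(settings.get('corpora', 'freqs_cache_dir'),
                            settings.get_int('corpora', 'freqs_cache_ttl', 3600))
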
def fcs_search(self, corp: KCorpus, corpname, fcs_query, max_rec, start):
    """
    aux function for federated content search: operation=searchRetrieve
    """
    query = fcs_query.replace('+', ' ')  # convert URL spaces
    exact_match = True  # attr=".*value.*"
    if 'exact' in query.lower() and '=' not in query:  # lemma EXACT "dog"
        pos = query.lower().index('exact')  # first occurrence of EXACT
        query = query[:pos] + '=' + query[pos + 5:]  # replace the 1st 'exact' with '='
        exact_match = True
    attrs = corp.get_posattrs()  # list of available attrs
    try:  # parse query
        if '=' in query:  # lemma=word | lemma="word" | lemma="w1 w2" | word=""
            attr, term = query.split('=')
            attr = attr.strip()
            term = term.strip()
        else:  # "w1 w2" | "word" | word
            attr = 'word'
            # use one of the search attributes if present among the corpus attributes,
            # otherwise use `word` - fails below if not valid
            for sa in self.search_attrs:
                if sa in attrs:
                    attr = sa
                    break
            term = query.strip()
        if '"' in attr:
            raise Exception
        if '"' in term:  # "word" | "word1 word2" | "" | "it is \"good\""
            if term[0] != '"' or term[-1] != '"':  # check quotation marks
                raise Exception
            term = term[1:-1].strip()  # remove quotation marks
            if ' ' in term:  # multi-word term
                if exact_match:
                    rq = ' '.join('[%s="%s"]' % (attr, t) for t in term.split())
                else:
                    rq = ' '.join('[%s=".*%s.*"]' % (attr, t) for t in term.split())
            elif term.strip() == '':  # ""
                raise Exception  # empty term
            else:  # one-word term
                if exact_match:
                    rq = '[%s="%s"]' % (attr, term)
                else:
                    rq = '[%s=".*%s.*"]' % (attr, term)
        else:  # must be a single-word term
            if ' ' in term:
                raise Exception
            if exact_match:  # build query
                rq = '[%s="%s"]' % (attr, term)
            else:
                rq = '[%s=".*%s.*"]' % (attr, term)
    except:  # there was a problem when parsing
        raise Exception(10, query, 'Query syntax error')
    if attr not in attrs:
        raise Exception(16, attr, 'Unsupported index')
    fromp = int(math.floor((start - 1) / max_rec)) + 1
    # try to get the concordance
    try:
        anon_id = plugins.runtime.AUTH.instance.anonymous_user()['id']
        q = ['q' + rq]
        conc = get_conc(corp, anon_id, q=q, fromp=fromp, pagesize=max_rec, asnc=0)
    except Exception as e:
        raise Exception(10, repr(e), 'Query syntax error')
    kwic = kwiclib.Kwic(corp, corpname, conc)
    kwic_args = kwiclib.KwicPageArgs({'structs': ''}, base_attr=Kontext.BASE_ATTR)
    kwic_args.fromp = fromp
    kwic_args.pagesize = max_rec
    kwic_args.leftctx = '-{0}'.format(settings.get_int('fcs', 'kwic_context', 5))
    kwic_args.rightctx = '{0}'.format(settings.get_int('fcs', 'kwic_context', 5))
    page = kwic.kwicpage(kwic_args)  # convert the concordance
    local_offset = (start - 1) % max_rec
    if start - 1 > conc.size():
        raise Exception(61, 'startRecord', 'First record position out of range')
    rows = [(kwicline['Left'][0]['str'], kwicline['Kwic'][0]['str'],
             kwicline['Right'][0]['str'], kwicline['ref'])
            for kwicline in page['Lines']][local_offset:local_offset + max_rec]
    return rows, conc.size()

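# For illustration, a few inputs and the CQL query `rq` they produce, derived from the
# parsing branches above (assuming `word` is the fallback attribute):
#
#   'dog'                -> [word="dog"]                     (no '=', falls back to `word`)
#   'lemma=dog'          -> [lemma="dog"]
#   'lemma="black dog"'  -> [lemma="black"] [lemma="dog"]    (multi-word term is split)
#   'lemma EXACT "dog"'  -> [lemma="dog"]                    ('EXACT' is rewritten to '=')
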
def export_with_norms(self, subcorpattrs='', ret_nums=True, subcnorm='tokens'):
    """
    Returns a text types table also containing information about
    the total occurrences of respective attribute values.

    See corplib.texttype_values for arguments and returned value
    """
    ans = {}
    if not subcorpattrs:
        subcorpattrs = self._corp.get_conf('SUBCORPATTRS')
        if not subcorpattrs:
            subcorpattrs = self._corp.get_conf('FULLREF')
    if not subcorpattrs or subcorpattrs == '#':
        raise TextTypesException(
            _('Missing display configuration of structural attributes (SUBCORPATTRS or FULLREF).'))
    corpus_info = plugins.runtime.CORPARCH.instance.get_corpus_info(
        self._plugin_api.user_lang, self._corpname)
    maxlistsize = settings.get_int('global', 'max_attr_list_size')
    # if 'live_attributes' is installed then always shrink bibliographical
    # entries even if their count is < maxlistsize
    subcorp_attr_list_tmp = re.split(r'\s*[,|]\s*', subcorpattrs)
    subcorp_attr_list = collections.OrderedDict(
        zip(subcorp_attr_list_tmp, [None] * len(subcorp_attr_list_tmp))).keys()
    subcorpattrs = '|'.join(subcorp_attr_list)
    if len(subcorp_attr_list_tmp) != len(subcorp_attr_list):
        logging.getLogger(__name__).warning('Duplicate SUBCORPATTRS item found')
    if plugins.runtime.LIVE_ATTRIBUTES.exists:
        ans['bib_attr'] = corpus_info['metadata']['label_attr']
        ans['id_attr'] = corpus_info['metadata']['id_attr']
        # We have to ensure that the bibliography item (which uses different values
        # for labels and different values for actual identifiers) is represented
        # as an input box on the client side. Passing list_none with a bib_attr element
        # to get_values()'s shrink_list ensures this.
        # Please see public/files/js/stores/textTypes/attrValues.ts for more information
        # on how the bibliography attr. box is handled on the client.
        list_none = (ans['bib_attr'], )
        tmp = [s for s in subcorp_attr_list]  # making a copy here
        if ans['bib_attr'] and ans['bib_attr'] not in tmp:  # if the bib attr is not in subcorpattrs,
            tmp.append(ans['bib_attr'])  # we add it there
            subcorpattrs = '|'.join(tmp)  # we ignore NoSkE '|' vs. ',' stuff deliberately here
    else:
        ans['bib_attr'] = None
        ans['id_attr'] = None
        list_none = ()
    tt = self._tt_cache.get_values(corp=self._corp, subcorpattrs=subcorpattrs, maxlistsize=maxlistsize,
                                   shrink_list=list_none, collator_locale=corpus_info.collator_locale)
    self._add_tt_custom_metadata(tt)
    if ret_nums:
        struct_calc = collections.OrderedDict()
        for item in subcorp_attr_list:
            k = item.split('.')[0]
            struct_calc[k] = CachedStructNormsCalc(self._corp, k, subcnorm, self._tt_cache)
        cache_ok = True
        for col in reduce(lambda p, c: p + c['Line'], tt, []):
            if 'textboxlength' not in col:
                structname, attrname = col['name'].split('.')
                for val in col['Values']:
                    try:
                        v = struct_calc[structname].compute_norm(attrname, val['v'])
                    except KeyError:
                        v = 0  # no problem here as the value is actually not required by subcorpattrs
                        cache_ok = False
                    val['xcnt'] = v
        if not cache_ok:
            self._tt_cache.clear(self._corp)
            logging.getLogger(__name__).warning(
                'Removed invalid tt cache entry for corpus {0}'.format(self._corpname))
        ans['Blocks'] = tt
        ans['Normslist'] = self._get_normslist(list(struct_calc.keys())[0])
    else:
        ans['Blocks'] = tt
        ans['Normslist'] = []
    return ans

try:
    import cPickle as pickle
except ImportError:
    import pickle
import hashlib
import os
import time

import corplib
import conclib
from bgcalc import freq_calc
from l10n import import_string
import settings
from structures import FixedDict
from bgcalc import UnfinishedConcordanceError
from translation import ugettext as _

TASK_TIME_LIMIT = settings.get_int('global', 'calc_backend_time_limit', 300)


class CollCalcArgs(FixedDict):
    """
    Collects all the required arguments passed around when
    calculating collocation profiles.
    """
    q = None
    user_id = None
    corpname = None
    corpus_encoding = None
    subcname = None
    subcpath = None
    num_lines = None
    collpage = None

def test_get_int_float_val(self):
    with self.assertRaises(ValueError):
        settings.get_int('global', 'weight')

def test_get_int_default(self):
    v = settings.get_int('global', 'zzz', 10)
    self.assertEqual(v, 10)

def min_cached_data_size(self):
    return settings.get_int('corpora', 'freqs_cache_min_lines', FreqCalc.DEFAULT_MIN_CACHED_FILE_ITEMS)

def _user_is_anonymous(self):
    return self._session_get('user', 'id') == settings.get_int('global', 'anonymous_user_id')

import pickle
import hashlib
import os
import time

import corplib
from conclib.search import get_conc
from bgcalc import freq_calc
import settings
from structures import FixedDict
from bgcalc import UnfinishedConcordanceError
from translation import ugettext as _
import bgcalc

TASK_TIME_LIMIT = settings.get_int('calc_backend', 'task_time_limit', 300)


class CollCalcArgs(FixedDict):
    """
    Collects all the required arguments passed around when
    calculating collocation profiles.
    """
    q = None
    user_id = None
    corpname = None
    corpus_encoding = None
    subcname = None
    subcpath = None
    num_lines = None
    collpage = None

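# As used in calculate_colls above, a CollCalcArgs instance is built by unpacking a plain
# dict. A minimal sketch with made-up field values (FixedDict is assumed to reject keys
# not declared on the class):
args_dict = dict(q=['q[word="test"]'], user_id=1, corpname='susanne',
                 corpus_encoding='UTF-8', subcname=None, subcpath=[],
                 num_lines=0, collpage=1)
coll_args = CollCalcArgs(**args_dict)
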
def export_with_norms(self, subcorpattrs='', ret_nums=True, subcnorm='tokens'):
    """
    Returns a text types table also containing information about
    the total occurrences of respective attribute values.

    See corplib.texttype_values for arguments and returned value
    """
    ans = {}
    if not subcorpattrs:
        subcorpattrs = self._corp.get_conf('SUBCORPATTRS')
        if not subcorpattrs:
            subcorpattrs = self._corp.get_conf('FULLREF')
    if not subcorpattrs or subcorpattrs == '#':
        raise TextTypesException(
            _('Missing display configuration of structural attributes (SUBCORPATTRS or FULLREF).'))
    corpus_info = plugins.runtime.CORPARCH.instance.get_corpus_info(
        self._plugin_api.user_lang, self._corpname)
    maxlistsize = settings.get_int('global', 'max_attr_list_size')
    # if 'live_attributes' is installed then always shrink bibliographical
    # entries even if their count is < maxlistsize
    subcorp_attr_list_tmp = re.split(r'\s*[,|]\s*', subcorpattrs)
    subcorp_attr_list = collections.OrderedDict(
        zip(subcorp_attr_list_tmp, [None] * len(subcorp_attr_list_tmp))).keys()
    subcorpattrs = '|'.join(subcorp_attr_list)
    if len(subcorp_attr_list_tmp) != len(subcorp_attr_list):
        logging.getLogger(__name__).warning('Duplicate SUBCORPATTRS item found')
    if plugins.runtime.LIVE_ATTRIBUTES.exists:
        ans['bib_attr'] = corpus_info['metadata']['label_attr']
        ans['id_attr'] = corpus_info['metadata']['id_attr']
        # We have to ensure that the bibliography item (which uses different values
        # for labels and different values for actual identifiers) is represented
        # as an input box on the client side. Passing list_none with a bib_attr element
        # to get_values()'s shrink_list ensures this.
        # Please see public/files/js/stores/textTypes/attrValues.ts for more information
        # on how the bibliography attr. box is handled on the client.
        list_none = (ans['bib_attr'], )
        tmp = [s for s in subcorp_attr_list]  # making a copy here
        if ans['bib_attr'] and ans['bib_attr'] not in tmp:  # if the bib attr is not in subcorpattrs,
            tmp.append(ans['bib_attr'])  # we add it there
            subcorpattrs = '|'.join(tmp)  # we ignore NoSkE '|' vs. ',' stuff deliberately here
    else:
        ans['bib_attr'] = None
        ans['id_attr'] = None
        list_none = ()
    tt = self._tt_cache.get_values(corp=self._corp, subcorpattrs=subcorpattrs, maxlistsize=maxlistsize,
                                   shrink_list=list_none, collator_locale=corpus_info.collator_locale)
    self._add_tt_custom_metadata(tt)
    if ret_nums:
        struct_calc = collections.OrderedDict()
        for item in subcorp_attr_list:
            k = item.split('.')[0]
            struct_calc[k] = CachedStructNormsCalc(self._corp, k, subcnorm,
                                                   db=plugins.runtime.DB.instance)
        for col in reduce(lambda p, c: p + c['Line'], tt, []):
            if 'textboxlength' not in col:
                structname, attrname = col['name'].split('.')
                for val in col['Values']:
                    v = struct_calc[structname].compute_norm(attrname, val['v'])
                    val['xcnt'] = v
        ans['Blocks'] = tt
        ans['Normslist'] = self._get_normslist(list(struct_calc.keys())[0])
    else:
        ans['Blocks'] = tt
        ans['Normslist'] = []
    return ans

def test_get_int_default_type_any(self):
    v = settings.get_int('global', 'zzz', '10')
    self.assertEqual(v, 10)

def test_get_int_non_parseable_str_val(self):
    with self.assertRaises(ValueError):
        settings.get_int('global', 'foo')

from controller import exposed
from controller.errors import FunctionNotSupported, UserActionException
from controller.kontext import AsyncTaskStatus
from controller.querying import Querying
from main_menu import MainMenu
from translation import ugettext as translate
import plugins
import l10n
from l10n import import_string
import corplib
from texttypes import TextTypeCollector, get_tt
import settings
import argmapping

TASK_TIME_LIMIT = settings.get_int('calc_backend', 'task_time_limit', 300)


class SubcorpusError(Exception):
    pass


class Subcorpus(Querying):

    def __init__(self, request, ui_lang):
        super(Subcorpus, self).__init__(request, ui_lang)

    def get_mapping_url_prefix(self):
        return '/subcorpus/'

    def prepare_subc_path(self, corpname, subcname, publish):

def test_get_int(self):
    v = settings.get_int('global', 'height')
    self.assertEqual(v, 1000)

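# Taken together, the get_int tests in this section pin down its contract: a missing key
# falls back to the default, a string default like '10' is coerced to int, and float-like
# ('1.5') or non-numeric ('foo') stored values raise ValueError. A behaviorally equivalent
# sketch (not the actual implementation, and assuming a plain settings.get() accessor
# that takes a default):
def get_int_sketch(section, key, default=None):
    value = settings.get(section, key, default)
    # int() coerces a string default such as '10' and raises ValueError
    # for values like '1.5' or 'foo'
    return int(value)
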
def fcs_search(self, corp, corpname, fcs_query, max_rec, start):
    """
    aux function for federated content search: operation=searchRetrieve
    """
    query = fcs_query.replace('+', ' ')  # convert URL spaces
    exact_match = True  # attr=".*value.*"
    if 'exact' in query.lower() and '=' not in query:  # lemma EXACT "dog"
        pos = query.lower().index('exact')  # first occurrence of EXACT
        query = query[:pos] + '=' + query[pos + 5:]  # replace the 1st 'exact' with '='
        exact_match = True
    attrs = corp.get_conf('ATTRLIST').split(',')  # list of available attrs
    rq = ''  # query for manatee
    try:  # parse query
        if '=' in query:  # lemma=word | lemma="word" | lemma="w1 w2" | word=""
            attr, term = query.split('=')
            attr = attr.strip()
            term = term.strip()
        else:  # "w1 w2" | "word" | word
            attr = 'word'
            # use one of the search attributes if present among the corpus attributes,
            # otherwise use `word` - fails below if not valid
            for sa in self.search_attrs:
                if sa in attrs:
                    attr = sa
                    break
            term = query.strip()
        if '"' in attr:
            raise Exception
        if '"' in term:  # "word" | "word1 word2" | "" | "it is \"good\""
            if term[0] != '"' or term[-1] != '"':  # check quotation marks
                raise Exception
            term = term[1:-1].strip()  # remove quotation marks
            if ' ' in term:  # multi-word term
                if exact_match:
                    rq = ' '.join('[%s="%s"]' % (attr, t) for t in term.split())
                else:
                    rq = ' '.join('[%s=".*%s.*"]' % (attr, t) for t in term.split())
            elif term.strip() == '':  # ""
                raise Exception  # empty term
            else:  # one-word term
                if exact_match:
                    rq = '[%s="%s"]' % (attr, term)
                else:
                    rq = '[%s=".*%s.*"]' % (attr, term)
        else:  # must be a single-word term
            if ' ' in term:
                raise Exception
            if exact_match:  # build query
                rq = '[%s="%s"]' % (attr, term)
            else:
                rq = '[%s=".*%s.*"]' % (attr, term)
    except:  # there was a problem when parsing
        raise Exception(10, query, 'Query syntax error')
    if attr not in attrs:
        raise Exception(16, attr, 'Unsupported index')
    fromp = int(math.floor((start - 1) / max_rec)) + 1
    # try to get the concordance
    try:
        anon_id = plugins.runtime.AUTH.instance.anonymous_user()['id']
        q = ['q' + rq]
        conc = conclib.get_conc(corp, anon_id, q=q, fromp=fromp, pagesize=max_rec * 2, async=0)
    except Exception as e:
        raise Exception(10, repr(e), 'Query syntax error')
    kwic = kwiclib.Kwic(corp, corpname, conc)
    kwic_args = kwiclib.KwicPageArgs(Args(), base_attr=Kontext.BASE_ATTR)
    kwic_args.fromp = fromp
    kwic_args.pagesize = max_rec * 2
    kwic_args.leftctx = '-{0}'.format(settings.get_int('fcs', 'kwic_context', 5))
    kwic_args.rightctx = '{0}'.format(settings.get_int('fcs', 'kwic_context', 5))
    page = kwic.kwicpage(kwic_args)  # convert the concordance
    local_offset = (start - 1) % max_rec
    if start > conc.size():
        raise Exception(61, 'startRecord', 'First record position out of range')
    rows = [(kwicline['Left'][0]['str'], kwicline['Kwic'][0]['str'],
             kwicline['Right'][0]['str'], kwicline['ref'])
            for kwicline in page['Lines']][local_offset:local_offset + max_rec]
    return rows, conc.size()

def _is_anonymous_id(user_id):
    return settings.get_int('global', 'anonymous_user_id') == user_id
