Example #1
def calculate_freqs(args):
    args = freq_calc.FreqCalsArgs(**args)
    calculate_freqs.cache_path = args.cache_path
    ans = freq_calc.calc_freqs_bg(args)
    trigger_cache_limit = settings.get_int('corpora', 'freqs_cache_min_lines', 10)
    if args.force_cache or max(len(d.get('Items', ())) for d in ans['freqs']) >= trigger_cache_limit:
        calculate_freqs.cache_data = ans
    else:
        calculate_freqs.cache_data = None
    return ans
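A note on the pattern above: the function stores its result and cache path as attributes on itself (calculate_freqs.cache_data, calculate_freqs.cache_path), presumably so that whatever wrapper invokes the task can persist the data afterwards. A minimal sketch of such a consumer, assuming exactly that contract (run_and_persist is a hypothetical name, not part of the project):

import pickle

def run_and_persist(task, args):
    # Hypothetical wrapper: run a task such as calculate_freqs and, if the
    # task flagged its result as cache-worthy, pickle it to the path the
    # task recorded on itself.
    ans = task(args)
    if getattr(task, 'cache_data', None) is not None:
        with open(task.cache_path, 'wb') as fw:
            pickle.dump(task.cache_data, fw)
    return ans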
Example #2
    def export_with_norms(self, subcorpattrs='', format_num=True, ret_nums=True, subcnorm='tokens'):
        """
        Returns a text types table which also contains information about
        the total number of occurrences of the respective attribute values.

        See corplib.texttype_values for arguments and returned value
        """
        ans = {}
        if not subcorpattrs:
            subcorpattrs = self._corp.get_conf('SUBCORPATTRS')
            if not subcorpattrs:
                subcorpattrs = self._corp.get_conf('FULLREF')
        if not subcorpattrs or subcorpattrs == '#':
            raise TextTypesException(
                _('Missing display configuration of structural attributes (SUBCORPATTRS or FULLREF).'))

        corpus_info = plugins.get('corparch').get_corpus_info(self._corpname)
        maxlistsize = settings.get_int('global', 'max_attr_list_size')
        # if the 'live_attributes' plug-in is installed, always shrink bibliographical
        # entries even if their count is < maxlistsize
        subcorp_attr_list = re.split(r'\s*[,|]\s*', subcorpattrs)

        if plugins.has_plugin('live_attributes'):
            ans['bib_attr'] = corpus_info['metadata']['label_attr']
            list_none = (ans['bib_attr'], )
            tmp = [s for s in subcorp_attr_list]  # making copy here
            if ans['bib_attr'] and ans['bib_attr'] not in tmp:  # if bib type is not in subcorpattrs
                tmp.append(ans['bib_attr'])                     # we add it there
                subcorpattrs = '|'.join(tmp)  # we ignore NoSkE '|' vs. ',' stuff deliberately here
        else:
            ans['bib_attr'] = None
            list_none = ()

        tt = self._tt_cache.get_values(corp=self._corp, subcorpattrs=subcorpattrs, maxlistsize=maxlistsize,
                                       shrink_list=list_none, collator_locale=corpus_info.collator_locale)
        self._add_tt_custom_metadata(tt)

        if ret_nums:
            struct_calc = collections.OrderedDict()
            for item in subcorp_attr_list:
                k = item.split('.')[0]
                struct_calc[k] = CachedStructNormsCalc(self._corp, k, subcnorm, db=plugins.get('db'))
            for col in reduce(lambda p, c: p + c['Line'], tt, []):
                if 'textboxlength' not in col:
                    structname, attrname = col['name'].split('.')
                    for val in col['Values']:
                        v = struct_calc[structname].compute_norm(attrname, val['v'])
                        val['xcnt'] = l10n.format_number(v) if format_num else v
            ans['Blocks'] = tt
            ans['Normslist'] = self._get_normslist(list(struct_calc.keys())[0])
        else:
            ans['Blocks'] = tt
            ans['Normslist'] = []
        return ans
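One detail worth spelling out: the reduce(lambda p, c: p + c['Line'], tt, []) call merely flattens the per-block 'Line' lists into a single list of columns. An equivalent, arguably clearer spelling (note that reduce itself must be imported from functools on Python 3):

from itertools import chain

cols = list(chain.from_iterable(block['Line'] for block in tt))
# equivalent to: reduce(lambda p, c: p + c['Line'], tt, [])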
Example #3
 def text_types(self):
     ans = {}
     maxlistsize = settings.get_int('global', 'max_attr_list_size')
     subcorpattrs = self.current_corpus.get_conf('SUBCORPATTRS')
     if not subcorpattrs:
         subcorpattrs = self.current_corpus.get_conf('FULLREF')
     tt = get_tt(self.current_corpus, self).export(subcorpattrs, maxlistsize)
     for item in tt:
         for tt2 in item['Line']:
             ans[tt2['name']] = {'type': 'default', 'values': [x['v']
                                                               for x in tt2.get('Values', [])]}
     return ans
Example #4
def calculate_colls(coll_args):
    """
    arguments:
    coll_args -- dict-serialized coll_calc.CollCalcArgs
    """
    coll_args = coll_calc.CollCalcArgs(**coll_args)
    calculate_colls.cache_path = coll_args.cache_path
    ans = coll_calc.calculate_colls_bg(coll_args)
    trigger_cache_limit = settings.get_int('corpora', 'colls_cache_min_lines', 10)
    if not ans['processing'] and len(ans['data']['Items']) >= trigger_cache_limit:
        calculate_colls.cache_data = ans['data']
    else:
        calculate_colls.cache_data = None
    return ans
Example #5
def smtp_factory():
    """
    Create a new SMTP connection configured from the 'mailing' settings section.
    :return: a connected (and, if credentials are set, authenticated) smtplib.SMTP instance
    """
    username = settings.get('mailing', 'auth_username')
    password = settings.get('mailing', 'auth_password')
    port = settings.get_int('mailing', 'smtp_port', 25)
    use_tls = settings.get_bool('mailing', 'use_tls', False)
    server = smtplib.SMTP(settings.get('mailing', 'smtp_server'), port=port)
    if use_tls:
        server.starttls()
    if username and password:
        server.login(username, password)
    return server
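A hedged usage sketch for smtp_factory(): composing and sending a plain-text message with the standard library's email package. The ('mailing', 'sender') settings key is an assumption for illustration; it does not appear in the examples above.

from email.mime.text import MIMEText

def send_notification(recipient, subject, body):
    # Build a simple plain-text message and send it through the factory-made
    # connection; the sender address key is hypothetical.
    msg = MIMEText(body, 'plain', 'utf-8')
    msg['Subject'] = subject
    msg['From'] = settings.get('mailing', 'sender')
    msg['To'] = recipient
    server = smtp_factory()
    try:
        server.sendmail(msg['From'], [recipient], msg.as_string())
    finally:
        server.quit()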
Example #6
def calculate_colls_mp(coll_args):
    """
    Background calculation of collocations
    using 'multiprocessing' package.
    """
    import multiprocessing

    def cache_results(cache_path, data):
        with open(cache_path, 'wb') as f:
            pickle.dump(data, f)

    ans = calculate_colls_bg(coll_args)
    if len(ans['Items']) >= settings.get_int('corpora', 'colls_cache_min_lines', 10):  # cache only if it's worth it
        multiprocessing.Process(target=cache_results, args=(coll_args.cache_path, ans,)).start()
    return ans
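The cache written by cache_results() is presumably read back on later requests. A minimal counterpart sketch under that assumption (load_cached_colls is a hypothetical name):

import os
import pickle

def load_cached_colls(cache_path):
    # Return the previously pickled collocation data, or None when no
    # cache file exists yet.
    if not os.path.exists(cache_path):
        return None
    with open(cache_path, 'rb') as f:
        return pickle.load(f)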
Example #7
def clean_colls_cache():
    root_dir = settings.get('corpora', 'colls_cache_dir')
    cache_ttl = settings.get_int('corpora', 'colls_cache_ttl', 3600)
    test_time = time.time()
    all_files = os.listdir(root_dir)
    num_removed = 0
    num_error = 0
    for item in all_files:
        file_path = os.path.join(root_dir, item)
        if test_time - os.path.getmtime(file_path) >= cache_ttl:
            try:
                os.unlink(file_path)
                num_removed += 1
            except OSError:
                num_error += 1
    return dict(total_files=len(all_files), num_removed=num_removed, num_error=num_error)
Example #8
def clean_freqs_cache():
    root_dir = settings.get('corpora', 'freqs_cache_dir')
    cache_ttl = settings.get_int('corpora', 'freqs_cache_ttl', 3600)
    test_time = time.time()
    all_files = os.listdir(root_dir)
    num_removed = 0
    num_error = 0
    for item in all_files:
        file_path = os.path.join(root_dir, item)
        if test_time - os.path.getmtime(file_path) >= cache_ttl:
            try:
                os.unlink(file_path)
                num_removed += 1
            except OSError:
                num_error += 1
    return dict(total_files=len(all_files), num_removed=num_removed, num_error=num_error)
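clean_colls_cache() and clean_freqs_cache() above are identical except for the ('corpora', ...) keys they read, so the TTL sweep itself is a natural candidate for extraction. A refactor sketch, not the project's actual code:

import os
import time

def clean_cache_dir(root_dir, cache_ttl):
    # Remove every file in root_dir whose mtime is older than cache_ttl
    # seconds; report totals the same way the two functions above do.
    test_time = time.time()
    all_files = os.listdir(root_dir)
    num_removed = 0
    num_error = 0
    for item in all_files:
        file_path = os.path.join(root_dir, item)
        if test_time - os.path.getmtime(file_path) >= cache_ttl:
            try:
                os.unlink(file_path)
                num_removed += 1
            except OSError:
                num_error += 1
    return dict(total_files=len(all_files), num_removed=num_removed, num_error=num_error)

def clean_freqs_cache():
    return clean_cache_dir(settings.get('corpora', 'freqs_cache_dir'),
                           settings.get_int('corpora', 'freqs_cache_ttl', 3600))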
Example #9
    def fcs_search(self, corp: KCorpus, corpname, fcs_query, max_rec, start):
        """
        Auxiliary function for federated content search: operation=searchRetrieve
        """
        query = fcs_query.replace('+', ' ')  # convert URL spaces
        exact_match = True  # if False, the query matches attr=".*value.*"
        if 'exact' in query.lower() and '=' not in query:  # lemma EXACT "dog"
            pos = query.lower().index('exact')  # first occurrence of EXACT
            query = query[:pos] + '=' + query[pos + 5:]  # replace the first EXACT with '='
            exact_match = True

        attrs = corp.get_posattrs()  # list of available attrs
        try:  # parse query
            if '=' in query:  # lemma=word | lemma="word" | lemma="w1 w2" | word=""
                attr, term = query.split('=')
                attr = attr.strip()
                term = term.strip()
            else:  # "w1 w2" | "word" | word
                attr = 'word'
                # use one of search attributes if in corpora attributes
                # otherwise use `word` - fails below if not valid
                for sa in self.search_attrs:
                    if sa in attrs:
                        attr = sa
                        break
                term = query.strip()
            if '"' in attr:
                raise Exception
            if '"' in term:  # "word" | "word1 word2" | "" | "it is \"good\""
                if term[0] != '"' or term[-1] != '"':  # check q. marks
                    raise Exception
                term = term[1:-1].strip()  # remove quotation marks
                if ' ' in term:  # multi-word term
                    if exact_match:
                        rq = ' '.join(
                            ['[%s="%s"]' % (attr, t) for t in term.split()])
                    else:
                        rq = ' '.join([
                            '[%s=".*%s.*"]' % (attr, t) for t in term.split()
                        ])
                elif term.strip() == '':  # ""
                    raise Exception  # empty term
                else:  # one-word term
                    if exact_match:
                        rq = '[%s="%s"]' % (attr, term)
                    else:
                        rq = '[%s=".*%s.*"]' % (attr, term)
            else:  # must be single-word term
                if ' ' in term:
                    raise Exception
                if exact_match:  # build query
                    rq = '[%s="%s"]' % (attr, term)
                else:
                    rq = '[%s=".*%s.*"]' % (attr, term)
        except Exception:  # there was a problem when parsing
            raise Exception(10, query, 'Query syntax error')
        if attr not in attrs:
            raise Exception(16, attr, 'Unsupported index')

        fromp = int(math.floor((start - 1) / max_rec)) + 1
        # try to get concordance
        try:
            anon_id = plugins.runtime.AUTH.instance.anonymous_user()['id']
            q = ['q' + rq]
            conc = get_conc(corp,
                            anon_id,
                            q=q,
                            fromp=fromp,
                            pagesize=max_rec,
                            asnc=0)
        except Exception as e:
            raise Exception(10, repr(e), 'Query syntax error')

        kwic = kwiclib.Kwic(corp, corpname, conc)
        kwic_args = kwiclib.KwicPageArgs({'structs': ''},
                                         base_attr=Kontext.BASE_ATTR)
        kwic_args.fromp = fromp
        kwic_args.pagesize = max_rec
        kwic_args.leftctx = '-{0}'.format(
            settings.get_int('fcs', 'kwic_context', 5))
        kwic_args.rightctx = '{0}'.format(
            settings.get_int('fcs', 'kwic_context', 5))
        page = kwic.kwicpage(kwic_args)  # convert concordance

        local_offset = (start - 1) % max_rec
        if start - 1 > conc.size():
            raise Exception(61, 'startRecord',
                            'First record position out of range')
        rows = [(kwicline['Left'][0]['str'], kwicline['Kwic'][0]['str'],
                 kwicline['Right'][0]['str'], kwicline['ref'])
                for kwicline in page['Lines']
                ][local_offset:local_offset + max_rec]
        return rows, conc.size()
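The core of the parser above is the step that turns a validated (attr, term, exact_match) triple into a CQL expression, one bracketed position per whitespace-separated token. Pulled out as a standalone sketch (build_cql is a hypothetical name):

def build_cql(attr, term, exact_match):
    # One [...] position per token; non-exact matching wraps each token
    # in '.*' exactly as the branches above do.
    if exact_match:
        return ' '.join('[%s="%s"]' % (attr, t) for t in term.split())
    return ' '.join('[%s=".*%s.*"]' % (attr, t) for t in term.split())

# build_cql('lemma', 'dog', True)    -> '[lemma="dog"]'
# build_cql('word', 'red car', True) -> '[word="red"] [word="car"]'
# build_cql('word', 'car', False)    -> '[word=".*car.*"]'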
Example #10
    def export_with_norms(self, subcorpattrs='', ret_nums=True, subcnorm='tokens'):
        """
        Returns a text types table which also contains information about
        the total number of occurrences of the respective attribute values.

        See corplib.texttype_values for arguments and returned value
        """
        ans = {}
        if not subcorpattrs:
            subcorpattrs = self._corp.get_conf('SUBCORPATTRS')
            if not subcorpattrs:
                subcorpattrs = self._corp.get_conf('FULLREF')
        if not subcorpattrs or subcorpattrs == '#':
            raise TextTypesException(
                _('Missing display configuration of structural attributes (SUBCORPATTRS or FULLREF).'))

        corpus_info = plugins.runtime.CORPARCH.instance.get_corpus_info(
            self._plugin_api.user_lang, self._corpname)
        maxlistsize = settings.get_int('global', 'max_attr_list_size')
        # if the 'live_attributes' plug-in is installed, always shrink bibliographical
        # entries even if their count is < maxlistsize
        subcorp_attr_list_tmp = re.split(r'\s*[,|]\s*', subcorpattrs)
        subcorp_attr_list = collections.OrderedDict(
            zip(subcorp_attr_list_tmp, [None] * len(subcorp_attr_list_tmp))).keys()

        subcorpattrs = '|'.join(subcorp_attr_list)
        if len(subcorp_attr_list_tmp) != len(subcorp_attr_list):
            logging.getLogger(__name__).warning('Duplicate SUBCORPATTRS item found')

        if plugins.runtime.LIVE_ATTRIBUTES.exists:
            ans['bib_attr'] = corpus_info['metadata']['label_attr']
            ans['id_attr'] = corpus_info['metadata']['id_attr']
            # We have to ensure that the bibliography item (which uses different values
            # for labels and different values for actual identifiers) is represented
            # as an input box on client-side. Passing list_none with bib_attr element
            # to get_values()'s shrink_list ensures this.
            # Please see public/files/js/stores/textTypes/attrValues.ts for more information
            # on how the bibliography attribute box is handled on the client.
            list_none = (ans['bib_attr'], )
            tmp = [s for s in subcorp_attr_list]  # making copy here
            if ans['bib_attr'] and ans['bib_attr'] not in tmp:  # if bib type is not in subcorpattrs
                tmp.append(ans['bib_attr'])                     # we add it there
                subcorpattrs = '|'.join(tmp)  # we ignore NoSkE '|' vs. ',' stuff deliberately here
        else:
            ans['bib_attr'] = None
            ans['id_attr'] = None
            list_none = ()
        tt = self._tt_cache.get_values(corp=self._corp, subcorpattrs=subcorpattrs, maxlistsize=maxlistsize,
                                       shrink_list=list_none, collator_locale=corpus_info.collator_locale)
        self._add_tt_custom_metadata(tt)

        if ret_nums:
            struct_calc = collections.OrderedDict()
            for item in subcorp_attr_list:
                k = item.split('.')[0]
                struct_calc[k] = CachedStructNormsCalc(self._corp, k, subcnorm, self._tt_cache)
            cache_ok = True
            for col in reduce(lambda p, c: p + c['Line'], tt, []):
                if 'textboxlength' not in col:
                    structname, attrname = col['name'].split('.')
                    for val in col['Values']:
                        try:
                            v = struct_calc[structname].compute_norm(attrname, val['v'])
                        except KeyError:
                            v = 0  # no problem here as the value is actually not required by subcorpattrs
                            cache_ok = False
                        val['xcnt'] = v
            if not cache_ok:
                self._tt_cache.clear(self._corp)
                logging.getLogger(__name__).warning(
                    'Removed invalid tt cache entry for corpus {0}'.format(self._corpname))
            ans['Blocks'] = tt
            ans['Normslist'] = self._get_normslist(list(struct_calc.keys())[0])
        else:
            ans['Blocks'] = tt
            ans['Normslist'] = []
        return ans
Example #11
try:
    import cPickle as pickle
except ImportError:
    import pickle
import hashlib
import os
import time

import corplib
import conclib
from bgcalc import freq_calc
from l10n import import_string
import settings
from structures import FixedDict
from bgcalc import UnfinishedConcordanceError
from translation import ugettext as _

TASK_TIME_LIMIT = settings.get_int('global', 'calc_backend_time_limit', 300)


class CollCalcArgs(FixedDict):
    """
    Collects all the required arguments passed around when
    calculating collocation profiles.
    """
    q = None
    user_id = None
    corpname = None
    corpus_encoding = None
    subcname = None
    subcpath = None
    num_lines = None
    collpage = None
Example #12
 def test_get_int_float_val(self):
     with self.assertRaises(ValueError):
         settings.get_int('global', 'weight')
Example #13
 def test_get_int_default(self):
     v = settings.get_int('global', 'zzz', 10)
     self.assertEqual(v, 10)
Example #14
 def min_cached_data_size(self):
     return settings.get_int('corpora', 'freqs_cache_min_lines', FreqCalc.DEFAULT_MIN_CACHED_FILE_ITEMS)
Example #15
 def _user_is_anonymous(self):
     return self._session_get('user', 'id') == settings.get_int('global', 'anonymous_user_id')
Example #16
import pickle
import hashlib
import os
import time

import corplib
from conclib.search import get_conc
from bgcalc import freq_calc
import settings
from structures import FixedDict
from bgcalc import UnfinishedConcordanceError
from translation import ugettext as _
import bgcalc

TASK_TIME_LIMIT = settings.get_int('calc_backend', 'task_time_limit', 300)


class CollCalcArgs(FixedDict):
    """
    Collects all the required arguments passed around when
    calculating collocation profiles.
    """
    q = None
    user_id = None
    corpname = None
    corpus_encoding = None
    subcname = None
    subcpath = None
    num_lines = None
    collpage = None
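FixedDict is imported from the project's structures module; these examples only reveal that subclasses declare their permitted fields as class attributes and are instantiated with keyword arguments (e.g. CollCalcArgs(**coll_args)). A guess at the minimal contract, explicitly an assumption rather than the real implementation:

class FixedDict(object):
    # Assumed behaviour only: accept keyword arguments that match declared
    # class-level attribute names and reject anything undeclared. The real
    # structures.FixedDict may differ.
    def __init__(self, **kwargs):
        for k, v in kwargs.items():
            if not hasattr(self.__class__, k):
                raise AttributeError('Unknown attribute: %s' % k)
            setattr(self, k, v)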
Example #17
    def export_with_norms(self, subcorpattrs='', ret_nums=True, subcnorm='tokens'):
        """
        Returns a text types table which also contains information about
        the total number of occurrences of the respective attribute values.

        See corplib.texttype_values for arguments and returned value
        """
        ans = {}
        if not subcorpattrs:
            subcorpattrs = self._corp.get_conf('SUBCORPATTRS')
            if not subcorpattrs:
                subcorpattrs = self._corp.get_conf('FULLREF')
        if not subcorpattrs or subcorpattrs == '#':
            raise TextTypesException(
                _('Missing display configuration of structural attributes (SUBCORPATTRS or FULLREF).'))

        corpus_info = plugins.runtime.CORPARCH.instance.get_corpus_info(self._plugin_api.user_lang, self._corpname)
        maxlistsize = settings.get_int('global', 'max_attr_list_size')
        # if the 'live_attributes' plug-in is installed, always shrink bibliographical
        # entries even if their count is < maxlistsize
        subcorp_attr_list_tmp = re.split(r'\s*[,|]\s*', subcorpattrs)
        subcorp_attr_list = collections.OrderedDict(zip(subcorp_attr_list_tmp, [None]*len(subcorp_attr_list_tmp))).keys()
        subcorpattrs = '|'.join(subcorp_attr_list)
        if len(subcorp_attr_list_tmp) != len(subcorp_attr_list):
            logging.getLogger(__name__).warning('Duplicate SUBCORPATTRS item found')

        if plugins.runtime.LIVE_ATTRIBUTES.exists:
            ans['bib_attr'] = corpus_info['metadata']['label_attr']
            ans['id_attr'] = corpus_info['metadata']['id_attr']

            # We have to ensure that the bibliography item (which uses different values
            # for labels and different values for actual identifiers) is represented
            # as an input box on client-side. Passing list_none with bib_attr element
            # to get_values()'s shrink_list ensures this.
            # Please see public/files/js/stores/textTypes/attrValues.ts for more information
            # on how the bibliography attribute box is handled on the client.
            list_none = (ans['bib_attr'], )
            tmp = [s for s in subcorp_attr_list]  # making copy here
            if ans['bib_attr'] and ans['bib_attr'] not in tmp:  # if bib type is not in subcorpattrs
                tmp.append(ans['bib_attr'])                     # we add it there
                subcorpattrs = '|'.join(tmp)  # we ignore NoSkE '|' vs. ',' stuff deliberately here
        else:
            ans['bib_attr'] = None
            ans['id_attr'] = None
            list_none = ()
        tt = self._tt_cache.get_values(corp=self._corp, subcorpattrs=subcorpattrs, maxlistsize=maxlistsize,
                                       shrink_list=list_none, collator_locale=corpus_info.collator_locale)
        self._add_tt_custom_metadata(tt)

        if ret_nums:
            struct_calc = collections.OrderedDict()
            for item in subcorp_attr_list:
                k = item.split('.')[0]
                struct_calc[k] = CachedStructNormsCalc(self._corp, k, subcnorm, db=plugins.runtime.DB.instance)
            for col in reduce(lambda p, c: p + c['Line'], tt, []):
                if 'textboxlength' not in col:
                    structname, attrname = col['name'].split('.')
                    for val in col['Values']:
                        v = struct_calc[structname].compute_norm(attrname, val['v'])
                        val['xcnt'] = v
            ans['Blocks'] = tt
            ans['Normslist'] = self._get_normslist(list(struct_calc.keys())[0])
        else:
            ans['Blocks'] = tt
            ans['Normslist'] = []
        return ans
Example #18
 def test_get_int_default_type_any(self):
     v = settings.get_int('global', 'zzz', '10')
     self.assertEqual(v, 10)
Example #19
 def test_get_int_non_parseable_str_val(self):
     with self.assertRaises(ValueError):
         settings.get_int('global', 'foo')
Example #20
from controller import exposed
from controller.errors import FunctionNotSupported, UserActionException
from controller.kontext import AsyncTaskStatus
from controller.querying import Querying
from main_menu import MainMenu
from translation import ugettext as translate
import plugins
import l10n
from l10n import import_string
import corplib
from texttypes import TextTypeCollector, get_tt
import settings
import argmapping

TASK_TIME_LIMIT = settings.get_int('calc_backend', 'task_time_limit', 300)


class SubcorpusError(Exception):
    pass


class Subcorpus(Querying):

    def __init__(self, request, ui_lang):
        super(Subcorpus, self).__init__(request, ui_lang)

    def get_mapping_url_prefix(self):
        return '/subcorpus/'

    def prepare_subc_path(self, corpname, subcname, publish):
Example #21
 def test_get_int(self):
     v = settings.get_int('global', 'height')
     self.assertEqual(v, 1000)
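Taken together, the settings tests above pin down the contract of settings.get_int: values come from a textual configuration, so they are parsed with int() (raising ValueError for float-like strings such as the 'weight' value, or for non-numeric ones like 'foo'), and a missing key falls back to the default, which is itself coerced ('10' becomes 10). A behavioural sketch against a plain dict standing in for the real configuration store (the extra conf parameter is an artifact of the sketch, not part of the real API):

_UNDEFINED = object()

def get_int(conf, section, key, default=_UNDEFINED):
    # conf is a {(section, key): str} mapping used here instead of the
    # real settings backend.
    val = conf.get((section, key), _UNDEFINED)
    if val is _UNDEFINED:
        if default is _UNDEFINED:
            # assumption: the missing-key/no-default case is not covered
            # by the tests above
            raise ValueError('missing value and no default for %s.%s' % (section, key))
        return int(default)  # defaults are coerced too: '10' -> 10
    return int(val)  # '1.4' or 'foo' raise ValueError here

# get_int({('global', 'height'): '1000'}, 'global', 'height')  -> 1000
# get_int({}, 'global', 'zzz', '10')                           -> 10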
Example #22
    def fcs_search(self, corp, corpname, fcs_query, max_rec, start):
        """
        Auxiliary function for federated content search: operation=searchRetrieve
        """
        query = fcs_query.replace('+', ' ')  # convert URL spaces
        exact_match = True  # if False, the query matches attr=".*value.*"
        if 'exact' in query.lower() and '=' not in query:  # lemma EXACT "dog"
            pos = query.lower().index('exact')  # first occurrence of EXACT
            query = query[:pos] + '=' + query[pos + 5:]  # replace the first EXACT with '='
            exact_match = True

        attrs = corp.get_conf('ATTRLIST').split(',')  # list of available attrs
        rq = ''  # query for manatee
        try:  # parse query
            if '=' in query:  # lemma=word | lemma="word" | lemma="w1 w2" | word=""
                attr, term = query.split('=')
                attr = attr.strip()
                term = term.strip()
            else:  # "w1 w2" | "word" | word
                attr = 'word'
                # use one of search attributes if in corpora attributes
                # otherwise use `word` - fails below if not valid
                for sa in self.search_attrs:
                    if sa in attrs:
                        attr = sa
                        break
                term = query.strip()
            if '"' in attr:
                raise Exception
            if '"' in term:  # "word" | "word1 word2" | "" | "it is \"good\""
                if term[0] != '"' or term[-1] != '"':  # check q. marks
                    raise Exception
                term = term[1:-1].strip()  # remove quotation marks
                if ' ' in term:  # multi-word term
                    if exact_match:
                        rq = ' '.join(['[%s="%s"]' % (attr, t)
                                       for t in term.split()])
                    else:
                        rq = ' '.join(['[%s=".*%s.*"]' % (attr, t)
                                       for t in term.split()])
                elif term.strip() == '':  # ""
                    raise Exception  # empty term
                else:  # one-word term
                    if exact_match:
                        rq = '[%s="%s"]' % (attr, term)
                    else:
                        rq = '[%s=".*%s.*"]' % (attr, term)
            else:  # must be single-word term
                if ' ' in term:
                    raise Exception
                if exact_match:  # build query
                    rq = '[%s="%s"]' % (attr, term)
                else:
                    rq = '[%s=".*%s.*"]' % (attr, term)
        except Exception:  # there was a problem when parsing
            raise Exception(10, query, 'Query syntax error')
        if attr not in attrs:
            raise Exception(16, attr, 'Unsupported index')

        fromp = int(math.floor((start - 1) / max_rec)) + 1
        # try to get concordance
        try:
            anon_id = plugins.runtime.AUTH.instance.anonymous_user()['id']
            q = ['q' + rq]
            conc = conclib.get_conc(corp, anon_id, q=q, fromp=fromp, pagesize=max_rec * 2, async=0)
        except Exception as e:
            raise Exception(10, repr(e), 'Query syntax error')

        kwic = kwiclib.Kwic(corp, corpname, conc)
        kwic_args = kwiclib.KwicPageArgs(Args(), base_attr=Kontext.BASE_ATTR)
        kwic_args.fromp = fromp
        kwic_args.pagesize = max_rec * 2
        kwic_args.leftctx = '-{0}'.format(settings.get_int('fcs', 'kwic_context', 5))
        kwic_args.rightctx = '{0}'.format(settings.get_int('fcs', 'kwic_context', 5))
        page = kwic.kwicpage(kwic_args)  # convert concordance

        local_offset = (start - 1) % max_rec
        if start > conc.size():
            raise Exception(61, 'startRecord', 'First record position out of range')
        rows = [
            (
                kwicline['Left'][0]['str'],
                kwicline['Kwic'][0]['str'],
                kwicline['Right'][0]['str'],
                kwicline['ref']
            )
            for kwicline in page['Lines']
        ][local_offset:local_offset + max_rec]
        return rows, conc.size()
Example #23
 def _is_anonymous_id(user_id):
     return settings.get_int('global', 'anonymous_user_id') == user_id