def matching_structattr(
        corp: manatee.Corpus, struct: str, attr: str, val: str,
        search_attr: str) -> Tuple[List[str], int, int]:
    """
    Return values of [search_attr] found in structures where the structural
    attribute [struct].[attr] equals [val].

    Returns:
        a 3-tuple (sorted unique values of search_attr,
                   total concordance size,
                   number of lines actually scanned - capped by an
                   internal limit of 1,000,000)
    """
    size_limit = 1000000  # hard cap on the number of scanned concordance lines
    try:
        ans = set()
        query = '<{struct} {attr}="{attr_val}">[]'.format(struct=struct, attr=attr, attr_val=val)
        conc = manatee.Concordance(corp, query, 0, -1)
        conc.sync()
        size = conc.size()
        kw = manatee.KWICLines(
            corp, conc.RS(True, 0, size_limit), '-1', '1', 'word', '', '',
            '={}.{}'.format(struct, search_attr))
        while kw.nextline():
            refs = kw.get_ref_list()
            if len(refs) > 0:
                ans.add(refs[0])
        return sorted(ans), size, min(size, size_limit)
    except RuntimeError as ex:
        # Manatee reports an unknown attribute via a RuntimeError message;
        # treat that as "no matches" rather than a hard failure.
        if 'AttrNotFound' in str(ex):
            return [], 0, 0
        raise  # bare re-raise preserves the original traceback
def _load_raw_sent(self, corpus, corpus_id, token_id, kwic_len, tree_attrs):
    """
    Retrieve a sentence via Manatee

    Args:
        corpus (manatee.Corpus): a corpus instance
        corpus_id (str): corpus ID
        token_id (int): token number/id
        kwic_len (int): number of tokens in KWIC
        tree_attrs (list of str): a list of positional attributes required
            by tree nodes/edges

    Returns (dict):
        data: a list of strings (Manatee raw format)
        kwic_pos: a tuple (first_kwic_idx, kwic_length)
        or None if the concordance yields no line
    """
    encoding = corpus.get_conf('ENCODING')
    sentence_struct = self._conf.get_sentence_struct(corpus_id)
    conc = manatee.Concordance(
        corpus,
        ' '.join('[#%d]' % k for k in range(token_id, token_id + kwic_len)),
        1, -1)
    conc.sync()
    kl = manatee.KWICLines(
        corpus, conc.RS(True, 0, 1),
        '-1:%s' % sentence_struct, '1:%s' % sentence_struct,
        ','.join(tree_attrs), ','.join(tree_attrs), '', '')
    if kl.nextline():
        left_tk = kl.get_left()
        kwic_tk = kl.get_kwic()
        # NOTE(review): the raw stream apparently carries 4 items per token
        # (hence the division by 4 to obtain token counts) - confirm against
        # the Manatee KWICLines format. Integer division ('//') is required
        # here: under Python 3, '/' would make kwic_pos a pair of floats.
        return dict(
            data=[import_string(s, from_encoding=encoding)
                  for s in left_tk + kwic_tk + kl.get_right()],
            kwic_pos=(len(left_tk) // 4, len(kwic_tk) // 4))
def get_cached_conc_sizes(self, corp, q=None, cachefile=None):
    """
    arguments:
    corp -- manatee.Corpus instance
    q -- a list containing preprocessed query
    cachefile -- if not provided then the path is determined automatically
    using CACHE_ROOT_DIR and corpus name, corpus name and the query

    returns:
    a dictionary {
        finished : 0/1,
        concsize : int,
        fullsize : int,
        relconcsize : float (concordance size recalculated to a million corpus),
        arf : ARF of the result (this is calculated only for the finished
              result, i.e. no intermediate values)
    }

    raises:
    ConcCalculationStatusException if the cached calculation reported an error
    """
    import struct

    if q is None:
        q = []
    ans = dict(finished=False, concsize=None, fullsize=None, relconcsize=None)
    if not cachefile:  # AJAX call
        q = tuple(q)
        subchash = getattr(corp, 'subchash', None)
        cache_map = self._cache_factory.get_mapping(corp)
        cachefile = cache_map.cache_file_path(subchash, q)
        status = cache_map.get_calc_status(subchash, q)
        if status.error is not None:
            raise ConcCalculationStatusException(
                'Concordance calculation failed', status.error)

    if cachefile and os.path.isfile(cachefile):
        # binary cache header: a 'finished' flag byte at offset 15 followed
        # by an int64 full size; an int32 concordance size sits at offset 32
        # (layout inferred from the seeks below - confirm against the writer).
        # 'with' guarantees the file handle is closed (the original leaked it).
        with open(cachefile, 'rb') as cache:
            cache.seek(15)
            finished = bool(ord(cache.read(1)))
            (fullsize, ) = struct.unpack('q', cache.read(8))
            cache.seek(32)
            (concsize, ) = struct.unpack('i', cache.read(4))
        if fullsize > 0:
            relconcsize = 1000000.0 * fullsize / corp.search_size()
        else:
            relconcsize = 1000000.0 * concsize / corp.search_size()
        if finished and not is_subcorpus(corp):
            # ARF is expensive - compute only for a finished, non-subcorpus result
            conc = manatee.Concordance(corp, cachefile)
            result_arf = round(conc.compute_ARF(), 2)
        else:
            result_arf = None
        ans['finished'] = finished
        ans['concsize'] = concsize
        ans['fullsize'] = fullsize
        ans['relconcsize'] = relconcsize
        ans['arf'] = result_arf
    return ans
def find_struct_begin(corp, alignment, sentence_attr, struct_name, struct_idx):
    """
    Build a concordance matching the structure <struct_name #struct_idx>
    and collect its references via _find_refs. A non-unique match is
    reported on stdout but processing continues regardless.
    """
    query = '<{0} #{1}>[]'.format(struct_name, struct_idx)
    concordance = manatee.Concordance(corp, query, 0, -1)
    concordance.sync()
    if concordance.size() != 1:
        print('ERROR: <{0} #{1}> not found'.format(struct_name, struct_idx))
    _find_refs(concordance, sentence_attr, alignment, struct_idx)
    return None
def _load_raw_sent(self, corpus, canonical_corpus_id, token_id, tree_attrs):
    """
    Fetch the sentence containing the given token via Manatee and return
    its raw items decoded to strings; returns None when no line is found.
    """
    enc = corpus.get_conf('ENCODING')
    sent_struct = self._conf.get_sentence_struct(canonical_corpus_id)
    attr_spec = ','.join(tree_attrs)
    conc = manatee.Concordance(corpus, '[#%d]' % token_id, 1, -1)
    conc.sync()
    lines = manatee.KWICLines(
        corpus, conc.RS(True, 0, 1),
        '-1:%s' % sent_struct, '1:%s' % sent_struct,
        attr_spec, attr_spec, '', '')
    if not lines.nextline():
        return None
    items = lines.get_left() + lines.get_kwic() + lines.get_right()
    return [import_string(item, from_encoding=enc) for item in items]
def generate_kwiclines(self, query, corpus):
    """
    Extract all unique tag values matching the provided query.

    Parameters
    ----------
    query : str
        a query to be used to extract all tag values
    corpus : manatee.Corpus
        a corpus instance (passed straight to manatee.Concordance;
        the original docstring wrongly claimed a corpus *name*)

    Returns
    -------
    list
        a sorted list of all unique tag values as found in the corpus
    """
    conc = manatee.Concordance(corpus, query, 0)
    kw = manatee.KWICLines(conc, '-1#', '1#', 'tag', 'tag', '', '#', 0)
    ans = set()
    for i in range(conc.size()):
        kw.nextline(i)
        ans.add(kw.get_kwic()[0].strip())
    # a set is already deduplicated and sorted() takes it directly;
    # the former sorted(tuple(ans)) was a redundant copy
    return sorted(ans)
def add_structattr_support(corp: KCorpus, attrs, token_id):
    """
    A decorator function which turns 'fetch_posattr' into a more general
    function which is able to load structural attributes too. The load is
    performed only once for all possible structural attributes.
    """
    data = {}
    # structural attributes are the dotted names (e.g. 'doc.title')
    refs = [x for x in attrs if '.' in x]
    refs_mapping = {}
    for n in refs:
        if n:
            # Manatee reports refs under their LABEL (if defined), so map
            # labels back to the original attribute names
            lab = corp.get_conf(f'{n}.LABEL')
            refs_mapping[lab if lab else n] = n
    if len(refs) > 0:
        conc = manatee.Concordance(corp.unwrap(), '[#{}]'.format(int(token_id)), 1, -1)
        conc.sync()
        rs = conc.RS(True, 0, 0)
        kl = manatee.KWICLines(corp.unwrap(), rs, '-1', '1', 'word', '', '', ','.join(refs))
        if kl.nextline():
            refs_str = kl.get_refs()
            for kv in refs_str.split(','):
                if '=' in kv:
                    # maxsplit=1 keeps attribute values containing '=' intact
                    # (a plain split('=') raised ValueError on such values)
                    k, v = kv.split('=', 1)
                    k = refs_mapping.get(k)
                    if k is not None:  # skip refs we did not ask for
                        data[k] = v

    def decorator(fn):
        def wrapper(corp, attr, token_id, num_tokens):
            # dotted names are served from the pre-loaded structattr data
            if '.' in attr:
                return data[attr]
            return fn(corp, attr, token_id, num_tokens)
        return wrapper
    return decorator
def get_cached_conc_sizes(self, corp: KCorpus, q: Tuple[str, ...] = None) -> Dict[str, Any]:
    """
    arguments:
    corp --
    q -- a list containing preprocessed query
    using CACHE_ROOT_DIR and corpus name, corpus name and the query

    returns:
    a dictionary {
        finished : 0/1,
        concsize : int,
        fullsize : int,
        relconcsize : float (concordance size recalculated to a million corpus),
        arf : ARF of the result (this is calculated only for the finished
              result, i.e. no intermediate values)
    }

    raises:
    ConcCalculationStatusException if no calculation status is found
    """
    import struct

    if q is None:
        q = ()
    ans = dict(finished=False, concsize=0, fullsize=0, relconcsize=0, error=None)
    cache_map = self._cache_factory.get_mapping(corp)
    status = cache_map.get_calc_status(corp.subchash, q)
    if not status:
        raise ConcCalculationStatusException(
            'Concordance calculation not found', None)
    status.check_for_errors(TASK_TIME_LIMIT)
    if status.error:
        ans['finished'] = True
        ans['error'] = status.error
    elif status.cachefile and os.path.isfile(status.cachefile):
        # binary cache header: a 'finished' flag byte at offset 15 followed
        # by an int64 full size; an int32 concordance size sits at offset 32
        # (layout inferred from the seeks below - confirm against the writer).
        # 'with' guarantees the file handle is closed (the original leaked it).
        with open(status.cachefile, 'rb') as cache:
            cache.seek(15)
            finished = bool(ord(cache.read(1)))
            (fullsize, ) = struct.unpack('q', cache.read(8))
            cache.seek(32)
            (concsize, ) = struct.unpack('i', cache.read(4))
        if fullsize > 0:
            relconcsize = 1000000.0 * fullsize / corp.search_size
        else:
            relconcsize = 1000000.0 * concsize / corp.search_size
        if finished and not corp.is_subcorpus:
            # ARF is expensive - compute only for a finished, non-subcorpus result
            conc = manatee.Concordance(corp.unwrap(), status.cachefile)
            result_arf = round(conc.compute_ARF(), 2)
        else:
            result_arf = None
        ans['finished'] = finished
        ans['concsize'] = concsize
        ans['fullsize'] = fullsize
        ans['relconcsize'] = relconcsize
        ans['arf'] = result_arf
    return ans