Пример #1
0
    def _load_raw_sent(self, corpus, corpus_id, token_id, kwic_len, tree_attrs):
        """
        Retrieve a sentence via Manatee.

        A concordance is created for the token positions
        token_id .. token_id + kwic_len - 1 and its first line is read with
        both contexts bounded by the sentence structure configured for
        the corpus.

        Args:
            corpus (manatee.Corpus): a corpus instance
            corpus_id (str): corpus ID
            token_id (int): token number/id
            kwic_len (int): number of tokens in KWIC
            tree_attrs (list of str): a list of positional attributes required by tree nodes/edges

        Returns (dict):
            data: a list of strings (Manatee raw format)
            kwic_pos: a tuple (first_kwic_idx, kwic_length)

            None is returned if the concordance provides no line.
        """
        encoding = corpus.get_conf('ENCODING')
        sentence_struct = self._conf.get_sentence_struct(corpus_id)
        query = ' '.join('[#%d]' % k for k in range(token_id, token_id + kwic_len))
        conc = manatee.Concordance(corpus, query, 1, -1)
        conc.sync()
        attrs = ','.join(tree_attrs)
        kl = manatee.KWICLines(corpus, conc.RS(True, 0, 1),
                               '-1:%s' % sentence_struct,
                               '1:%s' % sentence_struct,
                               attrs, attrs, '', '')
        if not kl.nextline():
            return None
        left_tk = kl.get_left()
        kwic_tk = kl.get_kwic()
        raw_items = left_tk + kwic_tk + kl.get_right()
        # Floor division keeps both values integers on Python 3 as well
        # (plain '/' would produce floats there).
        # NOTE(review): the divisor 4 presumably reflects the number of raw
        # list items per token - confirm against the Manatee output format.
        return dict(data=[import_string(s, from_encoding=encoding) for s in raw_items],
                    kwic_pos=(len(left_tk) // 4, len(kwic_tk) // 4))
Пример #2
0
def matching_structattr(corp: manatee.Corpus, struct: str, attr: str, val: str,
                        search_attr: str) -> Tuple[List[str], int, int]:
    """
    Collect values of [struct].[search_attr] found in structures
    matching [struct].[attr] = [val].

    Returns a 3-tuple:
        - sorted list of distinct values (the first reference of each line),
        - size of the whole concordance,
        - number of concordance lines requested for processing
          (the concordance size capped by an internal limit).

    If Manatee raises a RuntimeError whose message contains 'AttrNotFound',
    ([], 0, 0) is returned. Any other RuntimeError is propagated.
    """
    max_lines = 1000000
    found = set()
    try:
        conc = manatee.Concordance(corp, f'<{struct} {attr}="{val}">[]', 0, -1)
        conc.sync()
        total = conc.size()

        lines = manatee.KWICLines(corp, conc.RS(True, 0, max_lines), '-1', '1',
                                  'word', '', '', f'={struct}.{search_attr}')
        while lines.nextline():
            ref_list = lines.get_ref_list()
            if len(ref_list) > 0:
                found.add(ref_list[0])
        return sorted(found), total, min(total, max_lines)
    except RuntimeError as ex:
        if 'AttrNotFound' not in str(ex):
            raise
        return [], 0, 0
Пример #3
0
 def _load_raw_sent(self, corpus, canonical_corpus_id, token_id,
                    tree_attrs):
     """
     Load raw Manatee data of the sentence containing a token.

     A single-token concordance ([#token_id]) is created and its first
     line is read with both contexts bounded by the sentence structure
     configured for the corpus.

     Args:
         corpus: a Manatee corpus instance
         canonical_corpus_id: corpus ID used to look up the sentence structure
         token_id (int): token number/id
         tree_attrs (list of str): positional attributes to be fetched

     Returns:
         a list of items (left context + KWIC + right context), each passed
         through import_string() using the corpus encoding; None if
         the concordance provides no line
     """
     src_encoding = corpus.get_conf('ENCODING')
     sent_struct = self._conf.get_sentence_struct(canonical_corpus_id)
     conc = manatee.Concordance(corpus, '[#%d]' % token_id, 1, -1)
     conc.sync()
     attr_list = ','.join(tree_attrs)
     lines = manatee.KWICLines(corpus, conc.RS(True, 0, 1),
                               '-1:%s' % sent_struct, '1:%s' % sent_struct,
                               attr_list, attr_list, '', '')
     if not lines.nextline():
         return None
     left_part = lines.get_left()
     kwic_part = lines.get_kwic()
     right_part = lines.get_right()
     return [import_string(item, from_encoding=src_encoding)
             for item in left_part + kwic_part + right_part]
Пример #4
0
    def get_groups_first_line(self):
        """
        Find the first concordance line with a line group set.

        returns:
        1-based order of the first line for which get_linegroup() provides
        a truthy value; None if there is no such line or if self.conc is
        an InitialConc instance
        """
        if isinstance(self.conc, InitialConc):
            return None

        corp = self.conc.corp()
        whole_range = self.conc.RS(True, 0, int(self.conc.size()))
        kl = manatee.KWICLines(corp, r=whole_range, left='0', right='0',
                               kwica='', ctxa='', struca='', refa='')
        line_order = 0
        while kl.nextline():
            line_order += 1
            if kl.get_linegroup():
                return line_order
        return None
Пример #5
0
def _find_refs(conc, attr, alignment, idx):
    """
    Print alignment debugging information for the beginning of a concordance.

    For each line provided by KWICLines (the requested range is 0..1),
    the 's.id,#' references are parsed by _parse_refs(), the structure
    id is translated via attr.str2id() and the result is looked up with
    alignment.find_left_val(). The values are printed to stdout; nothing
    is returned.
    """
    line_limit = 1
    max_ctx = 10
    ref_spec = 's.id,#'
    kw = manatee.KWICLines(conc.corp(), conc.RS(True, 0, line_limit), '-1',
                           '1', '', '', '', ref_spec, max_ctx)
    template = '#{0} -- {1} -- 1st sentence in corp: {2} -- aligndef line: {3}'
    while kw.nextline():
        # the token index part of the references is not needed here
        struct_id, _ = _parse_refs(kw.get_refs())
        sent_order = attr.str2id(struct_id)
        aligned = alignment.find_left_val(sent_order)
        print(template.format(idx, struct_id, sent_order, aligned))
Пример #6
0
    def generate_kwiclines(self, query, corpus):
        """
        Collect unique tag values matching a query.

        Parameters
        ----------
        query : str
          a query to be used to extract all tag values
        corpus
          a corpus passed as-is to manatee.Concordance
          (NOTE(review): formerly documented as a corpus name - confirm
          the expected type)

        Returns
        -------
        list
          a sorted list of all unique (whitespace-stripped) tag values
          found in the first KWIC item of each concordance line
        """
        conc = manatee.Concordance(corpus, query, 0)
        kw = manatee.KWICLines(conc, '-1#', '1#', 'tag', 'tag', '', '#', 0)

        def tag_at(line_idx):
            # move to the requested line and read its first KWIC item
            kw.nextline(line_idx)
            return kw.get_kwic()[0].strip()

        return sorted({tag_at(line_idx) for line_idx in range(conc.size())})
Пример #7
0
def add_structattr_support(corp: KCorpus, attrs, token_id):
    """
    A decorator function which turns 'fetch_posattr' into
    a more general function which is able to load
    structural attributes too. The load is performed only
    once for all possible structural attributes.

    Args:
        corp: a corpus providing get_conf() and unwrap()
        attrs: attribute names; the ones containing '.' are treated
            as structural attributes
        token_id: position of the token the structural attribute values
            are loaded for (converted via int())

    Returns:
        a decorator; the decorated function is expected to have
        the signature fn(corp, attr, token_id, num_tokens). For attr
        containing '.', the wrapper returns the preloaded value (KeyError
        is raised if no value was loaded for it), otherwise it calls fn.
    """
    from functools import wraps

    data = {}
    refs = [x for x in attrs if '.' in x]
    # Loaded values are matched by the configured LABEL when there is one,
    # otherwise by the attribute name itself.
    # NOTE(review): presumably LABEL is the key Manatee uses in get_refs()
    # output - confirm.
    refs_mapping = {}
    for n in refs:
        lab = corp.get_conf(f'{n}.LABEL')
        refs_mapping[lab if lab else n] = n

    if refs:
        conc = manatee.Concordance(corp.unwrap(),
                                   '[#{}]'.format(int(token_id)), 1, -1)
        conc.sync()
        rs = conc.RS(True, 0, 0)
        kl = manatee.KWICLines(corp.unwrap(), rs, '-1', '1', 'word', '', '',
                               ','.join(refs))
        if kl.nextline():
            for kv in kl.get_refs().split(','):
                # split on the first '=' only so a value containing '='
                # does not break the unpacking
                k, sep, v = kv.partition('=')
                if not sep:
                    continue
                name = refs_mapping.get(k)
                if name is not None:
                    data[name] = v

    def decorator(fn):
        @wraps(fn)
        def wrapper(corp, attr, token_id, num_tokens):
            if '.' in attr:
                return data[attr]
            return fn(corp, attr, token_id, num_tokens)

        return wrapper

    return decorator
Пример #8
0
    def kwiclines(self, args):
        """
        Generates list of 'kwic' (= keyword in context) lines according to
        the provided Concordance object and additional parameters (like
        page number, width of the left and right context etc.).

        arguments:
        args -- a KwicLinesArgs instance

        returns:
        a list of dictionaries (one per concordance line) with keys
        toknum, hitlen, kwiclen, ref, Kwic, linegroup, leftsize, rightsize,
        linenum, Left, Right, leftspace, rightspace
        """
        # Evaluated once: the value decides whether speech_struct_attr is
        # available, so the loop below must rely on the very same result.
        has_audio = self.speech_segment_has_audio(args.speech_segment)

        # add structures needed to render speech playback information
        all_structs = args.structs
        if has_audio:
            speech_struct_attr_name = '.'.join(args.speech_segment)
            speech_struct_attr = self.corpus.get_attr(speech_struct_attr_name)
            if speech_struct_attr_name not in args.structs:
                all_structs += ',' + speech_struct_attr_name
        else:
            speech_struct_attr_name = ''
            speech_struct_attr = None

        if args.righttoleft:
            rightlabel, leftlabel = 'Left', 'Right'
            # NOTE(review): 'ltr' is appended to args.structs only after
            # all_structs has been derived from it, so it does not seem to
            # reach KWICLines - confirm whether this is intended.
            args.structs += ',ltr'
        else:
            leftlabel, rightlabel = 'Left', 'Right'

        # self.conc.corp() must be used here instead of self.corpus
        # because in case of parallel corpora these two are different and only the latter one is correct
        if isinstance(self.conc, InitialConc):
            kl = EmptyKWiclines()
        else:
            kl = manatee.KWICLines(
                self.conc.corp(),
                self.conc.RS(True, args.fromline, args.toline),
                args.leftctx, args.rightctx, args.attrs, args.ctxattrs,
                all_structs, args.refs)

        filter_out_speech_tag = (args.speech_segment
                                 and args.speech_segment[0] not in args.structs
                                 and speech_struct_attr_name in all_structs)

        lines = []
        maxleftsize = 0
        maxrightsize = 0
        i = args.fromline
        while kl.nextline():
            linegroup = kl.get_linegroup()
            if not linegroup:  # manatee returns 0 in case of no group (but None will work too here)
                linegroup = -1  # client-side uses -1 as "no group"
            if has_audio:
                leftmost_speech_id = speech_struct_attr.pos2str(
                    kl.get_ctxbeg())
            else:
                leftmost_speech_id = None

            # the speech id found at the end of one chunk is passed
            # to the processing of the following chunk
            leftwords, last_left_speech_id = self.update_speech_boundaries(
                args.speech_segment, tokens2strclass(kl.get_left()), 'left',
                filter_out_speech_tag, leftmost_speech_id)
            kwicwords, last_left_speech_id = self.update_speech_boundaries(
                args.speech_segment, tokens2strclass(kl.get_kwic()), 'kwic',
                filter_out_speech_tag, last_left_speech_id)
            rightwords = self.update_speech_boundaries(
                args.speech_segment, tokens2strclass(kl.get_right()), 'right',
                filter_out_speech_tag, last_left_speech_id)[0]

            leftwords = self.postproc_text_chunk(leftwords)
            kwicwords = self.postproc_text_chunk(kwicwords)
            rightwords = self.postproc_text_chunk(rightwords)

            if args.righttoleft and Kwic.isengword(kwicwords[0]):
                leftwords, rightwords = Kwic.update_right_to_left(
                    leftwords, rightwords)

            # text width of a context: items of class 'strc' are skipped,
            # each remaining item counts with one extra character
            leftsize = sum(len(w['str']) + 1 for w in leftwords
                           if w['class'] != 'strc')
            maxleftsize = max(maxleftsize, leftsize)
            rightsize = sum(len(w['str']) + 1 for w in rightwords
                            if w['class'] != 'strc')
            maxrightsize = max(maxrightsize, rightsize)

            line_data = dict(toknum=kl.get_pos(),
                             hitlen=Kwic.non1hitlen(kl.get_kwiclen()),
                             kwiclen=kl.get_kwiclen(),
                             ref=list(kl.get_ref_list()),
                             Kwic=kwicwords,
                             linegroup=linegroup,
                             leftsize=leftsize,
                             rightsize=rightsize,
                             linenum=i)
            line_data[leftlabel] = leftwords
            line_data[rightlabel] = rightwords
            lines.append(line_data)
            i += 1

        # pad each line to the width of the widest left/right context
        for line in lines:
            line['leftspace'] = ' ' * (maxleftsize - line['leftsize'])
            line['rightspace'] = ' ' * (maxrightsize - line['rightsize'])
        return lines
Пример #9
0
    def kwiclines(self,
                  speech_segment,
                  fromline,
                  toline,
                  leftctx='-5',
                  rightctx='5',
                  attrs='word',
                  ctxattrs='word',
                  refs='#',
                  user_structs='p',
                  labelmap=None,
                  righttoleft=False,
                  alignlist=None,
                  align_attrname='align',
                  aattrs='word',
                  astructs=''):
        """
        Generates list of 'kwic' (= keyword in context) lines according to
        the provided Concordance object and additional parameters (like
        page number, width of the left and right context etc.).

        arguments:
        speech_segment -- 2-tuple; if truthy, its items joined by '.' name
                          the structural attribute used for speech data
        fromline, toline -- range of concordance lines to be read
        leftctx, rightctx -- left/right context specification
        attrs, ctxattrs -- attributes of the KWIC and of the context
        refs -- references to be attached to each line
        user_structs -- structures requested by the user
        labelmap -- mapping from a line group (as a string) to its label;
                    unmapped groups are labelled '#' + group, lines without
                    a group '_'; defaults to an empty mapping and is never
                    modified here
        righttoleft -- if True, the 'Left' and 'Right' keys are swapped
        alignlist, align_attrname, aattrs, astructs -- not used by this
                    implementation

        returns:
        a list of dictionaries (one per concordance line) with keys
        toknum, hitlen, kwiclen, ref, Kwic, linegroup, leftsize, rightsize,
        linenum, Left, Right, leftspace, rightspace
        """
        # work on a private copy so neither the caller's mapping nor
        # a shared default value can be modified
        labelmap = {} if labelmap is None else labelmap.copy()
        labelmap['_'] = '_'

        # structs represent which structures are requested by user
        # all_structs contain also internal structures needed to render
        # additional information (like the speech links)
        all_structs = user_structs
        if speech_segment:
            speech_struct_attr_name = '.'.join(speech_segment)
            speech_struct_attr = self.corpus.get_attr(speech_struct_attr_name)
            if speech_struct_attr_name not in user_structs:
                all_structs += ',' + speech_struct_attr_name
        else:
            speech_struct_attr_name = None
            speech_struct_attr = None

        if righttoleft:
            rightlabel, leftlabel = 'Left', 'Right'
            # NOTE(review): 'ltr' is appended only after all_structs has
            # been derived, so it does not seem to reach KWICLines -
            # confirm whether this is intended.
            user_structs += ',ltr'
        else:
            leftlabel, rightlabel = 'Left', 'Right'

        # self.conc.corp() must be used here instead of self.corpus
        # because in case of parallel corpora these two are different and only the latter one is correct
        kl = manatee.KWICLines(self.conc.corp(),
                               self.conc.RS(True, fromline, toline), leftctx,
                               rightctx, attrs, ctxattrs, all_structs, refs)

        # the 'and' chain short-circuits on a falsy speech_segment, so
        # the None attribute name is never tested against all_structs
        filter_out_speech_tag = (speech_segment
                                 and speech_segment[0] not in user_structs
                                 and speech_struct_attr_name in all_structs)

        lines = []
        maxleftsize = 0
        maxrightsize = 0
        i = fromline
        while kl.nextline():
            linegroup = str(kl.get_linegroup() or '_')
            linegroup = labelmap.get(linegroup, '#' + linegroup)
            if speech_segment:
                leftmost_speech_id = speech_struct_attr.pos2str(
                    kl.get_ctxbeg())
            else:
                leftmost_speech_id = None

            # the speech id found at the end of one chunk is passed
            # to the processing of the following chunk
            leftwords, last_left_speech_id = self.postproc_kwicline_part(
                speech_segment, tokens2strclass(kl.get_left()), 'left',
                filter_out_speech_tag, leftmost_speech_id)
            kwicwords, last_left_speech_id = self.postproc_kwicline_part(
                speech_segment, tokens2strclass(kl.get_kwic()), 'kwic',
                filter_out_speech_tag, last_left_speech_id)
            rightwords = self.postproc_kwicline_part(
                speech_segment, tokens2strclass(kl.get_right()), 'right',
                filter_out_speech_tag, last_left_speech_id)[0]

            if righttoleft and Kwic.isengword(kwicwords[0]):
                leftwords, rightwords = Kwic.update_right_to_left(
                    leftwords, rightwords)

            # text width of a context: items of class 'strc' are skipped,
            # each remaining item counts with one extra character
            leftsize = sum(len(w['str']) + 1 for w in leftwords
                           if w['class'] != 'strc')
            maxleftsize = max(maxleftsize, leftsize)
            rightsize = sum(len(w['str']) + 1 for w in rightwords
                            if w['class'] != 'strc')
            maxrightsize = max(maxrightsize, rightsize)

            line_data = dict(toknum=kl.get_pos(),
                             hitlen=Kwic.non1hitlen(kl.get_kwiclen()),
                             kwiclen=kl.get_kwiclen(),
                             ref=self.import_string(kl.get_refs()),
                             Kwic=kwicwords,
                             linegroup=linegroup,
                             leftsize=leftsize,
                             rightsize=rightsize,
                             linenum=i)
            line_data[leftlabel] = leftwords
            line_data[rightlabel] = rightwords
            lines.append(line_data)
            i += 1

        # pad each line to the width of the widest left/right context
        for line in lines:
            line['leftspace'] = ' ' * (maxleftsize - line['leftsize'])
            line['rightspace'] = ' ' * (maxrightsize - line['rightsize'])
        return lines