Exemplo n.º 1
0
 def _export(self, text):
     """Export preprocessing information to the tag repository.

     Appends lexical, chunk and sentence ('s') tags built from the
     preprocessing result to self.document.tags and then re-indexes the
     repository.

     text is an iterable of sentences, each an iterable of tokens. Tokens
     accepted by self._is_tag are treated as chunk markers: one starting
     with '</' closes the open chunk, any other opens a new chunk named
     token[1:-1]. Tuple tokens are turned into lexical tags by
     self._make_ltag, with token[3] taken as the sentence begin offset.
     Any other token is logged and skipped.

     A closing marker without an open chunk that has a begin offset, and a
     sentence for which no begin offset was found (no lexical tokens), are
     logged and not exported, instead of failing on a missing lexical tag
     or borrowing the end offset of an unrelated one."""
     ctag = None
     # Most recent lexical tag. It is kept across sentences and starts out
     # as None so that it is always bound when it is consulted.
     last_ltag = None
     for sentence in text:
         sentence_attrs = {'id': TagId.next('s'), 'origin': PREPROCESSOR}
         stag = Tag('s', None, None, sentence_attrs)
         for token in sentence:
             if self._is_tag(token):
                 if not token.startswith('</'):
                     ctag = Tag(token[1:-1], None, None, {
                         'id': TagId.next('c'),
                         'origin': PREPROCESSOR
                     })
                 elif ctag is None or ctag.begin is None:
                     # Nothing to close, or the chunk never received a
                     # lexical token, so it has no offsets to export.
                     logger.warn('Closing chunk tag without an open '
                                 'non-empty chunk')
                     ctag = None
                 else:
                     # ctag.begin is set, so a lexical tag has been seen.
                     ctag.end = last_ltag.end
                     self.document.tags.append(ctag)
                     ctag = None
             elif isinstance(token, tuple):
                 ltag = self._make_ltag(token)
                 self.document.tags.append(ltag)
                 if stag.begin is None:
                     stag.begin = token[3]
                 if ctag is not None and ctag.begin is None:
                     ctag.begin = ltag.begin
                 last_ltag = ltag
             else:
                 logger.warn('Unexpected token type')
         if stag.begin is None:
             # The end offset would have to come from another sentence.
             logger.warn('Sentence without lexical tokens')
             continue
         stag.end = last_ltag.end
         self.document.tags.append(stag)
     # This indexing is needed because we bypassed the add_tag method on
     # TagRepository and instead directly appended to the tags list.
     # NOTE(review): assumes index() processes the whole tags list, so one
     # call after the loop replaces the former call per sentence - confirm.
     self.document.tags.index()
Exemplo n.º 2
0
 def _export(self, text):
     """Export preprocessing information to the tag repository.

     Updates the TagRepository in self.document.tags using the
     preprocessing result: lexical tags, chunk tags and sentence ('s') tags
     are appended to it, after which the repository is re-indexed.

     text is an iterable of sentences and each sentence is an iterable of
     tokens. A token for which self._is_tag is true is a chunk marker; it
     closes the open chunk when it starts with '</' and otherwise opens a
     chunk named token[1:-1]. A tuple token is handed to self._make_ltag
     to create a lexical tag, and its token[3] provides the begin offset
     of the sentence. All other tokens are logged and ignored.

     Closing markers that have no open chunk with a begin offset, and
     sentences that never received a begin offset (no lexical tokens), are
     logged and left out rather than raising on a missing lexical tag or
     taking their end offset from an unrelated tag."""
     ctag = None
     # Bound up front so the checks below never hit an undefined name; the
     # value deliberately survives from one sentence to the next.
     last_ltag = None
     for sentence in text:
         sentence_attrs = {'id': TagId.next('s'), 'origin': PREPROCESSOR}
         stag = Tag('s', None, None, sentence_attrs)
         for token in sentence:
             if self._is_tag(token):
                 if not token.startswith('</'):
                     ctag = Tag(token[1:-1], None, None,
                                {'id': TagId.next('c'),
                                 'origin': PREPROCESSOR})
                 elif ctag is None or ctag.begin is None:
                     # Unmatched close, or a chunk without any lexical
                     # token: there are no offsets that could be exported.
                     logger.warn('Closing chunk tag without an open '
                                 'non-empty chunk')
                     ctag = None
                 else:
                     # A begin offset implies last_ltag has been set.
                     ctag.end = last_ltag.end
                     self.document.tags.append(ctag)
                     ctag = None
             elif isinstance(token, tuple):
                 ltag = self._make_ltag(token)
                 self.document.tags.append(ltag)
                 if stag.begin is None:
                     stag.begin = token[3]
                 if ctag is not None and ctag.begin is None:
                     ctag.begin = ltag.begin
                 last_ltag = ltag
             else:
                 logger.warn('Unexpected token type')
         if stag.begin is None:
             # Without a token of its own the sentence has no extent.
             logger.warn('Sentence without lexical tokens')
             continue
         stag.end = last_ltag.end
         self.document.tags.append(stag)
     # This indexing is needed because we bypassed the add_tag method on
     # TagRepository and instead directly appended to the tags list.
     # NOTE(review): assumes index() processes the whole tags list, so one
     # call after the loop replaces the former call per sentence - confirm.
     self.document.tags.index()
Exemplo n.º 3
0
def export(text, tarsqidoc):
    """Export preprocessing information to the tag repository.

    Appends lexical ('lex'), chunk and sentence ('s') tags built from text,
    the result of preprocessing, to tarsqidoc.tags and then re-indexes that
    repository.

    text is an iterable of sentences, each an iterable of tokens. A string
    token that starts with '<' and ends with '>' is a chunk marker: one
    starting with '</' closes the open chunk, any other opens a chunk named
    token[1:-1]. A tuple token is read as (text, pos, lemma, begin, end)
    and becomes a 'lex' tag. Any other token is logged and skipped.

    A closing marker without an open chunk that has a begin offset, and a
    sentence for which no begin offset was found (no lexical tokens), are
    logged and not exported, instead of failing on a missing lexical tag or
    borrowing the end offset of an unrelated one."""
    ctag = None
    # Most recent lexical tag. It is kept across sentences and starts out
    # as None so that it is always bound when it is consulted.
    last_ltag = None

    for sentence in text:
        stag = Tag(TagId.next('s'), 's', None, None, {'origin': PREPROCESSOR})

        for token in sentence:
            if (isinstance(token, str)
                    and token.startswith('<') and token.endswith('>')):
                if not token.startswith('</'):
                    ctag = Tag(TagId.next('c'), token[1:-1], None, None,
                               {'origin': PREPROCESSOR})
                elif ctag is None or ctag.begin is None:
                    # Nothing to close, or the chunk never received a
                    # lexical token, so it has no offsets to export.
                    logger.warn('Closing chunk tag without an open '
                                'non-empty chunk')
                    ctag = None
                else:
                    # ctag.begin is set, so a lexical tag has been seen.
                    ctag.end = last_ltag.end
                    tarsqidoc.tags.append(ctag)
                    ctag = None

            elif isinstance(token, tuple):
                ltag = Tag(
                    TagId.next('l'), 'lex', token[3], token[4], {
                        'lemma': token[2],
                        'pos': token[1],
                        'text': token[0],
                        'origin': PREPROCESSOR
                    })
                tarsqidoc.tags.append(ltag)
                if stag.begin is None:
                    stag.begin = token[3]
                if ctag is not None and ctag.begin is None:
                    ctag.begin = ltag.begin
                last_ltag = ltag

            else:
                logger.warn('Unexpected token type')

        if stag.begin is None:
            # The end offset would have to come from another sentence.
            logger.warn('Sentence without lexical tokens')
            continue
        stag.end = last_ltag.end
        tarsqidoc.tags.append(stag)

    # indexing is needed because we bypassed the add_tag method on TagRepository
    # and instead directly appended to the tags list
    tarsqidoc.tags.index()
Exemplo n.º 4
0
def export(text, tarsqidoc):
    """Export preprocessing information to the tag repository.

    Updates the TagRepository in tarsqidoc.tags with text, the result of
    preprocessing: 'lex' tags, chunk tags and sentence ('s') tags are
    appended to it, after which the repository is re-indexed.

    text is an iterable of sentences and each sentence is an iterable of
    tokens. String tokens that start with '<' and end with '>' are chunk
    markers; a marker starting with '</' closes the open chunk and any
    other marker opens a chunk named token[1:-1]. Tuple tokens are read as
    (text, pos, lemma, begin, end) and turned into 'lex' tags. All other
    tokens are logged and ignored.

    Closing markers that have no open chunk with a begin offset, and
    sentences that never received a begin offset (no lexical tokens), are
    logged and left out rather than raising on a missing lexical tag or
    taking their end offset from an unrelated tag."""
    ctag = None
    # Bound up front so the checks below never hit an undefined name; the
    # value deliberately survives from one sentence to the next.
    last_ltag = None

    for sentence in text:
        stag = Tag(TagId.next('s'), 's', None, None, {'origin': PREPROCESSOR})

        for token in sentence:
            is_marker = (isinstance(token, str)
                         and token.startswith('<')
                         and token.endswith('>'))
            if is_marker:
                if not token.startswith('</'):
                    ctag = Tag(TagId.next('c'), token[1:-1], None, None,
                               {'origin': PREPROCESSOR})
                elif ctag is None or ctag.begin is None:
                    # Unmatched close, or a chunk without any lexical
                    # token: there are no offsets that could be exported.
                    logger.warn('Closing chunk tag without an open '
                                'non-empty chunk')
                    ctag = None
                else:
                    # A begin offset implies last_ltag has been set.
                    ctag.end = last_ltag.end
                    tarsqidoc.tags.append(ctag)
                    ctag = None

            elif isinstance(token, tuple):
                ltag = Tag(TagId.next('l'), 'lex', token[3], token[4],
                           {'lemma': token[2], 'pos': token[1],
                            'text': token[0], 'origin': PREPROCESSOR})
                tarsqidoc.tags.append(ltag)
                if stag.begin is None:
                    stag.begin = token[3]
                if ctag is not None and ctag.begin is None:
                    ctag.begin = ltag.begin
                last_ltag = ltag

            else:
                logger.warn('Unexpected token type')

        if stag.begin is None:
            # Without a token of its own the sentence has no extent.
            logger.warn('Sentence without lexical tokens')
            continue
        stag.end = last_ltag.end
        tarsqidoc.tags.append(stag)

    # indexing is needed because we bypassed the add_tag method on TagRepository
    # and instead directly appended to the tags list
    tarsqidoc.tags.index()