Exemplo n.º 1
0
 def _export(self, text):
     """Export preprocessing information to the tag repository. Updates the
     TagRepository using the preprocessing result: one s tag per sentence,
     one lex tag per token tuple and one chunk tag per <tag>...</tag> pair,
     all appended directly to self.document.tags and indexed at the end."""
     ctag = None
     for sentence in text:
         sentence_attrs = {'id': TagId.next('s'), 'origin': PREPROCESSOR}
         # sentence tag; begin/end offsets are filled in from the lex tags
         stag = Tag('s', None, None, sentence_attrs)
         for token in sentence:
             if self._is_tag(token):
                 if not token.startswith('</'):
                     # opening tag: begin offset is set when the first
                     # contained lex tag is seen
                     ctag = Tag(token[1:-1], None, None, {
                         'id': TagId.next('c'),
                         'origin': PREPROCESSOR
                     })
                 else:
                     # closing tag: the chunk ends where its last token ends
                     ctag.end = last_ltag.end
                     self.document.tags.append(ctag)
                     ctag = None
             elif isinstance(token, tuple):
                 ltag = self._make_ltag(token)
                 self.document.tags.append(ltag)
                 if stag.begin is None:
                     stag.begin = token[3]
                 if ctag is not None and ctag.begin is None:
                     ctag.begin = ltag.begin
                 last_ltag = ltag
             else:
                 logger.warning('Unexpected token type')
         # NOTE(review): assumes every sentence contained at least one token
         # tuple, otherwise last_ltag is unbound here — confirm with callers
         stag.end = last_ltag.end
         self.document.tags.append(stag)
     # this indexing is needed because we bypassed the add_tag method on
     # TagRepository and instead directly appended to the tags list; index
     # once after all sentences rather than once per sentence
     self.document.tags.index()
Exemplo n.º 2
0
Arquivo: tree.py Projeto: jasonzou/ttk
def create_tarsqi_tree(tarsqidoc, element, links=False):
    """Build and return a TarsqiTree for the tags of tarsqidoc that fall
    inside element, a Tag instance with name=docelement. When the optional
    links parameter is True, also initialize the alinks, slinks and tlinks
    whose extents fall within the element boundaries."""
    tree = TarsqiTree(tarsqidoc, element)
    begin, end = element.begin, element.end
    # an anonymous tag spanning the whole element anchors the node tree
    root = Node(Tag(None, begin, end, {}), None, tree)
    # insertion order matters: sentences first, then chunks, then the rest
    for tagname in (SENTENCE, NOUNCHUNK, VERBCHUNK, LEX, EVENT, TIMEX):
        for tag in tarsqidoc.tags.find_tags(tagname, begin, end):
            try:
                root.insert(tag)
            except NodeInsertionError:
                # tags that do not fit the tree are kept separately
                tree.orphans.append(tag)
    root.set_positions()
    root.set_event_markers()
    # recursively import all nodes into the doc, but skip the root itself
    root.add_to_tree(tree)
    if links:
        tree.initialize_alinks(tarsqidoc.tags.find_linktags(ALINK, begin, end))
        tree.initialize_slinks(tarsqidoc.tags.find_linktags(SLINK, begin, end))
        tree.initialize_tlinks(tarsqidoc.tags.find_linktags(TLINK, begin, end))
    return tree
Exemplo n.º 3
0
def _create_event_tag(event_id, event):
    """Return an EVENT Tag spanning event['start']..event['end'], carrying
    the event class plus eid/eiid identifiers derived from event_id."""
    features = {'class': event['class']}
    features['eid'] = 'e' + event_id
    features['eiid'] = 'ei' + event_id
    return Tag('EVENT', event['start'], event['end'], features)
Exemplo n.º 4
0
 def _export_sentence(self, s_begin, s_end):
     """Append an s tag spanning s_begin..s_end to the TagRepository of the
     TarsqiDocument. A None begin offset means there is no sentence to add."""
     if s_begin is None:
         return
     attrs = {'id': TagId.next('s'), 'origin': TOKENIZER}
     self.document.tags.append(Tag('s', s_begin, s_end, attrs))
Exemplo n.º 5
0
 def _make_ltag(token):
     """Return a lex Tag for the token, a tuple whose slots are read as
     (text, pos, lemma, begin, end)."""
     begin, end = token[3], token[4]
     attrs = {'id': TagId.next('l'),
              'lemma': token[2],
              'pos': token[1],
              'text': token[0],
              'origin': PREPROCESSOR}
     return Tag('lex', begin, end, attrs)
Exemplo n.º 6
0
def export(text, tarsqidoc):
    """Export preprocessing information to the tag repository. Updates the
    TagRepository on tarsqidoc with the text that is the result of
    preprocessing: one s tag per sentence, one lex tag per token tuple and
    one chunk tag per <tag>...</tag> pair."""

    ctag = None

    for sentence in text:

        # sentence tag; begin/end offsets are filled in from the lex tags
        stag = Tag(TagId.next('s'), 's', None, None, {'origin': PREPROCESSOR})

        for token in sentence:

            # isinstance instead of type() == for the type checks below
            if isinstance(token, StringType) and token.startswith(
                    '<') and token.endswith('>'):
                if not token.startswith('</'):
                    # opening tag: begin offset is set when the first
                    # contained lex tag is seen
                    ctag = Tag(TagId.next('c'), token[1:-1], None, None,
                               {'origin': PREPROCESSOR})
                else:
                    # closing tag: the chunk ends where its last token ends
                    ctag.end = last_ltag.end
                    tarsqidoc.tags.append(ctag)
                    ctag = None

            elif isinstance(token, TupleType):
                ltag = Tag(
                    TagId.next('l'), 'lex', token[3], token[4], {
                        'lemma': token[2],
                        'pos': token[1],
                        'text': token[0],
                        'origin': PREPROCESSOR
                    })
                tarsqidoc.tags.append(ltag)
                if stag.begin is None:
                    stag.begin = token[3]
                if ctag is not None and ctag.begin is None:
                    ctag.begin = ltag.begin
                last_ltag = ltag

            else:
                logger.warning('Unexpected token type')

        # NOTE(review): assumes every sentence contained at least one token
        # tuple, otherwise last_ltag is unbound here — confirm with callers
        stag.end = last_ltag.end
        tarsqidoc.tags.append(stag)

    # indexing is needed because we bypassed the add_tag method on TagRepository
    # and instead directly appended to the tags list
    tarsqidoc.tags.index()
Exemplo n.º 7
0
 def _export(self, text):
     """Export preprocessing information to the tag repository. Updates the
     TagRepository using the preprocessing result: one s tag per sentence,
     one lex tag per token tuple and one chunk tag per <tag>...</tag> pair."""
     ctag = None
     for sentence in text:
         sentence_attrs = {'id': TagId.next('s'), 'origin': PREPROCESSOR}
         # sentence tag; begin/end offsets are filled in from the lex tags
         stag = Tag('s', None, None, sentence_attrs)
         for token in sentence:
             if self._is_tag(token):
                 if not token.startswith('</'):
                     # opening tag: begin offset is set when the first
                     # contained lex tag is seen
                     ctag = Tag(token[1:-1], None, None,
                                {'id': TagId.next('c'), 'origin': PREPROCESSOR})
                 else:
                     # closing tag: the chunk ends where its last token ends
                     ctag.end = last_ltag.end
                     self.document.tags.append(ctag)
                     ctag = None
             elif isinstance(token, TupleType):
                 ltag = self._make_ltag(token)
                 self.document.tags.append(ltag)
                 if stag.begin is None:
                     stag.begin = token[3]
                 if ctag is not None and ctag.begin is None:
                     ctag.begin = ltag.begin
                 last_ltag = ltag
             else:
                 logger.warning('Unexpected token type')
         # NOTE(review): assumes every sentence contained at least one token
         # tuple, otherwise last_ltag is unbound here — confirm with callers
         stag.end = last_ltag.end
         self.document.tags.append(stag)
     # this indexing is needed because we bypassed the add_tag method on
     # TagRepository and instead directly appended to the tags list; index
     # once after all sentences rather than once per sentence
     self.document.tags.index()
Exemplo n.º 8
0
 def as_ttk_tag(self):
     """Return this annotation as a TTK Tag. EVENT and TIMEX3 mentions get
     identifier-bearing feature dictionaries; any other class name is
     exported with an empty one."""
     tagname = self.classname.upper()
     # strip the fixed 15-character prefix of the mention id
     identifier = self.mention_id[15:]
     feats = {}
     if tagname == 'EVENT':
         feats['class'] = self.attributes['classType']
         feats['eid'] = 'e' + identifier
         feats['eiid'] = 'ei' + identifier
     elif tagname == 'TIMEX3':
         # TODO: value is not the right format
         feats['type'] = self.attributes['typeInfo']
         feats['value'] = self.attributes['value']
         feats['tid'] = 't' + identifier
     return Tag(tagname, self.start, self.end, feats)
Exemplo n.º 9
0
 def export(self, tarsqi_tags):
     """Saves all annotations in a TTK file. Each annotation contributes
     its own tag plus one TLINK tag per relation; everything goes into the
     tarsqi tag repository when tarsqi_tags is true, otherwise into the
     source document's repository."""
     tarsqidoc = _get_tarsqidoc(self.text_file, "text")
     # destination repository is the same for every annotation
     tagrepo = tarsqidoc.tags if tarsqi_tags else tarsqidoc.sourcedoc.tags
     for annotation in self.annotations.values():
         tag = annotation.as_ttk_tag()
         tagrepo.append(tag)
         for rel in annotation.relations:
             # source attribute name depends on whether this is a timex
             att1 = 'timeID' if annotation.classname == 'Timex3' else 'eventID'
             val1 = tag.attrs.get('tid', tag.attrs.get('eiid'))
             target_tag = self.annotations[rel[1]].as_ttk_tag()
             # target attribute name depends on whether the target is an event
             if target_tag.name == EVENT:
                 att2 = RELATED_TO_EVENT_INSTANCE
             else:
                 att2 = RELATED_TO_TIME
             val2 = target_tag.attrs.get(TID, target_tag.attrs.get(EIID))
             feats = {'relType': rel[0], att1: val1, att2: val2}
             tagrepo.append(Tag(TLINK, -1, -1, feats))
     tarsqidoc.print_all(self.ttk_file)
Exemplo n.º 10
0
 def _export_tokens(self, tokens):
     """Add s tags and lex tags to the TagRepository of the TarsqiDocument.
     The tokens are filtered first; a '<s>' marker flushes the sentence
     collected so far and starts a new one."""
     filtered = self._filter_tokens(tokens)
     s_begin = s_end = None
     for item in filtered:
         if item == '<s>':
             # close off the previous sentence (no-op when s_begin is None)
             self._export_sentence(s_begin, s_end)
             s_begin = s_end = None
             continue
         attrs = {'id': TagId.next('l'),
                  'text': item.text,
                  'origin': TOKENIZER}
         self.document.tags.append(Tag('lex', item.begin, item.end, attrs))
         if s_begin is None:
             s_begin = item.begin
         s_end = item.end
     # flush the final sentence
     self._export_sentence(s_begin, s_end)
     self.document.tags.index()
Exemplo n.º 11
0
 def _export_chunks(self, text):
     """Export ng and vg tags to the TagRepository on the TarsqiDocument.
     Chunk boundaries come from '<ng>'/'<vg>' open markers and their
     matching close markers; offsets come from the tokens in between."""
     for sentence in text:
         in_chunk = False
         chunk_begin = chunk_end = None
         for token in sentence:
             if token == '<ng>' or token == '<vg>':
                 # opening marker: start collecting token offsets
                 in_chunk = True
                 chunk_begin = chunk_end = None
             elif token == '</ng>' or token == '</vg>':
                 # closing marker: emit the chunk ('ng' or 'vg' from the tag)
                 in_chunk = False
                 attrs = {'id': TagId.next('c'), 'origin': CHUNKER}
                 self.document.tags.append(
                     Tag(token[2:-1], chunk_begin, chunk_end, attrs))
             elif in_chunk:
                 if chunk_begin is None:
                     chunk_begin = token[3]
                 chunk_end = token[4]
     self.document.tags.index()
Exemplo n.º 12
0
def export(text, tarsqidoc):
    """Export preprocessing information to the tag repository. Updates the
    TagRepository on tarsqidoc with the text that is the result of
    preprocessing: one s tag per sentence, one lex tag per token tuple and
    one chunk tag per <tag>...</tag> pair."""

    ctag = None

    for sentence in text:

        # sentence tag; begin/end offsets are filled in from the lex tags
        stag = Tag(TagId.next('s'), 's', None, None, {'origin': PREPROCESSOR})

        for token in sentence:

            # isinstance instead of type() == for the type checks below
            if isinstance(token, StringType) \
                    and token.startswith('<') and token.endswith('>'):
                if not token.startswith('</'):
                    # opening tag: begin offset is set when the first
                    # contained lex tag is seen
                    ctag = Tag(TagId.next('c'), token[1:-1], None, None,
                               {'origin': PREPROCESSOR})
                else:
                    # closing tag: the chunk ends where its last token ends
                    ctag.end = last_ltag.end
                    tarsqidoc.tags.append(ctag)
                    ctag = None

            elif isinstance(token, TupleType):
                ltag = Tag(TagId.next('l'), 'lex', token[3], token[4],
                           { 'lemma': token[2], 'pos': token[1], 'text': token[0],
                             'origin': PREPROCESSOR })
                tarsqidoc.tags.append(ltag)
                if stag.begin is None:
                    stag.begin = token[3]
                if ctag is not None and ctag.begin is None:
                    ctag.begin = ltag.begin
                last_ltag = ltag

            else:
                logger.warning('Unexpected token type')

        # NOTE(review): assumes every sentence contained at least one token
        # tuple, otherwise last_ltag is unbound here — confirm with callers
        stag.end = last_ltag.end
        tarsqidoc.tags.append(stag)

    # indexing is needed because we bypassed the add_tag method on TagRepository
    # and instead directly appended to the tags list
    tarsqidoc.tags.index()