def _export(self, text):
    """Export preprocessing information to the tag repository.

    Updates the TagRepository on self.document using the preprocessing
    result in text: a list of sentences, where each sentence is a list of
    tokens that are either chunk-tag strings (e.g. '<ng>', '</ng>') or
    token tuples whose elements 3 and 4 are the begin and end offsets."""
    ctag = None  # the chunk tag currently being built, if any
    for sentence in text:
        sentence_attrs = {'id': TagId.next('s'), 'origin': PREPROCESSOR}
        # offsets start as None and are filled in from the sentence tokens
        stag = Tag('s', None, None, sentence_attrs)
        for token in sentence:
            if self._is_tag(token):
                if not token.startswith('</'):
                    # opening chunk tag; begin offset is set when the
                    # first token tuple inside the chunk is seen
                    ctag = Tag(token[1:-1], None, None,
                               {'id': TagId.next('c'),
                                'origin': PREPROCESSOR})
                else:
                    # closing chunk tag: end the chunk at the last lex tag
                    ctag.end = last_ltag.end
                    self.document.tags.append(ctag)
                    ctag = None
            # isinstance instead of a type() equality test (also accepts
            # tuple subclasses, which the old check rejected)
            elif isinstance(token, tuple):
                ltag = self._make_ltag(token)
                self.document.tags.append(ltag)
                if stag.begin is None:
                    stag.begin = token[3]
                if ctag is not None and ctag.begin is None:
                    ctag.begin = ltag.begin
                last_ltag = ltag
            else:
                logger.warn('Unexpected token type')
        # NOTE(review): assumes every sentence contains at least one token
        # tuple; an empty sentence would leave last_ltag unbound or stale.
        stag.end = last_ltag.end
        self.document.tags.append(stag)
    # this indexing is needed because we bypassed the add_tag method on
    # TagRepository and instead directly appended to the tags list
    self.document.tags.index()
def create_tarsqi_tree(tarsqidoc, element, links=False):
    """Return an instance of TarsqiTree built from the tags in tarsqidoc
    that are included in element, which is a Tag instance with
    name=docelement. When the optional links parameter is True, also
    include the links that fall within the boundaries of the element."""
    tree = TarsqiTree(tarsqidoc, element)
    begin, end = element.begin, element.end
    # a synthetic top node spanning the whole element acts as the
    # insertion root; it is not itself imported into the tree
    top_node = Node(Tag(None, begin, end, {}), None, tree)
    # insertion order matters: wider spans (sentences, chunks) must go in
    # before the tags they contain
    for tagname in (SENTENCE, NOUNCHUNK, VERBCHUNK, LEX, EVENT, TIMEX):
        for tag in tarsqidoc.tags.find_tags(tagname, begin, end):
            try:
                top_node.insert(tag)
            except NodeInsertionError:
                # tags that do not nest cleanly are kept as orphans
                tree.orphans.append(tag)
    top_node.set_positions()
    top_node.set_event_markers()
    # recursively import all nodes into the tree, skipping top_node itself
    top_node.add_to_tree(tree)
    if links:
        tree.initialize_alinks(tarsqidoc.tags.find_linktags(ALINK, begin, end))
        tree.initialize_slinks(tarsqidoc.tags.find_linktags(SLINK, begin, end))
        tree.initialize_tlinks(tarsqidoc.tags.find_linktags(TLINK, begin, end))
    return tree
def _create_event_tag(event_id, event):
    """Return an EVENT Tag spanning the event's start/end offsets.

    The bare event_id string is prefixed with 'e' and 'ei' to form the
    eid and eiid attributes; the class attribute is copied from the
    event dictionary."""
    attrs = {'class': event['class'],
             'eid': 'e' + event_id,
             'eiid': 'ei' + event_id}
    return Tag('EVENT', event['start'], event['end'], attrs)
def _export_sentence(self, s_begin, s_end):
    """Add an s tag with the given offsets to the TagRepository of the
    TarsqiDocument. A no-op when s_begin is None (no open sentence)."""
    if s_begin is None:
        return
    attrs = {'id': TagId.next('s'), 'origin': TOKENIZER}
    self.document.tags.append(Tag('s', s_begin, s_end, attrs))
def _make_ltag(token):
    """Return a lex Tag for token, a tuple whose first five elements are
    the token text, part of speech, lemma, begin offset and end offset."""
    attrs = {'id': TagId.next('l'),
             'lemma': token[2],
             'pos': token[1],
             'text': token[0],
             'origin': PREPROCESSOR}
    return Tag('lex', token[3], token[4], attrs)
def export(text, tarsqidoc):
    """Export preprocessing information to the tag repository. Updates the
    TagRepository with the text that is the result of preprocessing.

    Note: uses the older five-argument Tag signature (id, name, begin,
    end, attrs) and Python 2 type names (StringType, TupleType)."""
    # the chunk tag currently being built, if any
    ctag = None
    for sentence in text:
        # sentence offsets start as None and are filled in from the tokens
        stag = Tag(TagId.next('s'), 's', None, None, {'origin': PREPROCESSOR})
        for token in sentence:
            if type(token) == StringType and token.startswith(
                    '<') and token.endswith('>'):
                if not token.startswith('</'):
                    # opening chunk tag like '<ng>'; the begin offset is
                    # set when the first token tuple inside it is seen
                    ctag = Tag(TagId.next('c'), token[1:-1], None, None,
                               {'origin': PREPROCESSOR})
                else:
                    # closing chunk tag: end the chunk at the last lex tag
                    # NOTE(review): assumes a token tuple preceded this
                    # tag, otherwise last_ltag is unbound or stale
                    ctag.end = last_ltag.end
                    tarsqidoc.tags.append(ctag)
                    ctag = None
            elif type(token) == TupleType:
                # token tuple is (text, pos, lemma, begin, end)
                ltag = Tag(
                    TagId.next('l'), 'lex', token[3], token[4], {
                        'lemma': token[2],
                        'pos': token[1],
                        'text': token[0],
                        'origin': PREPROCESSOR
                    })
                tarsqidoc.tags.append(ltag)
                if stag.begin is None:
                    stag.begin = token[3]
                if ctag is not None and ctag.begin is None:
                    ctag.begin = ltag.begin
                # last_end_offset is assigned but never read in this
                # function; kept as-is
                last_end_offset = token[4]
                last_ltag = ltag
            else:
                logger.warn('Unexpected token type')
        # close the sentence at the last lex tag seen
        stag.end = last_ltag.end
        tarsqidoc.tags.append(stag)
    # indexing is needed because we bypassed the add_tag method on TagRepository
    # and instead directly appended to the tags list
    tarsqidoc.tags.index()
def _export(self, text):
    """Export preprocessing information to the tag repository. Updates the
    TagRepository using the preprocessing result.

    text is a list of sentences, each a list of tokens that are either
    chunk-tag strings (per self._is_tag) or token tuples whose elements
    3 and 4 are the begin and end offsets."""
    # the chunk tag currently being built, if any
    ctag = None
    for sentence in text:
        sentence_attrs = {
            'id': TagId.next('s'),
            'origin': PREPROCESSOR
        }
        # sentence offsets start as None and are filled in from the tokens
        stag = Tag('s', None, None, sentence_attrs)
        for token in sentence:
            if self._is_tag(token):
                if not token.startswith('</'):
                    # opening chunk tag; begin offset is set when the
                    # first token tuple inside the chunk is seen
                    ctag = Tag(token[1:-1], None, None, {
                        'id': TagId.next('c'),
                        'origin': PREPROCESSOR
                    })
                else:
                    # closing chunk tag: end the chunk at the last lex tag
                    # NOTE(review): assumes a token tuple preceded this
                    # tag, otherwise last_ltag is unbound or stale
                    ctag.end = last_ltag.end
                    self.document.tags.append(ctag)
                    ctag = None
            elif type(token) == TupleType:
                ltag = self._make_ltag(token)
                self.document.tags.append(ltag)
                if stag.begin is None:
                    stag.begin = token[3]
                if ctag is not None and ctag.begin is None:
                    ctag.begin = ltag.begin
                # last_end_offset is assigned but never read here; kept as-is
                last_end_offset = token[4]
                last_ltag = ltag
            else:
                logger.warn('Unexpected token type')
        # close the sentence at the last lex tag seen
        stag.end = last_ltag.end
        self.document.tags.append(stag)
    # this indexing is needed because we bypassed the add_tag method on
    # TagRepository and instead directly appended to the tags list
    self.document.tags.index()
def as_ttk_tag(self):
    """Return a Tag representing this annotation in TTK terms.

    EVENT and TIMEX3 tags get feature dictionaries filled from the
    annotation's attributes; any other tag name gets no features."""
    tagname = self.classname.upper()
    # tail of the mention id past a fixed-length prefix — TODO(review):
    # confirm the 15-character prefix assumption against the data
    identifier = self.mention_id[15:]
    feats = {}
    if tagname == 'EVENT':
        feats = {'class': self.attributes['classType'],
                 'eid': 'e' + identifier,
                 'eiid': 'ei' + identifier}
    elif tagname == 'TIMEX3':
        # TODO: value is not the right format
        feats = {'type': self.attributes['typeInfo'],
                 'value': self.attributes['value'],
                 'tid': 't' + identifier}
    return Tag(tagname, self.start, self.end, feats)
def export(self, tarsqi_tags):
    """Saves all annotations in a TTK file.

    Each annotation contributes its own tag plus one TLINK tag per
    relation it participates in; tags go to the tarsqi tag repository
    when tarsqi_tags is true, otherwise to the source document tags."""
    tarsqidoc = _get_tarsqidoc(self.text_file, "text")
    for annotation in self.annotations.values():
        source_tag = annotation.as_ttk_tag()
        pending = [source_tag]
        # the source side of every relation is the same for this
        # annotation, so compute it once before the relation loop
        source_att = ('timeID' if annotation.classname == 'Timex3'
                      else 'eventID')
        source_val = source_tag.attrs.get('tid',
                                          source_tag.attrs.get('eiid'))
        for rel in annotation.relations:
            target_tag = self.annotations[rel[1]].as_ttk_tag()
            target_att = (RELATED_TO_EVENT_INSTANCE
                          if target_tag.name == EVENT
                          else RELATED_TO_TIME)
            target_val = target_tag.attrs.get(TID,
                                              target_tag.attrs.get(EIID))
            feats = {'relType': rel[0],
                     source_att: source_val,
                     target_att: target_val}
            pending.append(Tag(TLINK, -1, -1, feats))
        repo = tarsqidoc.tags if tarsqi_tags else tarsqidoc.sourcedoc.tags
        for tag in pending:
            repo.append(tag)
    tarsqidoc.print_all(self.ttk_file)
def _export_tokens(self, tokens):
    """Add s tags and lex tags to the TagRepository of the TarsqiDocument.

    tokens is a sequence of token objects interleaved with '<s>' markers
    that open a new sentence."""
    tokens = self._filter_tokens(tokens)
    s_begin, s_end = None, None
    for token in tokens:
        if token == '<s>':
            # close off the previous sentence (if any) and start fresh
            self._export_sentence(s_begin, s_end)
            s_begin, s_end = None, None
            continue
        begin, end = token.begin, token.end
        ltag = Tag('lex', begin, end,
                   {'id': TagId.next('l'),
                    'text': token.text,
                    'origin': TOKENIZER})
        self.document.tags.append(ltag)
        if s_begin is None:
            s_begin = begin
        s_end = end
    # the last sentence has no trailing '<s>' marker, so flush it here
    self._export_sentence(s_begin, s_end)
    self.document.tags.index()
def _export_chunks(self, text):
    """Export ng and vg tags to the TagRepository on the TarsqiDocument.

    text is a list of sentences, each a list of tokens that are either
    chunk markers ('<ng>', '</ng>', '<vg>', '</vg>') or token tuples
    whose elements 3 and 4 are the begin and end offsets."""
    for sentence in text:
        in_chunk, chunk_begin, chunk_end = False, None, None
        for token in sentence:
            if token in ('<ng>', '<vg>'):
                # a chunk opens; its offsets come from the tokens inside
                in_chunk, chunk_begin, chunk_end = True, None, None
            elif token in ('</ng>', '</vg>'):
                in_chunk = False
                attrs = {'id': TagId.next('c'), 'origin': CHUNKER}
                # strip '</' and '>' to recover the chunk tag name
                self.document.tags.append(
                    Tag(token[2:-1], chunk_begin, chunk_end, attrs))
            elif in_chunk:
                if chunk_begin is None:
                    chunk_begin = token[3]
                chunk_end = token[4]
    self.document.tags.index()
def export(text, tarsqidoc):
    """Export preprocessing information to the tag repository. Updates the
    TagRepository with the text that is the result of preprocessing.

    Note: uses the older five-argument Tag signature (id, name, begin,
    end, attrs) and Python 2 type names (StringType, TupleType)."""
    # the chunk tag currently being built, if any
    ctag = None
    for sentence in text:
        # sentence offsets start as None and are filled in from the tokens
        stag = Tag(TagId.next('s'), 's', None, None, {'origin': PREPROCESSOR})
        for token in sentence:
            if type(token) == StringType and token.startswith('<') and token.endswith('>'):
                if not token.startswith('</'):
                    # opening chunk tag like '<ng>'; the begin offset is
                    # set when the first token tuple inside it is seen
                    ctag = Tag(TagId.next('c'), token[1:-1], None, None,
                               {'origin': PREPROCESSOR})
                else:
                    # closing chunk tag: end the chunk at the last lex tag
                    # NOTE(review): assumes a token tuple preceded this
                    # tag, otherwise last_ltag is unbound or stale
                    ctag.end = last_ltag.end
                    tarsqidoc.tags.append(ctag)
                    ctag = None
            elif type(token) == TupleType:
                # token tuple is (text, pos, lemma, begin, end)
                ltag = Tag(TagId.next('l'), 'lex', token[3], token[4], {
                    'lemma': token[2],
                    'pos': token[1],
                    'text': token[0],
                    'origin': PREPROCESSOR
                })
                tarsqidoc.tags.append(ltag)
                if stag.begin is None:
                    stag.begin = token[3]
                if ctag is not None and ctag.begin is None:
                    ctag.begin = ltag.begin
                # last_end_offset is assigned but never read in this
                # function; kept as-is
                last_end_offset = token[4]
                last_ltag = ltag
            else:
                logger.warn('Unexpected token type')
        # close the sentence at the last lex tag seen
        stag.end = last_ltag.end
        tarsqidoc.tags.append(stag)
    # indexing is needed because we bypassed the add_tag method on TagRepository
    # and instead directly appended to the tags list
    tarsqidoc.tags.index()