def reconcile(self, sents):
    """
    Update this document with the newly annotated tokens.
    """
    # TIMEXes need unique IDs
    all_ts = set()
    for sent in sents:
        for (tok, pos, ts) in sent:
            for t in ts:
                all_ts.add(t)
    add_timex_ids(all_ts)
    self._sents = copy.deepcopy(sents)
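# For reference: the internal representation passed to reconcile() is a list
# of sentences, each a list of (token, POS tag, set of Timex objects)
# triples, as the loops above unpack. A minimal illustrative sketch -- `doc`
# and the tag values used here are assumptions, not taken from the library:
sents = [
    [('John', 'NNP', set()),
     ('arrived', 'VBD', set()),
     ('yesterday', 'NN', set([Timex()]))],
]
doc.reconcile(sents)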
def test_assign_IDs(self):
    # Get some sample IDs
    ts = set([Timex(), Timex(), Timex()])
    add_timex_ids(ts)
    # Get the assigned IDs
    tids = set()
    for t in ts:
        tids.add(t.id)
    # Should be exactly 3 unique IDs
    self.assertEqual(len(tids), 3)
    # Should be consecutive
    self.assertTrue(1 in tids)
    self.assertTrue(2 in tids)
    self.assertTrue(3 in tids)
def get_extents(self):
    """
    Return a string in the format suitable for timex-extents.tab
    """
    # TIMEXes need unique IDs
    all_ts = set()
    for sent in self._sents:
        for (tok, pos, ts) in sent:
            for t in ts:
                all_ts.add(t)
    add_timex_ids(all_ts)
    s = ""
    for i in range(len(self._sents)):
        for j in range(len(self._sents[i])):
            for timex in self._sents[i][j][2]:
                s += self._get_timex_line(i, j, timex) + "\n"
    return s
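# Illustrative usage of get_extents(), assuming `doc` is an instance of the
# class above. Each timex contributes one line per token it covers; the
# exact column layout comes from _get_timex_line, which is not shown here:
with open('timex-extents.tab', 'w') as f:
    f.write(doc.get_extents())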
def test_assign_IDs_consecutive(self):
    # Get some sample IDs
    ts = set([Timex(), Timex(), Timex()])
    at = Timex()
    at.id = 2
    ts.add(at)
    add_timex_ids(ts)
    # Get the assigned IDs
    tids = set()
    for t in ts:
        tids.add(t.id)
    # Should be exactly 4 unique IDs, and the pre-assigned one hasn't changed
    self.assertEqual(len(tids), 4)
    self.assertEqual(2, at.id)
    # Should be consecutive for new ones
    self.assertTrue(1 in tids)
    self.assertTrue(2 in tids)
    self.assertTrue(3 in tids)
    self.assertTrue(4 in tids)
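# The two tests above pin down the contract of add_timex_ids: every timex
# ends up with a unique integer ID, pre-assigned IDs are left untouched, and
# new IDs fill the remaining slots consecutively starting from 1. A minimal
# sketch consistent with that contract (not necessarily the library's actual
# implementation; it assumes Timex.id defaults to None):
def add_timex_ids(timexes):
    # IDs already in use must not be handed out again
    used = set(t.id for t in timexes if t.id is not None)
    next_id = 1
    for t in timexes:
        if t.id is None:
            # Skip over IDs taken by pre-assigned timexes
            while next_id in used:
                next_id += 1
            t.id = next_id
            used.add(next_id)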
def reconcile(self, sents, add_S=False, add_LEX=False, pos_attr=False):
    """
    Reconciles this document against the new internal representation.

    If add_S is set to anything other than False, tags are added to
    indicate the sentence boundaries, with the tag name being the value
    of add_S. add_LEX is the same, but for marking token boundaries,
    and pos_attr is the name of the attribute which holds the POS tag
    for that token. This is mainly useful for transforming the TERN
    documents into something that GUTime can parse.

    If your document already contains S and LEX tags, and add_S/add_LEX
    is set to add them, the old S/LEX tags will be stripped first. If
    pos_attr is set and the attribute name differs from the old POS
    attribute name on the lex tag, then the old attribute will be
    removed.

    Sentence/token boundaries will not be altered in the final document
    unless add_S/add_LEX is set. If you have changed the token
    boundaries in the internal representation from the original form,
    but are not then adding them back in, reconciliation may give
    undefined results.

    There are some inputs which would produce invalid XML. For example,
    if this document has elements which span multiple sentences, but
    not whole parts of them, then you will be unable to add XML tags
    and get valid XML, so failure will occur in unexpected ways.

    If you are adding LEX tags, and your XML document contains tags
    internal to tokens, then reconciliation will fail, as it expects
    tokens to be in a continuous piece of whitespace.
    """
    # First, add S tags if need be.
    if add_S:
        # First, strip any old ones
        if self._has_S:
            self._strip_tags(self._xml_doc, self._has_S, self._xml_body)
        # Then add the new ones
        leftover = self._add_S_tags(self._xml_body, sents, add_S)
        if len(leftover) > 1:
            raise NestingError(
                'Unable to add all S tags, possibly due to bad tag nesting: '
                + str(leftover))
        # Update what we consider to be our S tags
        self._has_S = add_S

    # Now, get a list of the S nodes, which are used to reconcile
    # individual tokens
    if self._has_S:
        s_nodes = self._xml_body.getElementsByTagName(self._has_S)
    else:
        # There are no S tags in the text. So, going forward, only
        # consider there to be one sentence, which belongs to the root
        # node
        s_nodes = [self._xml_body]
        new_sent = []
        for sent in sents:
            for part in sent:
                new_sent.append(part)
        sents = [new_sent]

    # Now, add LEX tags if need be
    if add_LEX:
        # First, strip any old ones
        if self._has_LEX:
            self._strip_tags(self._xml_doc, self._has_LEX, self._xml_body)
        # Now add those LEX tokens
        for i in range(len(sents)):
            self._add_LEX_tags(s_nodes[i], sents[i], add_LEX)
        # Update what we consider to be our LEX tags
        self._has_LEX = add_LEX

    # Now, add the POS attribute
    if pos_attr and self._has_LEX:
        # Get each LEX tag and add the attribute
        for i in range(len(sents)):
            lex_tags = s_nodes[i].getElementsByTagName(self._has_LEX)
            for j in range(len(sents[i])):
                # Strip the existing attribute if need be
                try:
                    lex_tags[j].removeAttribute(self._pos_attr)
                except xml.dom.NotFoundErr:
                    pass
                # Now set the new POS attr
                lex_tags[j].setAttribute(pos_attr, sents[i][j][1])
        # Update what we think is the POS attr
        self._pos_attr = pos_attr

    # Strip old TIMEXes to avoid duplicates
    self.strip_timexes()

    # For XML documents, TIMEXes need unique IDs
    all_ts = set()
    for sent in sents:
        for (tok, pos, ts) in sent:
            for t in ts:
                all_ts.add(t)
    add_timex_ids(all_ts)

    # Now iterate over each sentence
    for i in range(len(sents)):
        # Get all timexes in this sentence
        timexes = set()
        for (word, pos, ts) in sents[i]:
            for t in ts:
                timexes.add(t)
        # Now, for each timex, add it to the sentence
        for timex in timexes:
            try:
                self._add_timex(timex, sents[i], s_nodes[i])
            except NestingError:
                LOGGER.exception("Error whilst attempting to add TIMEX")
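# A hedged usage sketch for the XML reconcile() above, e.g. when preparing a
# TERN document for GUTime as the docstring describes. The document class
# name, its constructor, and get_sents() are illustrative assumptions, not
# confirmed API:
doc = TernDocument(open('APW19980807.0261.sgml').read())
sents = doc.get_sents()  # annotate these with a recognition pipeline...
doc.reconcile(sents, add_S='s', add_LEX='lex', pos_attr='pos')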