def from_nersuite(cls, line):
    """Return Token given NERsuite format representation.

    The line must hold at least four TAB-separated fields, in the order
    tag, start offset, end offset, token text. Any further fields are
    handed to the constructor unchanged as a list.

    Args:
        line: One line of NERsuite-formatted text. A trailing line
            terminator (LF or CRLF) is ignored.

    Returns:
        The result of ``cls(text, start, tag, extra_fields)`` where
        ``start`` has been converted to int.

    Raises:
        FormatError: If there are fewer than four fields, if start or
            end is not an integer, or if ``end - start`` differs from
            the length of the token text.
    """
    # Strip CR as well as LF: with CRLF input a leftover carriage return
    # would stick to the last field and either trigger a bogus length
    # mismatch or end up inside the final extra field.
    line = line.rstrip('\r\n')
    fields = line.split('\t')

    try:
        tag, start, end, text = fields[:4]
    except ValueError:
        raise FormatError('NERsuite format: too few fields ("%s")' % line)

    try:
        start, end = int(start), int(end)
    except ValueError:
        raise FormatError('NERsuite format: non-int start/end ("%s")' % line)

    # The offsets must span exactly the token text.
    if end - start != len(text):
        raise FormatError('NERsuite format: length mismatch ("%s")' % line)

    return cls(text, start, tag, fields[4:])
def read_documents(flo, label, config=defaults):
    """Load documents from file-like object, return list of Document objects.

    Every line of the input is treated as one document: the line is
    stripped of surrounding whitespace, tokenized with
    ``tokenize(line, config)`` and passed with ``label`` to
    ``make_document``.

    Args:
        flo: Iterable of text lines, typically an open file.
        label: Label passed through to ``make_document`` for every line.
        config: Configuration passed through to ``tokenize``.

    Returns:
        List with one ``make_document`` result per input line, in input
        order.

    Raises:
        FormatError: If a line is empty after stripping whitespace.
    """
    documents = []
    for ln, line in enumerate(flo, start=1):
        line = line.strip()
        if not line:
            # Not every file-like object carries a ``name`` (StringIO
            # does not); fall back to a placeholder so the FormatError is
            # not masked by an AttributeError.
            source = getattr(flo, 'name', '<unknown>')
            raise FormatError('empty line {} in {}'.format(ln, source))
        token_texts = tokenize(line, config)
        documents.append(make_document(token_texts, label))
    return documents
def from_str(cls, string, discont_rule=None):
    """Parse a standoff textbound line and return an instance of cls.

    The string is split into three TAB-separated parts: an ID, a
    "TYPE OFFSETS" part (split on the first space), and the annotation
    text. Offsets are parsed with ``cls._parse_offsets``. If that yields
    anything other than exactly one span, the annotation is treated as
    discontinuous and ``cls._resolve_discontinuous`` is used to obtain
    the offsets and text to use.

    Args:
        string: The standoff line to parse.
        discont_rule: Passed through to ``cls._resolve_discontinuous``.

    Returns:
        ``cls(id_, type_, start, end, text)`` with its
        ``skip_validation`` attribute set to True only when the
        annotation was discontinuous and ``discont_rule`` is not
        ``LAST_SPAN``.

    Raises:
        FormatError: If any step of the parse raises ValueError.
    """
    try:
        id_, type_offsets, text = string.split('\t', 2)
        type_, offsets = type_offsets.split(' ', 1)
        offsets = cls._parse_offsets(offsets)

        was_discontinuous = len(offsets) != 1
        if was_discontinuous:
            offsets, text = cls._resolve_discontinuous(
                offsets, text, discont_rule)

        start, end = offsets[0]
        ann = cls(id_, type_, start, end, text)
        ann.skip_validation = (
            was_discontinuous and discont_rule != LAST_SPAN)
        return ann
    except ValueError:
        # The exception object itself is not needed; this form of the
        # clause is valid on both Python 2 and Python 3.
        raise FormatError('Standoff: failed to parse %s' % string)
def verify_textbounds(textbounds, text):
    """Verify that given textbounds are valid with reference to given text.

    Return True on success, raise FormatError on any issue.

    Textbounds whose ``skip_validation`` attribute is true are not
    checked. Instead their ``text`` attribute is overwritten with
    ``text[start:end]`` and a note is written to stderr. All other
    textbounds are checked with their ``is_valid(text)`` method.

    Args:
        textbounds: Iterable of textbound annotation objects.
        text: The document text the annotations refer to.

    Returns:
        True if no textbound failed verification.

    Raises:
        FormatError: If ``is_valid`` returns a false value or raises.
    """
    for t in textbounds:
        if t.skip_validation:
            # TODO fix: hack around the constraint that discontinuous
            # annotations don't have access to the full text
            span_text = text[t.start:t.end]
            sys.stderr.write(
                'Resolve discontinuous "%s" to full span "%s"\n'
                % (t.text, span_text))
            t.text = span_text
            continue

        # Explicit check rather than ``assert``: assertions are removed
        # when Python runs with -O, which would silently skip validation.
        failure = None
        try:
            if not t.is_valid(text):
                # No further detail is available for a plain false result.
                failure = ''
        except Exception as e:
            failure = e

        if failure is not None:
            s = u'Error verifying textbound %s: %s' % (t, failure)
            raise FormatError(s.encode('utf-8'))

    return True