class AnnotationSet(object):
    """A named collection of annotations attached to a document.

    Annotations are held in three indices:

    * ``_annots`` -- a dict mapping annotation ID to annotation;
    * ``_annotations_start`` / ``_annotations_end`` -- two ``SliceableTree``
      instances ordered by start and end offset respectively;
    * ``_annot_types`` -- a lazily built mapping from annotation type to a
      child ``AnnotationSet`` (``None`` until first needed).

    Every addition and removal is recorded by appending a command dict to
    ``logger``.  Sets produced by :meth:`restrict` share the logger of the
    set they were derived from.
    """

    def __init__(self, doc, values=None, name="", logger=None):
        """Create an annotation set over ``doc``.

        Args:
            doc: The owning document; must provide ``size()`` (and ``text``
                for diagnostics).
            values: Optional iterable of annotations to seed the set with.
                It is consumed in a single pass, so generators are fine.
            name: Name of this set, recorded in log entries.
            logger: List-like object that log entries are appended to.  A
                new list is created when omitted.

        Raises:
            InvalidOffsetException: If any seed annotation lies outside the
                document boundaries.
        """
        # A fresh list per instance: a shared mutable default would make
        # every logger-less set write into the same log.
        self.logger = [] if logger is None else logger
        self.name = name
        self.doc = doc

        # NOTE: the values are deliberately not de-duplicated through set()
        # here; that step was disabled because it was expensive.  Keying by
        # ID below still collapses annotations that share an ID.
        self._annots = {}
        for value in (() if values is None else values):
            # Ensure that there are no invalid annotations
            self._check_offsets(value)
            self._annots[value.id] = value

        self._annot_types = None  # Will only be populated when needed
        self._annotations_start = None
        self._annotations_end = None
        self._index_by_offset()

    def restrict(self, annotations):
        """Return a new set with this set's doc, name and logger, holding ``annotations``.

        The given annotations are not checked for membership of this set;
        that check was disabled so that set intersections can be expressed
        through this method.
        """
        return AnnotationSet(self.doc,
                             values=annotations,
                             name=self.name,
                             logger=self.logger)

    def _index_by_offset(self):
        """(Re)generate the two offset indices from ``_annots``.

        One tree is ordered by start offset and one by end offset.
        """
        def compare_start(a, b):
            return a.start - b.start

        def compare_end(a, b):
            return a.end - b.end

        # values() rather than itervalues() so this runs on Python 2 and 3.
        self._annotations_start = SliceableTree(self._annots.values(),
                                                compare=compare_start)
        self._annotations_end = SliceableTree(self._annots.values(),
                                              compare=compare_end)

    def _index_by_type(self):
        """Generate the type index.

        Only call this when types are first needed: it is relatively
        expensive and cannot be used from ``__init__``.
        """
        self._annot_types = defaultdict(lambda: self.restrict([]))
        for annotation in self._annots.values():
            # The annotation is already in this set, so indexing it must not
            # produce another ADD_ANNOT log entry.
            self._annot_types[annotation.type].append(annotation, log=False)

    def __len__(self):
        """Return the number of annotations in the set."""
        return len(self._annots)

    # Just calls __len__.  This exists to make the API more like GATE.
    size = __len__

    def _check_offsets(self, annotation):
        """Check the annotation's offsets against the document boundaries.

        Returns:
            The annotation itself when its offsets are valid.

        Raises:
            InvalidOffsetException: If either offset is negative, the end
                precedes the start, or either offset exceeds ``doc.size()``.
        """
        doc_size = self.doc.size()

        if annotation.start < 0:
            raise InvalidOffsetException("Annotation starts before 0")
        if annotation.end < 0:
            raise InvalidOffsetException("Annotation ends before 0")
        if annotation.start > annotation.end:
            raise InvalidOffsetException("Annotation ends before it starts")
        if annotation.start > doc_size:
            # Diagnostic dump; written with write() so that it behaves the
            # same on Python 2 and Python 3.
            sys.stderr.write("%s %s %s %s\n" % (annotation.start, doc_size,
                                                self.doc.text, annotation))
            raise InvalidOffsetException("Annotation starts after document ends")
        if annotation.end > doc_size:
            raise InvalidOffsetException("Annotation ends after document ends")

        return annotation

    def append(self, annotation, check_offsets=True, log=True):
        """Append an annotation to the annotation set.

        Do not try to add annotations from another annotation set, as one
        annotation can belong to only one set, or a child of that set.

        Args:
            annotation: The annotation to add.  If its ``id`` is ``None`` a
                new ID is assigned (one more than the current maximum, or 1
                for an empty set).
            check_offsets: Validate the offsets against the document first.
            log: Record an ``ADD_ANNOT`` entry in the logger.

        Returns:
            The annotation, or ``None`` if an annotation with the same ID is
            already present.

        Raises:
            InvalidOffsetException: If ``check_offsets`` is true and the
                annotation is out of range.
        """
        if annotation.id is None:
            # Populate the annotation ID if there is none.
            annotation.id = max(self._annots) + 1 if self._annots else 1
        elif annotation.id in self._annots:
            # Prevents duplicate annotations (including one with ID 0).
            return None

        if check_offsets:
            # Will raise an exception if the annotation is out of range
            self._check_offsets(annotation)

        # Log the new annotation
        if log:
            self.logger.append({
                "command": "ADD_ANNOT",
                "annotationSet": self.name,
                "startOffset": annotation.start,
                "endOffset": annotation.end,
                "annotationName": annotation.type,
                "featureMap": annotation.features,
                "annotationID": annotation.id,
            })

        # Add the annotation to the required indices
        self._annots[annotation.id] = annotation
        self._annotations_start.insert(annotation)
        self._annotations_end.insert(annotation)

        # "is not None" rather than truthiness: an index that has been built
        # but is still empty must be kept up to date as well.
        if self._annot_types is not None:
            # The per-type child set shares this logger, so it must not log
            # the same addition a second time.
            self._annot_types[annotation.type].append(annotation,
                                                      check_offsets,
                                                      log=False)

        return annotation

    def add(self, start, end, annotType, features, _id=None, check_offsets=True):
        """Create a new annotation from the given values and append it.

        Returns:
            Whatever :meth:`append` returns for the new annotation.
        """
        annotation = Annotation(self.logger, self, _id, annotType,
                                start, end, features)
        return self.append(annotation, check_offsets)

    def remove(self, annotation, log=True):
        """Remove the selected annotation from every index.

        Args:
            annotation: The annotation to remove.
            log: Record a ``REMOVE_ANNOT`` entry in the logger.

        Raises:
            KeyError: If no annotation with that ID is in the set.  Nothing
                is logged in that case.
        """
        if annotation.id not in self._annots:
            raise KeyError(annotation.id)

        if log:
            self.logger.append({
                "command": "REMOVE_ANNOT",
                "annotationSet": self.name,
                "annotationID": annotation.id,
            })

        del self._annots[annotation.id]
        self._annotations_start.remove(annotation)
        self._annotations_end.remove(annotation)

        if self._annot_types is not None:
            # Shared logger: the child set must not log the removal again.
            self._annot_types[annotation.type].remove(annotation, log=False)

    def __iter__(self):
        """Iterate over the start-offset index (document order)."""
        return iter(self._annotations_start)

    def __getitem__(self, key):
        """Get annotations of the given type; same as :meth:`type`."""
        return self.type(key)

    def byID(self, key):
        """Get the annotation with the given ID.

        Raises:
            KeyError: If there is no annotation with that ID.
        """
        return self._annots[key]

    def type(self, annotType):
        """Get the annotations of the specified type as an annotation set.

        Passing ``None`` returns a copy of the whole set.
        """
        # Index the types the first time this is called
        if self._annot_types is None:
            self._index_by_type()

        if annotType is None:
            return self.restrict(self)
        return self._annot_types[annotType]

    def typeNames(self):
        """Get the names of all types in this set as a list."""
        if self._annot_types is None:
            self._index_by_type()
        return list(self._annot_types)

    def types(self):
        """Return the dictionary index of annotation types in this set."""
        if self._annot_types is None:
            self._index_by_type()
        return self._annot_types

    def at(self, offset):
        """Get all annotations at the given offset (empty if none)."""
        result = self._annotations_start[I(offset)]
        return self.restrict(result)

    def firstAfter(self, offset):
        """Get all annotations at the first valid position after the offset."""
        result = self._annotations_start.nearest_after(I(offset))
        return self.restrict(result)

    @support_single
    @support_annotation
    def overlapping(self, left, right):
        """Get annotations overlapping with the two points."""
        result = self._annotations_start[I(left):I(right)]
        # Must not end at the left offset.
        result += self._annotations_end[I(left + 1):I(max(right, left + 1))]
        return self.restrict(result)

    @support_single
    @support_annotation
    def covering(self, left, right):
        """Get annotations that completely cover the span given."""
        result = set(self._annotations_start[I(0):I(left)])
        result.intersection_update(
            set(self._annotations_end[I(right):I(self.doc.size())]))
        return self.restrict(result)

    @support_annotation
    def within(self, left, right):
        """Get annotations that fall completely within ``left`` and ``right``."""
        result = set(self._annotations_start[I(left):I(right)])
        result.intersection_update(
            set(self._annotations_end[I(left):I(right)]))
        return self.restrict(result)

    def after(self, offset):
        """Get annotations that start after the given offset."""
        return self.restrict(
            self._annotations_start[I(offset):I(self.doc.size())])

    def before(self, offset):
        """Get annotations that start before the given offset."""
        return self.restrict(self._annotations_start[I(0):I(offset)])

    def first(self):
        """Get the first annotation in the set (minimum of the start index)."""
        return self._annotations_start.min()

    def last(self):
        """Get the last annotation in the set (maximum of the start index)."""
        return self._annotations_start.max()

    def __contains__(self, value):
        """Provide ``annotation in annotation_set`` functionality.

        Objects with an ``id`` attribute are treated as annotations: their
        ID must be present and the object must equal one of the stored
        annotations.  Anything else is treated as a raw annotation ID.
        """
        if hasattr(value, "id"):
            # Annotations have ids, so check those instead.
            return value.id in self._annots and value in self._annots.values()
        # On the off chance someone passed an ID in directly
        return value in self._annots

    contains = __contains__

    def __repr__(self):
        """Return the repr of the annotations listed in iteration order."""
        return repr(list(self))