class EmptyTreeTestCase(unittest.TestCase):
    """Smoke tests: every query API on a freshly-created, empty
    IntervalTree must run without raising."""

    def setUp(self):
        # A brand-new, empty tree for each test method.
        self.tree = IntervalTree()

    def test_search(self):
        # Range search on an empty tree: no hits, no exception.
        self.tree.search(46, 47)

    def test_find(self):
        # Interval lookup on an empty tree must not raise.
        self.tree.find(Interval(46, 47))

    def test_left(self):
        # Left-neighbor query on an empty tree must not raise.
        self.tree.left(Interval(46, 47))

    def test_right(self):
        # Right-neighbor query on an empty tree must not raise.
        self.tree.right(Interval(46, 47))
def test_tree_pickle(self):
    """Round-trip an IntervalTree of Feature objects through dump()/load().

    Builds overlapping features across several chromosomes, pickles the
    tree to a temporary file, reloads it, and verifies both trees answer
    every query identically (same hit count, same first/last start).
    """
    import os
    import tempfile

    a = IntervalTree()
    for ichr in range(5):
        for i in range(10, 100, 6):
            f = Feature(i - 4, i + 4, strand=1, chr=ichr)
            a.insert(f)

    # Fix: the original dumped to a hard-coded 'a.pkl' in the working
    # directory and never removed it, littering the filesystem on every
    # run (and especially on failures).  Use a real temp file and always
    # clean up.
    fd, path = tempfile.mkstemp(suffix='.pkl')
    os.close(fd)
    try:
        a.dump(path)
        b = IntervalTree()
        b.load(path)
    finally:
        os.unlink(path)

    for ichr in range(5):
        for i in range(10, 100, 6):
            f = Feature(i - 4, i + 4, strand=1, chr=ichr)
            af = sorted(a.find(f), key=operator.attrgetter('start'))
            bf = sorted(b.find(f), key=operator.attrgetter('start'))
            assert len(bf) > 0
            self.assertEqual(len(af), len(bf))
            self.assertEqual(af[0].start, bf[0].start)
            self.assertEqual(af[-1].start, bf[-1].start)
def test_tree_pickle(self):
    """Round-trip an IntervalTree of plain Interval objects through
    dump()/load().

    Mirrors the Feature-based pickle test: build, pickle to a temp file,
    reload, and check both trees return identical results per query.
    """
    import os
    import tempfile

    a = IntervalTree()
    for ichr in range(5):
        for i in range(10, 100, 6):
            f = Interval(i - 4, i + 4)
            a.insert(f)

    # Fix: the original wrote a hard-coded 'a.pkl' into the working
    # directory and never deleted it.  Use a temp file and guarantee
    # cleanup even when an assertion fails.
    fd, path = tempfile.mkstemp(suffix='.pkl')
    os.close(fd)
    try:
        a.dump(path)
        b = IntervalTree()
        b.load(path)
    finally:
        os.unlink(path)

    for ichr in range(5):
        for i in range(10, 100, 6):
            f = Interval(i - 4, i + 4)
            af = sorted(a.find(f), key=operator.attrgetter('start'))
            bf = sorted(b.find(f), key=operator.attrgetter('start'))
            assert len(bf) > 0
            self.assertEqual(len(af), len(bf))
            self.assertEqual(af[0].start, bf[0].start)
            self.assertEqual(af[-1].start, bf[-1].start)
def _create_intervaltree(locs):
    """Build an IntervalTree from a two-column (start, end) table.

    Iterates the rows of *locs* (anything exposing ``iterrows()`` that
    yields ``(key, (start, end))``, e.g. a two-column DataFrame).  A row
    whose span overlaps an interval already in the tree is skipped, so
    the result holds only non-overlapping entries — first row wins.
    """
    tree = IntervalTree()
    for key, (start, end) in locs.iterrows():
        # Only insert spans that do not collide with an existing one.
        if not tree.find(start, end):
            tree.add(start, end, key)
    return tree
class Document(HString):
    """A text document owning a set of interval-indexed annotations.

    The document is itself an HString spanning all of its content.
    Annotations are held in an IntervalTree keyed by character offsets
    for range queries, and in a dict keyed by annotation id for O(1)
    id lookup.
    """

    def __init__(self, content, doc_id=None, language=lng.ENGLISH,
                 preprocessors=None):
        """Create a document from raw text.

        Fix: the original default was ``doc_id=rand_id()``, which Python
        evaluates ONCE at class-definition time — every Document created
        without an explicit doc_id silently shared the same id.  ``None``
        defers id generation to construction time (the body below already
        handles the None case).
        """
        # NOTE(review): `self` is also passed explicitly here, so
        # HString.__init__ presumably takes (document, start, end) —
        # confirm against HString's signature.
        super().__init__(self, 0, len(content))
        self._content = preprocess(content, preprocessors) if preprocessors else content
        self._annotations = IntervalTree()  # Interval(start, end, Annotation)
        self._doc_id = rand_id(10) if doc_id is None else doc_id
        self._completed = {}   # annotator name -> version string
        self._next_id = 0      # next annotation id to hand out
        self[LANGUAGE] = language
        self._aid_dict = {}    # annotation id -> Annotation

    @property
    def content(self) -> str:
        """The (possibly preprocessed) text of the document."""
        return self._content

    @property
    def doc_id(self):
        """Identifier of this document."""
        return self._doc_id

    def annotation(self, annotation_type, start=None,
                   end=None) -> typing.List[Annotation]:
        """Return annotations of *annotation_type* overlapping [start, end).

        When either bound is omitted the whole document is searched.  A
        falsy annotation_type matches every type.  The document's own span
        is always excluded.  Returns [] if the underlying lookup fails.
        """
        try:
            if end is None or start is None:
                anno_iter = self._annotations.find(Interval(0, self.end))
            else:
                anno_iter = filter(
                    lambda x: x.data.overlaps(Span(start, end)),
                    self._annotations.find(Interval(start, end)))
        except Exception:
            # Fix: was a bare `except:` (also swallowed KeyboardInterrupt /
            # SystemExit).  Deliberately best-effort: yield no annotations
            # instead of propagating.
            return []
        if annotation_type:
            annotation_type = annotation_type.lower()
            return sorted([
                x.data for x in anno_iter
                if x.data.annotation_type.lower() == annotation_type
                and x.data != self
            ])
        return sorted([x.data for x in anno_iter if x.data != self])

    def annotation_by_id(self, annotation_id: int):
        """Return the annotation with the given id, or None if unknown."""
        return self._aid_dict.get(annotation_id)

    def previous_annotation(self, annotation: Annotation,
                            annotation_type: str = None) -> 'Annotation':
        """Return the closest annotation of the given type before
        *annotation*, or an empty detached Annotation when none exists."""
        if not annotation_type:
            annotation_type = annotation.annotation_type
        a = self.annotation(annotation_type, start=-1, end=annotation.start)
        if len(a) == 0:
            return Annotation(None, 0, 0, annotation_type, [])
        return a[-1]

    def next_annotation(self, annotation: Annotation,
                        annotation_type: str = None) -> 'Annotation':
        """Return the closest annotation of the given type after
        *annotation*, or an empty detached Annotation when none exists."""
        if not annotation_type:
            annotation_type = annotation.annotation_type
        a = self.annotation(annotation_type, start=annotation.end,
                            end=self.end)
        if len(a) == 0:
            return Annotation(None, 0, 0, annotation_type, [])
        return a[0]

    def create_annotation(self, type: str, start: int, end: int,
                          attributes=None) -> Annotation:
        """Create, register and return a new annotation over [start, end).

        The parameter name ``type`` shadows the builtin but is kept for
        backward compatibility with keyword callers.
        """
        if attributes is None:
            attributes = []
        annotation = Annotation(self, start, end, type, attributes,
                                self._next_id)
        self._next_id += 1
        self._annotations.insert(
            Interval(annotation.start, annotation.end, annotation))
        self._aid_dict[annotation.annotation_id] = annotation
        return annotation

    def annotate(self, *args):
        """Run the language's annotator for each requested annotation type.

        Types already recorded in ``_completed`` are skipped.  Raises
        Exception when the language has no annotator for a requested type.
        """
        for arg in args:
            if arg in self._completed:
                continue
            self.language().load()
            annotator = self.language().get_annotator(arg)
            if annotator:
                annotator.annotate(self)
                self._completed[arg] = '1.0'
            else:
                raise Exception("No annotator for {} annotations in {}".format(
                    arg, self.language()))

    def language(self):
        """Return the document's LANGUAGE attribute, or lng.UNKNOWN."""
        if LANGUAGE in self.attributes:
            return self.attributes[LANGUAGE]
        return lng.UNKNOWN

    @staticmethod
    def from_spacy(parsed):
        """Convert a spaCy Doc into a Document with token, entity,
        sentence and phrase-chunk annotations.

        Fix: the original built the document but never returned it,
        making the factory useless to callers.
        """
        document = Document(content=str(parsed))
        for token in parsed:
            if token.lemma_.strip() != "":
                t = document.create_annotation(
                    "token", token.idx, token.idx + len(token),
                    [(type.INDEX, token.i), (type.LEMMA, token.lemma_),
                     ("prob", token.prob),
                     (type.PART_OF_SPEECH, PartOfSpeech.of(token.tag_))])
                # A token that heads itself is the parse root: no relation.
                if token.head is token:
                    head_idx = None
                else:
                    head_idx = token.head.i
                # NOTE(review): index 0 is falsy, so a dependency whose head
                # is the first token is dropped here — confirm intended.
                if head_idx:
                    t.add_relation(target=head_idx, type="dep",
                                   relation=token.dep_)
        for entity in parsed.ents:
            document.create_annotation(type.ENTITY, entity.start_char,
                                       entity.end_char,
                                       [(type.ENTITY_TYPE, entity.label_)])
        for i, sentence in enumerate(parsed.sents):
            document.create_annotation(type.SENTENCE, sentence.start_char,
                                       sentence.end_char, [(type.INDEX, i)])
        for np in parsed.noun_chunks:
            document.create_annotation(
                type.PHRASE_CHUNK, np.start_char, np.end_char,
                [(type.PART_OF_SPEECH, PennTreebank.NP)])
        return document

    @staticmethod
    def from_json(json_str):
        """Deserialize a Document from its JSON string form."""
        doc = Document(content='')
        doc.__read_json(json.loads(json_str))
        return doc

    def __getstate__(self):
        # Pickle via the plain-dict form (see to_dict).
        return self.to_dict()

    def __setstate__(self, state):
        self.__read_json(state)

    def __read_json(self, obj):
        """(Re)initialize this document in place from a decoded JSON dict."""
        self.__init__(content=obj['content'])
        self._doc_id = obj.get('id', self._doc_id)
        for (k, v) in obj.get("attributes", {}).items():
            self[k] = get_decoder(k)(v)
        for (k, v) in obj.get('completed', {}).items():
            self._completed[k] = v
        max_id = -1
        for annotation in obj.get("annotations", []):
            ann = Annotation(
                document=self,
                start=annotation["start"],
                end=annotation["end"],
                annotation_type=annotation["type"],
                attributes=[
                    (k, get_decoder(k)(v))
                    for k, v in annotation.get("attributes", {}).items()
                ],
                annotation_id=annotation["id"])
            max_id = max(max_id, ann.annotation_id)
            # NOTE(review): create_annotation uses tree.insert(Interval(...))
            # while this path uses tree.add(start, end, data) — confirm the
            # IntervalTree exposes both with identical semantics.
            self._annotations.add(ann.start, ann.end, ann)
            # Fix: loaded annotations were never registered in _aid_dict,
            # so annotation_by_id() failed for deserialized documents
            # (create_annotation registers explicitly, so Annotation does
            # not self-register).
            self._aid_dict[ann.annotation_id] = ann
            for rel in annotation.get("relations", []):
                ann.add_relation(target=rel["target"], type=rel["type"],
                                 relation=rel["value"])
        self.language().load()
        self._next_id = max_id + 1

    def to_json(self) -> str:
        """Serialize to a JSON string (schema described by to_dict)."""
        return json.dumps(self.to_dict(), default=default)

    def to_dict(self) -> typing.Dict[str, typing.Any]:
        """Plain-dict form: id, content, attributes, completed, annotations."""
        return dict([
            ("id", self._doc_id),
            ("content", self.content),
            ("attributes", self._attributes),
            ("completed", self._completed),
            ("annotations",
             [a.as_dict() for a in self.annotation(annotation_type=None)])
        ])