def test_contained_entities(self):
    """Several adjacent entities inside sentences 2..8 collapse them into one sentence."""
    input_sentences = [
        Sentence(0, 2), Sentence(2, 3), Sentence(3, 5),
        Sentence(5, 8), Sentence(8, 10), Sentence(10, 15),
    ]
    ents = [
        Entity('', 2, 6, 'test'),
        Entity('', 6, 7, 'test'),
        Entity('', 7, 8, 'test'),
    ]
    expected = [Sentence(0, 2), Sentence(2, 8), Sentence(8, 10), Sentence(10, 15)]
    actual = adjust_sentences(input_sentences, ents)
    self.assertEqual(expected, actual)
def test_two_entities_separated(self):
    """Two non-adjacent boundary-crossing entities each merge only their own sentences."""
    input_sentences = [
        Sentence(0, 2), Sentence(2, 3), Sentence(3, 5),
        Sentence(5, 8), Sentence(8, 10), Sentence(10, 15),
    ]
    ents = [
        Entity('', 2, 4, 'test'),
        Entity('', 9, 11, 'test'),
    ]
    expected = [Sentence(0, 2), Sentence(2, 5), Sentence(5, 8), Sentence(8, 15)]
    actual = adjust_sentences(input_sentences, ents)
    self.assertEqual(expected, actual)
def test_multi_sentence(self):
    """A single entity spanning tokens 2..9 merges every sentence it touches."""
    input_sentences = [
        Sentence(0, 2), Sentence(2, 3), Sentence(3, 5),
        Sentence(5, 8), Sentence(8, 10), Sentence(10, 15),
    ]
    ents = [Entity('', 2, 9, 'test')]
    expected = [Sentence(0, 2), Sentence(2, 10), Sentence(10, 15)]
    actual = adjust_sentences(input_sentences, ents)
    self.assertEqual(expected, actual)
def test_one_entity(self):
    """One entity crossing a single sentence boundary merges exactly two sentences."""
    input_sentences = [Sentence(0, 2), Sentence(2, 3), Sentence(3, 5)]
    ents = [Entity('', 2, 4, 'test')]
    expected = [Sentence(0, 2), Sentence(2, 5)]
    actual = adjust_sentences(input_sentences, ents)
    self.assertEqual(expected, actual)
def _merge(raw_tokens: list, sentences: list, raw_paragraphs: list, raw_entities: list, raw_relations: list, *, symmetric_types: set = None) -> Tuple[List[Sentence], List[Paragraph], List[Entity], Set[Relation]]:
    """
    Align entities to tokens, widen sentences so no entity is split, and
    group the adjusted sentences into paragraphs.

    :param raw_tokens: list of tuples: (start, end, text)
    :param sentences: list of Sentence objects
    :param raw_paragraphs: list of tuples: (start, end)
    :param raw_entities: list of dicts: {'id', 'type', 'start', 'end'}
    :param raw_relations: list of dicts: {'type', 'first', 'second'}
    :param symmetric_types: relation types treated as symmetric, forwarded
        to _get_relations (default None)
    :return: (sentences, paragraphs, entities, relations)
    """
    paragraphs = []
    # Index of the raw paragraph whose end we are currently looking for.
    cur_par_idx = 0
    # Sentence index at which the paragraph being built started.
    par_start = 0
    entities = sorted(align_raw_entities(raw_entities, raw_tokens))
    # Lookup by entity id, used to resolve relation endpoints below.
    entities_dict = {ent.id: ent for ent in entities}
    # Merge sentences so every entity lies inside a single sentence.
    sentences = adjust_sentences(sentences, entities)
    for i, sentence in enumerate(sentences):
        for token in raw_tokens[sentence.start_token: sentence.end_token]:
            # Close the current paragraph at sentence i when a token marks the
            # end of the text or of the current raw paragraph. The
            # `par_start != i + 1` guard keeps a paragraph from being emitted
            # twice for the same sentence (several tokens may match).
            if par_start != i + 1 and (_end_of_text(sentences, raw_tokens, sentence, token, i) or _end_of_paragraph(raw_paragraphs, cur_par_idx, token)):
                paragraphs.append(Paragraph(par_start, i + 1))
                par_start = i + 1
                cur_par_idx += 1
    return sentences, paragraphs, entities, _get_relations(raw_relations, entities_dict, symmetric_types)
def _get_docs(self, raw_docs: Dict[str, List[dict]], groups: Dict[str, list]) -> Dict[str, Document]:
    """
    Build Document objects from raw per-document token dicts.

    :param raw_docs: doc id -> list of raw token dicts; each dict has keys
        'token', 'lemma', 'gram', 'shift' and optionally the speech-related
        keys 'speech', 'said', 'author_comment', 'speech_verb'
    :param groups: entity grouping info forwarded to self._get_entities
    :return: doc id -> Document
    """
    docs = {}
    for doc_id, raw_tokens in raw_docs.items():
        tokens = []
        token_features = {}
        sentences = []
        sent_start = 0
        # Maps a raw token's character shift to its token index, so that
        # entity offsets can later be resolved to token positions.
        shift2idx = {}
        for i, raw_token in enumerate(raw_tokens):
            tokens.append(raw_token['token'])
            token_features.setdefault('lemma', []).append(raw_token['lemma'])
            token_features.setdefault('gram', []).append(raw_token['gram'])
            if "speech" in raw_token:
                # Speech annotation keys come as a group; presence of
                # 'speech' implies the other three are present as well.
                token_features.setdefault("speech", []).append(raw_token['speech'])
                token_features.setdefault("said", []).append(raw_token['said'])
                token_features.setdefault("author_comment", []).append(
                    raw_token['author_comment'])
                token_features.setdefault("speech_verb", []).append(
                    raw_token['speech_verb'])
            shift2idx[raw_token['shift']] = i
            # 'SENT' grammar tag marks a sentence terminator.
            if raw_token['gram'] == 'SENT':
                sentences.append(Sentence(sent_start, i + 1))
                sent_start = i + 1
        # Close a trailing sentence when the last token was not tagged 'SENT'.
        # The `not sentences` case guards against an IndexError when the
        # document contains no 'SENT' tag at all; an empty document yields
        # no sentences.
        if tokens and (not sentences or sentences[-1].end_token != len(tokens)):
            sentences.append(Sentence(sent_start, len(tokens)))
        entities = self._get_entities(groups, shift2idx, doc_id)
        sentences = adjust_sentences(sentences, entities)
        doc = Document(doc_id, tokens, sentences, [Paragraph(0, len(sentences))],
                       entities, token_features=token_features)
        docs[doc_id] = doc
    return docs
def test_no_entities(self):
    """With no entities the sentence list comes back unchanged."""
    input_sentences = [Sentence(0, 2), Sentence(2, 3)]
    actual = adjust_sentences(input_sentences, [])
    self.assertEqual(input_sentences, actual)