Пример #1
0
    def test_contained_entities(self):
        """Check merging when later entities fall inside an already merged span.

        Six consecutive sentences are adjusted against three entities whose
        spans are (2, 6), (6, 7) and (7, 8).  The expected outcome replaces
        the sentences (2, 3), (3, 5) and (5, 8) with one ``Sentence(2, 8)``
        and leaves every other sentence as it was.
        """
        sentence_bounds = [(0, 2), (2, 3), (3, 5), (5, 8), (8, 10), (10, 15)]
        entity_bounds = [(2, 6), (6, 7), (7, 8)]
        merged_bounds = [(0, 2), (2, 8), (8, 10), (10, 15)]

        sentences = [Sentence(begin, end) for begin, end in sentence_bounds]
        entities = [Entity('', begin, end, 'test') for begin, end in entity_bounds]
        expected = [Sentence(begin, end) for begin, end in merged_bounds]

        actual = adjust_sentences(sentences, entities)

        self.assertEqual(expected, actual)
Пример #2
0
    def test_two_entities_separated(self):
        """Check that two distant entities each trigger their own merge.

        The entity with span (2, 4) is expected to fuse sentences (2, 3) and
        (3, 5) into ``Sentence(2, 5)``; the entity with span (9, 11) is
        expected to fuse (8, 10) and (10, 15) into ``Sentence(8, 15)``.
        ``Sentence(5, 8)``, which lies between them, must stay untouched.
        """
        sentences = [
            Sentence(0, 2),
            Sentence(2, 3),
            Sentence(3, 5),
            Sentence(5, 8),
            Sentence(8, 10),
            Sentence(10, 15),
        ]
        first_entity = Entity('', 2, 4, 'test')
        second_entity = Entity('', 9, 11, 'test')

        expected = [Sentence(0, 2), Sentence(2, 5), Sentence(5, 8), Sentence(8, 15)]

        self.assertEqual(expected, adjust_sentences(sentences, [first_entity, second_entity]))
Пример #3
0
    def test_multi_sentence(self):
        """Check that one long entity collapses every sentence it touches.

        A single entity with span (2, 9) is adjusted against six consecutive
        sentences; the sentences (2, 3), (3, 5), (5, 8) and (8, 10) are
        expected to become one ``Sentence(2, 10)``.
        """
        original_bounds = ((0, 2), (2, 3), (3, 5), (5, 8), (8, 10), (10, 15))
        sentences = [Sentence(*bounds) for bounds in original_bounds]
        long_entity = Entity('', 2, 9, 'test')

        result = adjust_sentences(sentences, [long_entity])

        wanted = [Sentence(0, 2), Sentence(2, 10), Sentence(10, 15)]
        self.assertEqual(wanted, result)
Пример #4
0
    def test_one_entity(self):
        """Check the minimal merge: one entity crossing one sentence boundary.

        With sentences (0, 2), (2, 3), (3, 5) and an entity spanning (2, 4),
        the last two sentences are expected to be joined into
        ``Sentence(2, 5)``.
        """
        untouched = Sentence(0, 2)
        sentences = [untouched, Sentence(2, 3), Sentence(3, 5)]
        entities = [Entity('', 2, 4, 'test')]

        expected = [Sentence(0, 2), Sentence(2, 5)]
        outcome = adjust_sentences(sentences, entities)

        self.assertEqual(expected, outcome)
Пример #5
0
def _merge(raw_tokens: list, sentences: list, raw_paragraphs: list, raw_entities: list, raw_relations: list, *,
           symmetric_types: set = None) -> Tuple[List[Sentence], List[Paragraph], List[Entity], Set[Relation]]:
    """
    Combine the raw annotation layers into sentences, paragraphs, entities and relations.

    Entities are aligned to tokens and sorted, sentences are adjusted against
    those entities, and paragraphs are then rebuilt as ranges of indices into
    the adjusted sentence list.

    :param raw_tokens: list of tuples: (start, end, text)
    :param sentences: list of Sentence objects
    :param raw_paragraphs: list of tuples: (start, end)
    :param raw_entities: list of dicts: {'id', 'type', 'start', 'end'}
    :param raw_relations: list of dicts: {'type', 'first', 'second'}
    :param symmetric_types: passed through unchanged to ``_get_relations``
    :return: tuple (adjusted sentences, paragraphs, sorted entities, relations)
    """
    entities = list(align_raw_entities(raw_entities, raw_tokens))
    entities.sort()

    # Lookup of entities by id; handed to _get_relations below.
    entity_by_id = {}
    for entity in entities:
        entity_by_id[entity.id] = entity

    sentences = adjust_sentences(sentences, entities)

    paragraphs = []
    par_start = 0      # index of the first sentence of the paragraph being built
    raw_par_idx = 0    # position in raw_paragraphs that is currently being matched

    for sent_idx, sentence in enumerate(sentences):
        next_par_start = sent_idx + 1
        for token in raw_tokens[sentence.start_token: sentence.end_token]:
            if par_start == next_par_start:
                # A paragraph was already closed at this sentence, so none of
                # its remaining tokens can close another one.
                break

            closes_paragraph = (_end_of_text(sentences, raw_tokens, sentence, token, sent_idx)
                                or _end_of_paragraph(raw_paragraphs, raw_par_idx, token))
            if closes_paragraph:
                paragraphs.append(Paragraph(par_start, next_par_start))
                par_start = next_par_start
                raw_par_idx += 1

    relations = _get_relations(raw_relations, entity_by_id, symmetric_types)
    return sentences, paragraphs, entities, relations
Пример #6
0
    def _get_docs(self, raw_docs: Dict[str, List[dict]],
                  groups: Dict[str, list]) -> Dict[str, Document]:
        """Build a ``Document`` for every entry of ``raw_docs``.

        Each raw token is a dict that must provide the keys ``'token'``,
        ``'lemma'``, ``'gram'`` and ``'shift'``.  When a token also has the
        key ``'speech'`` it must provide ``'said'``, ``'author_comment'`` and
        ``'speech_verb'`` as well; these values are collected as additional
        token features.

        A sentence is closed after every token whose ``'gram'`` equals
        ``'SENT'``.  Tokens left over after the last such token form one
        final sentence.  Sentences are then passed through
        ``adjust_sentences`` together with the document entities, and the
        whole document is wrapped into a single paragraph.

        :param raw_docs: mapping of document id to its list of raw token dicts
        :param groups: forwarded unchanged to ``self._get_entities``
        :return: mapping of document id to the constructed ``Document``
        :raises KeyError: if a raw token lacks one of the required keys
        """
        docs = {}
        for doc_id, raw_tokens in raw_docs.items():
            tokens = []
            token_features = {}
            sentences = []
            sent_start = 0
            # Maps a token's 'shift' value to its index in ``tokens``.
            shift2idx = {}

            for i, raw_token in enumerate(raw_tokens):
                tokens.append(raw_token['token'])
                for name in ('lemma', 'gram'):
                    token_features.setdefault(name, []).append(raw_token[name])
                if "speech" in raw_token:
                    for name in ("speech", "said", "author_comment", "speech_verb"):
                        token_features.setdefault(name, []).append(raw_token[name])
                shift2idx[raw_token['shift']] = i

                if raw_token['gram'] == 'SENT':
                    sentences.append(Sentence(sent_start, i + 1))
                    sent_start = i + 1

            # ``sent_start`` always equals the end of the last closed sentence
            # (or 0 when none was closed).  Comparing it with the token count
            # instead of reading ``sentences[-1]`` avoids an IndexError for
            # documents that contain no 'SENT' token, and adds no zero-length
            # sentence for documents without tokens.
            if sent_start < len(tokens):
                sentences.append(Sentence(sent_start, len(tokens)))

            entities = self._get_entities(groups, shift2idx, doc_id)
            sentences = adjust_sentences(sentences, entities)

            doc = Document(doc_id,
                           tokens,
                           sentences, [Paragraph(0, len(sentences))],
                           entities,
                           token_features=token_features)
            docs[doc_id] = doc

        return docs
Пример #7
0
 def test_no_entities(self):
     """Check that an empty entity list yields sentences equal to the input."""
     original = [Sentence(0, 2), Sentence(2, 3)]
     no_entities = []

     adjusted = adjust_sentences(original, no_entities)

     self.assertEqual(original, adjusted)