Example #1
    def test_collapsement_of_same_spans(self):
        tokens = ["Elon", "Musk", "is", "CEO", "of", "Tesla", "."]
        sentences = [Sentence(0, 7)]
        entities = [
            Entity("_", 0, 2, "ELON"),
            Entity("_", 0, 2, "MUSK"),
            Entity("_", 5, 6, "COMP"),
            Entity("_", 5, 6, "ORG")
        ]

        input_doc = Document("_", tokens, sentences, [], entities)

        expected_tokens = ["$ELON$", "is", "CEO", "of", "$COMP$", "."]
        expected_sentences = [Sentence(0, 6)]
        expected_entities = [
            Entity("_", 0, 1, "ELON"),
            Entity("_", 0, 1, "MUSK"),
            Entity("_", 4, 5, "COMP"),
            Entity("_", 4, 5, "ORG")
        ]

        expected_doc = Document("_", expected_tokens, expected_sentences, [],
                                expected_entities)

        actual_doc = EntitiesCollapser({"ELON", "COMP"}).transform(input_doc)
        self.assertEqual(expected_doc, actual_doc)
Example #2
    def setUp(self):
        tokens = [
            "Recurrence", "of", "Pelecypod-associated", "cholera", "in",
            "Sardinia", ".", "From", "Oct.", "30", "to", "Nov.", "7", ",",
            "1979", ",", "10", "people", "in", "the", "Sardinian", "province",
            "of", "Cagliari", "had", "onset", "of", "bacteriologically",
            "confirmed", "cholera", "."
        ]
        sentences = [Sentence(0, 7), Sentence(7, 31)]

        entities = [
            Entity("T1", 2, 3, "Habitat"),
            Entity("T2", 2, 4, "Bacteria"),
            Entity("T3", 3, 4, "Bacteria"),
            Entity("T4", 5, 6, "Geographical"),
            Entity("T5", 17, 18, "Habitat"),
            Entity("T6", 17, 24, "Habitat"),
            Entity("T7", 20, 22, "Geographical"),
            Entity("T8", 23, 24, "Geographical"),
            Entity("T9", 29, 30, "Bacteria")
        ]

        paragraphs = [Paragraph(0, 1), Paragraph(1, 2)]
        relations = [
            Relation(entities[0], entities[1], "Lives_in"),
            Relation(entities[8], entities[6], "Lives_in")
        ]

        self.doc = Document("_", tokens, sentences, paragraphs, entities,
                            relations)
Example #3
    def setUp(self):
        sent_1_tokens = [
            "Human", "and", "tick", "spotted", "fever", "group", "Rickettsia",
            "isolates", "from", "Israel", ":", "a", "genotypic", "analysis",
            "."
        ]
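        # Head distances give the offset from each token to its dependency head; 0 marks the root
        # (see how convert_from_digger_to_derek builds "dt_head_distances" in Example #13).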
        sent_1_head_distances = [
            3, -1, -2, 0, 2, 1, 1, 7, -1, -1, 4, 2, 1, 1, -11
        ]

        self.doc_with_1_sent = Document(
            "",
            sent_1_tokens, [Sentence(0, len(sent_1_tokens))],
            [Paragraph(0, 1)],
            token_features={"dt_head_distances": sent_1_head_distances})

        sent_2_tokens = [
            "The", "precise", "mechanisms", "that", "initiate", "bacterial",
            "uptake", "have", "not", "yet", "been", "elucidated", "."
        ]
        sent_2_head_distances = [2, 1, 9, 1, -2, 1, -2, 4, 3, 2, 1, 0, -1]

        self.doc_with_2_sent = Document(
            "",
            sent_1_tokens + sent_2_tokens, [
                Sentence(0, len(sent_1_tokens)),
                Sentence(len(sent_1_tokens),
                         len(sent_1_tokens) + len(sent_2_tokens))
            ], [Paragraph(0, 2)],
            token_features={
                "dt_head_distances":
                sent_1_head_distances + sent_2_head_distances
            })
Example #4
    def test_3_entity_paragraphs(self):
        sentences = [
            Sentence(0, 5),
            Sentence(5, 10),
        ]
        paragraphs = [
            Paragraph(0, 1),
            Paragraph(1, 2),
        ]
        entities = [
            Entity('_', 0, 1, '1'),
            Entity('_', 1, 2, '1'),
            Entity('_', 5, 6, '2'),
        ]
        doc = Document('test', [], sentences, paragraphs, entities)

        max_distance = 3

        actual_samples = get_samples(doc, max_distance, False)

        expected_samples = [
            (Entity('_', 0, 1, '1'), Entity('_', 1, 2, '1'), None),
            (Entity('_', 0, 1, '1'), Entity('_', 5, 6, '2'), None),
            (Entity('_', 1, 2, '1'), Entity('_', 5, 6, '2'), None),
        ]

        self.assertEqual(expected_samples, actual_samples)
Example #5
File: readers.py Project: wayne9qiu/derek
    def _get_doc_from_raw_text(self, raw_text, doc_name) -> Document:
        tokens, sentences, raw_tokens = self.segmenter.segment(raw_text)

        # here we assume all text to be one paragraph
        paragraphs = [Paragraph(0, len(sentences))]

        return Document(splitext(doc_name)[0], tokens, sentences, paragraphs,
                        token_features={"char_spans": raw_tokens})
Example #6
File: readers.py Project: wayne9qiu/derek
    def _create_doc(self, doc_raw_tokens: List[List[str]], doc_idx) -> Document:
        tokens, sentences, entities, pos_tags = [], [], [], []

        sent_tokens, sent_pos_tags, sent_entities_labels = [], [], []
        sent_start = 0
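        # Each raw token is expected to be a (token, POS tag, _, entity label) tuple;
        # an empty entry marks a sentence boundary (CoNLL-style input is assumed).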
        for raw_token in doc_raw_tokens:
            if not raw_token:
                if sent_tokens:
                    tokens.extend(sent_tokens)
                    pos_tags.extend(sent_pos_tags)
                    sentences.append(Sentence(sent_start, sent_start + len(sent_tokens)))
                    sent_start += len(sent_tokens)
                    entities.extend(self._decode_strategy.decode_labels(sentences[-1], sent_entities_labels))
                    sent_tokens, sent_pos_tags, sent_entities_labels = [], [], []
                continue

            token, pos_tag, _, ent_label = raw_token
            sent_tokens.append(token)
            sent_entities_labels.append(ent_label)
            sent_pos_tags.append(pos_tag)

        if sent_tokens:
            tokens.extend(sent_tokens)
            pos_tags.extend(sent_pos_tags)
            sentences.append(Sentence(sent_start, sent_start + len(sent_tokens)))
            entities.extend(self._decode_strategy.decode_labels(sentences[-1], sent_entities_labels))

        return Document(
            str(doc_idx), tokens, sentences, [Paragraph(0, len(sentences))], entities, token_features={"pos": pos_tags})
Example #7
File: readers.py Project: wayne9qiu/derek
    def read(self, path: str) -> List[Document]:
        all_files = listdir(path)
        file_names = sorted(set(splitext(f)[0] for f in all_files))
        docs = []

        for f in file_names:
            with open(join(path, f"{f}.txt"), "r", encoding="utf-8") as g:
                raw_text = g.read()

            if f"{f}.ann" not in all_files:
                warn(f"Skipping {f}.txt, no {f}.ann file found")
                continue

            with open(join(path, f"{f}.ann"), "r", encoding="utf-8") as g:
                annotations = g.read()

            tokens, sentences, raw_tokens = self.segmenter.segment(raw_text)
            raw_entities, raw_relations = _read_brat_annotations(annotations)
            raw_entities = _expand_spans(raw_entities)
            sentences, _, entities, relations = _merge(raw_tokens, sentences, [], raw_entities, raw_relations)

            if self.collapse_intersecting:
                entities, relations = collapse_intersecting_entities(entities, relations)

            # here we assume the whole document to be one paragraph
            doc = Document(f, tokens, sentences, [Paragraph(0, len(sentences))], entities, relations)
            docs.append(doc)

        return docs
Example #8
    def test_2_chains_2_pron(self):
        sentences = [Sentence(0, 10)]
        paragraphs = [Paragraph(0, 1)]
        entities = [
            Entity('_', 0, 1, 'noun'),
            Entity('_', 1, 2, 'pron'),
            Entity('_', 2, 3, 'pron'),
            Entity('_', 3, 4, 'noun'),
            Entity('_', 5, 6, 'noun'),
        ]
        rels = {
            Relation(Entity('_', 0, 1, 'noun'), Entity('_', 2, 3, 'pron'),
                     '1'),
            Relation(Entity('_', 1, 2, 'pron'), Entity('_', 3, 4, 'noun'),
                     '1'),
            Relation(Entity('_', 3, 4, 'noun'), Entity('_', 5, 6, 'noun'),
                     '1'),
        }
        doc = Document('test', [], sentences, paragraphs, entities, rels)

        max_distance = 3

        actual_samples = get_pron_samples(doc, max_distance, True)
        expected_samples = [
            (Entity('_', 0, 1, 'noun'), Entity('_', 1, 2, 'pron'), None),
            (Entity('_', 1, 2, 'pron'), Entity('_', 3, 4, 'noun'), '1'),
            (Entity('_', 0, 1, 'noun'), Entity('_', 2, 3, 'pron'), '1'),
            (Entity('_', 2, 3, 'pron'), Entity('_', 3, 4, 'noun'), None),
        ]
        self.assertEqual(actual_samples, expected_samples)
Example #9
    def test_inner_entities_collapse(self):
        expected_tokens = [
            "Recurrence", "of", "Pelecypod-associated", "cholera", "in",
            "$Geographical$", ".", "From", "Oct.", "30", "to", "Nov.", "7",
            ",", "1979", ",", "10", "people", "in", "the", "$Geographical$",
            "of", "$Geographical$", "had", "onset", "of", "bacteriologically",
            "confirmed", "cholera", "."
        ]
        expected_sentences = [Sentence(0, 7), Sentence(7, 30)]

        expected_entities = [
            Entity("T1", 2, 3, "Habitat"),
            Entity("T2", 2, 4, "Bacteria"),
            Entity("T3", 3, 4, "Bacteria"),
            Entity("T4", 5, 6, "Geographical"),
            Entity("T5", 17, 18, "Habitat"),
            Entity("T6", 17, 23, "Habitat"),
            Entity("T7", 20, 21, "Geographical"),
            Entity("T8", 22, 23, "Geographical"),
            Entity("T9", 28, 29, "Bacteria")
        ]

        expected_paragraphs = [Paragraph(0, 1), Paragraph(1, 2)]
        expected_relations = [
            Relation(expected_entities[0], expected_entities[1], "Lives_in"),
            Relation(expected_entities[8], expected_entities[6], "Lives_in")
        ]

        expected_doc = Document("_", expected_tokens, expected_sentences,
                                expected_paragraphs, expected_entities,
                                expected_relations)

        actual_doc = EntitiesCollapser({"Geographical"}).transform(self.doc)
        self.assertEqual(expected_doc, actual_doc)
Example #10
    def test_vectorized_features(self):
        doc = Document('', ['Planning', 'of', 'work', 'by', 'Elon'], [], [],
                       token_features={
                           "vectors": [
                               np.array([1, 2]),
                               np.array([1, 1]),
                               np.array([0, 0]),
                               np.array([0, 1]),
                               np.array([9, 10])
                           ]
                       })
        token_fe, token_meta = generate_token_feature_extractor(
            [doc], {"vectors_keys": ["vectors"]})
        features = token_fe.extract_features_from_doc(doc, 1, 4)
        vectors = features['vectors']

        self.assertEqual(token_meta.get_precomputed_features(), [])
        self.assertEqual(len(token_meta.get_embedded_features()), 0)
        self.assertEqual(token_meta.get_one_hot_features(), [])
        self.assertEqual(token_meta.get_vectorized_features(), [{
            "name": "vectors",
            "size": 2
        }])
        self.assertEqual(token_meta.get_char_features(), [])

        self.assertEqual(features['seq_len'], 3)
        self.assertEqual(len(vectors), 3)

        self.assertEqual(doc.token_features["vectors"][1:4], vectors)
Example #11
def _get_sentence_positions_to_span(doc: Document, start_token: int,
                                    end_token: int, wrt_span):
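    # For every token in [start_token, end_token), the signed sentence offset
    # from the token's sentence to the sentence holding wrt_span.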
    _, _, sent_idx = wrt_span
    return [
        sent_idx - doc.get_token_sent_idx(idx)
        for idx in range(start_token, end_token)
    ]
Example #12
    def test_entities_with_nesting_collapse(self):
        expected_tokens = [
            "Recurrence", "of", "$Bacteria$", "in", "Sardinia", ".", "From",
            "Oct.", "30", "to", "Nov.", "7", ",", "1979", ",", "10", "people",
            "in", "the", "Sardinian", "province", "of", "Cagliari", "had",
            "onset", "of", "bacteriologically", "confirmed", "$Bacteria$", "."
        ]
        expected_sentences = [Sentence(0, 6), Sentence(6, 30)]

        expected_entities = [
            Entity("T1", 2, 3, "Habitat"),
            Entity("T2", 2, 3, "Bacteria"),
            Entity("T3", 2, 3, "Bacteria"),
            Entity("T4", 4, 5, "Geographical"),
            Entity("T5", 16, 17, "Habitat"),
            Entity("T6", 16, 23, "Habitat"),
            Entity("T7", 19, 21, "Geographical"),
            Entity("T8", 22, 23, "Geographical"),
            Entity("T9", 28, 29, "Bacteria")
        ]

        expected_paragraphs = [Paragraph(0, 1), Paragraph(1, 2)]
        expected_relations = [
            Relation(expected_entities[0], expected_entities[1], "Lives_in"),
            Relation(expected_entities[8], expected_entities[6], "Lives_in")
        ]

        expected_doc = Document("_", expected_tokens, expected_sentences,
                                expected_paragraphs, expected_entities,
                                expected_relations)

        actual_doc = EntitiesCollapser({"Bacteria"}).transform(self.doc)
        self.assertEqual(expected_doc, actual_doc)
Example #13
def convert_from_digger_to_derek(diggerdoc: DiggerDoc,
                                 doc_name: str) -> Document:
    tokens = []
    token_features = {
        "pos": [],
        "dt_labels": [],
        "dt_head_distances": [],
        "lemmas": [],
        "feats": []
    }

    for i, token in enumerate(diggerdoc.tokens):
        tokens.append(token.doc_text)
        token_features["pos"].append(token.pos.upos)
        token_features["dt_labels"].append(token.deprel)
        token_features["dt_head_distances"].append(
            token.head_index - i if token.head_index != -1 else 0)
        token_features["lemmas"].append(token.lemma)
        token_features["feats"].append(token.pos.feats)

    sentences = list(
        Sentence(sent.start, sent.end)
        for sent in diggerdoc.sentences_boundaries)
    # here we assume all doc sentences to be in 1 paragraph
    paragraphs = [Paragraph(0, len(sentences))]

    return Document(doc_name,
                    tokens,
                    sentences,
                    paragraphs,
                    token_features=token_features)
Example #14
    def test_borders_extraction_3(self):
        tokens = ["bacteria", "spotted", ".", ".", "it's", "."]
        sentences = [Sentence(0, 3), Sentence(3, 4), Sentence(4, 6)]
        broken_doc = Document("", tokens, sentences, [Paragraph(0, 2)])

        borders = ["start", "in", "end", "start", "start", "end"]
        self.assertEqual(get_sentence_borders_feature(broken_doc), borders)
Example #15
File: readers.py Project: wayne9qiu/derek
    def _build_document(self, pmid, abstract, raw_entities, raw_relations) -> Document:
        tokens, sentences, raw_tokens = self.segmenter.segment(abstract['text'])
        raw_paragraphs = abstract['paragraphs']
        sentences, paragraphs, entities, relations = _merge(
            raw_tokens, sentences, raw_paragraphs, raw_entities, raw_relations)

        return Document(pmid, tokens, sentences, paragraphs, entities, relations)
Example #16
    def _extract_features(self,
                          doc: Document,
                          ent: Entity,
                          include_labels=False):
        ent_sent_idx = doc.get_entity_sent_idx(ent)

        start_token = doc.sentences[ent_sent_idx].start_token
        end_token = doc.sentences[ent_sent_idx].end_token

        features = {
            **self.token_feature_extractor.extract_features_from_doc(
                doc, start_token, end_token),
            **self._get_attention_features(doc, ent, start_token, end_token),
            **self._get_classifier_features(doc, ent)
        }

        if include_labels:
            label = _map_ne(doc, ent)
            features["labels"] = self.label_converter[label]

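        # labels_mask flags which labels are admissible for this entity's type
        # (as defined by types_mapping); all other labels stay masked out.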
        ent_mask = [0] * len(self.label_converter)
        for key in self.types_mapping[ent.type]:
            ent_mask[self.label_converter[key]] = 1

        features["labels_mask"] = ent_mask

        features["indices"] = [[
            ent.start_token - start_token, ent.end_token - start_token
        ]]

        return features
Example #17
    def test_gazetteer_features(self):
        doc = Document(
            '', ['Ваня', 'едет', 'в', 'Париж', 'из', 'Москвы', 'в', 'москву'],
            [], [],
            token_features={
                'lemmas':
                ['Ваня', 'ехать', 'в', 'Париж', 'из', 'Москва', 'в', 'москва']
            })
        token_fe, token_meta = generate_token_feature_extractor(
            [doc], {
                "gazetteers": [{
                    "path": "tests/data/feature_extractor/gazetteer.txt",
                    "lower": True,
                    'lemmatize': True
                }, {
                    "path": "tests/data/feature_extractor/gazetteer.txt",
                    "lower": False,
                    'lemmatize': True
                }, {
                    "path": "tests/data/feature_extractor/gazetteer.txt",
                    "lower": True,
                    'lemmatize': False
                }, {
                    "path": "tests/data/feature_extractor/gazetteer.txt",
                    "lower": False,
                    'lemmatize': False
                }]
            })
        features = token_fe.extract_features_from_doc(doc, 0, 8)

        self.assertEqual(features['gazetteer_0'], [1, 1, 1, 2, 1, 2, 1, 2])
        self.assertEqual(features['gazetteer_1'], [1, 1, 1, 2, 1, 2, 1, 1])
        self.assertEqual(features['gazetteer_2'], [1, 1, 1, 2, 1, 1, 1, 1])
        self.assertEqual(features['gazetteer_3'], [1, 1, 1, 2, 1, 1, 1, 1])
Example #18
    def test_gazetteer_features_assert(self):
        doc = Document('', ['Planning', 'of', 'work', 'by', 'Elon', ""], [],
                       [],
                       token_features={})
        self.assertRaises(Exception, generate_token_feature_extractor, [doc],
                          {"gazetteers": [{
                              "path": ""
                          }]})
Example #19
    def test_no_features(self):
        doc = Document('', ['Go', 'to', 'shop'], [], [])
        tp_fe, tp_meta = generate_token_position_feature_extractor({})
        features = tp_fe.extract_features_from_doc(doc, 0, 3, (2, 3, 0))

        self.assertEqual(tp_meta.get_embedded_features(), [])
        self.assertEqual(tp_meta.get_one_hot_features(), [])
        self.assertEqual(tp_meta.get_vectorized_features(), [])
        self.assertEqual(features, {})
Example #20
    def test_collapsing_with_ne(self):
        input_doc = self.doc.with_additional_extras({"ne": self.doc.entities})
        input_doc = input_doc.without_relations().without_entities()
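        # The original gold entities are kept only as "ne" extras; the document
        # itself starts out with no entities or relations.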

        entities = SortedSpansSet([
            Entity("_", 0, 1, "left"),
            Entity("_", 2, 4, "same"),
            Entity("_", 3, 4, "include"),
            Entity("_", 5, 6, "same"),
            Entity("_", 15, 19, "intersect"),
            Entity("_", 17, 20, "include"),
            Entity("_", 22, 25, "intersect")
        ])

        input_doc = input_doc.with_entities(entities)

        expected_tokens = [
            "Recurrence", "of", "$Bacteria$", "in", "$Geographical$", ".",
            "From", "Oct.", "30", "to", "Nov.", "7", ",", "1979", ",", "10",
            "$Habitat$", "had", "onset", "of", "bacteriologically",
            "confirmed", "$Bacteria$", "."
        ]
        expected_sentences = [Sentence(0, 6), Sentence(6, 24)]
        expected_paragraphs = [Paragraph(0, 1), Paragraph(1, 2)]

        expected_nes = SortedSpansSet([
            Entity("T1", 2, 3, "Habitat"),
            Entity("T2", 2, 3, "Bacteria"),
            Entity("T3", 2, 3, "Bacteria"),
            Entity("T4", 4, 5, "Geographical"),
            Entity("T5", 16, 17, "Habitat"),
            Entity("T6", 16, 17, "Habitat"),
            Entity("T7", 16, 17, "Geographical"),
            Entity("T8", 16, 17, "Geographical"),
            Entity("T9", 22, 23, "Bacteria")
        ])

        expected_entities = SortedSpansSet([
            Entity("_", 0, 1, "left"),
            Entity("_", 2, 3, "same"),
            Entity("_", 2, 3, "include"),
            Entity("_", 4, 5, "same"),
            Entity("_", 14, 17, "intersect"),
            Entity("_", 16, 17, "include"),
            Entity("_", 16, 18, "intersect")
        ])

        expected_doc = Document("_",
                                expected_tokens,
                                expected_sentences,
                                expected_paragraphs,
                                expected_entities,
                                extras={"ne": expected_nes})

        actual_doc = EntitiesCollapser({"Habitat", "Bacteria", "Geographical"},
                                       True).transform(input_doc)
        self.assertEqual(expected_doc, actual_doc)
Example #21
    def test_without_labels(self):
        ents = [Entity("_", 4, 5, "PER"), Entity("_", 6, 7, "PER")]
        doc = Document('',
                       ['Planning', 'of', 'work', 'of', 'Elon', "by", "Elon"],
                       [Sentence(0, 7)], [], ents)

        ner_fe, token_meta = ner_fe_factory([doc], {"internal_emb_size": 10})
        doc = doc.without_entities()
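        # With the entities removed, the extractor should produce no 'labels' feature (checked below).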
        features, = ner_fe.extract_features_from_doc(doc)

        words = features['words_0']
        self.assertEqual(features['seq_len'], 7)
        self.assertEqual(len(words), 7)

        self.assertNotEqual(words[0], words[1])  # Planning of
        self.assertEqual(words[1], words[3])  # of of

        self.assertRaises(KeyError, lambda: features['labels'])
Example #22
    def process_doc(self, doc: Document) -> Document:
        new_entities = []

        for ent in doc.entities:
            if ent.type in self.__filter:
                continue

            new_entities.append(ent.with_type(self.__replacements.get(ent.type, ent.type)))

        return doc.without_relations().without_entities().with_entities(new_entities)
Example #23
File: readers.py Project: wayne9qiu/derek
    def _read_document(self, path: str, name: str):
        with open(join(path, name + ".txt"), encoding="utf-8") as f:
            raw_text = f.read()

        tokens, sentences, raw_tokens = self.segmenter.segment(raw_text)
        raw_entities, raw_paragraphs, raw_relations = BioNLPDataReader._read_annotations(path, name)
        sentences, paragraphs, entities, relations = _merge(
            raw_tokens, sentences, raw_paragraphs, raw_entities, raw_relations,
            symmetric_types=BioNLPDataReader.SYMMETRIC_RELATION_TYPES)

        return Document(name, tokens, sentences, paragraphs, entities, relations)
Example #24
    def setUp(self) -> None:
        tokens = [
            "I", "will", "do", "my", "homework", "today", ".", "It", "is",
            "very", "hard", "but", "i", "don't", "care", "."
        ]
        sentences = [Sentence(0, 7), Sentence(7, 16)]
        paragraphs = [Paragraph(0, 2)]
        entities = [
            Entity("_", 0, 1, "t1"),
            Entity("_", 3, 5, "t2"),
            Entity("_", 7, 8, "t1"),
            Entity("_", 9, 11, "t2"),
            Entity("_", 10, 11, "t4")
        ]

        self.doc = Document("_", tokens, sentences, paragraphs, entities)
        self.relations = {
            Relation(entities[2], entities[3], "t1"),
            Relation(entities[3], entities[4], "t2")
        }
Example #25
    def setUp(self) -> None:
        tokens = ['Planning', 'of', 'work', 'of', 'Elon', "by", "Elon", "in", "LA", "in", "USA", "."]
        sents = [Sentence(0, 12)]
        ents = [Entity("_", 4, 5, "PER"), Entity("_", 6, 7, "PER"), Entity("_", 8, 9, "ORG"), Entity("_", 10, 11, "ORG")]
        nes = SortedSpansSet([
                Entity("gen", 0, 1, "STUFF"),
                Entity("gen", 4, 5, "PERORG"), Entity("gen", 6, 7, "PERORG"),
                Entity("gen", 8, 9, "PERORG"), Entity("gen", 10, 11, "PERORG")
        ])

        self.doc = Document('', tokens, sents, [], ents, extras={"ne": nes})
Example #26
    def test_1_entity(self):
        sentences = [Sentence(0, 10)]
        paragraphs = [Paragraph(0, 1)]
        entities = [Entity('_', 0, 1, '1')]
        doc = Document('test', [], sentences, paragraphs, entities)

        max_distance = 3

        actual_samples = get_samples(doc, max_distance, False)
        expected_samples = []

        self.assertEqual(expected_samples, actual_samples)
Example #27
    def test_no_entities(self):
        sentences = [Sentence(0, 10)]
        paragraphs = [Paragraph(0, 1)]
        entities = []
        doc = Document('test', [], sentences, paragraphs, entities)

        max_distance = 3

        actual_samples = get_noun_samples(doc, max_distance, False)
        expected_samples = []

        self.assertEqual(actual_samples, expected_samples)
Example #28
def _conllu_text(text) -> str:
    tokens, sentences, _ = segmentor.segment(text)
    token_features = processor.get_token_features(tokens, sentences)
    # treat the whole document as a single paragraph
    paragraphs = [Paragraph(0, len(sentences))]

    doc = Document("",
                   tokens,
                   sentences,
                   paragraphs,
                   token_features=token_features)
    return writer.write_to_str(convert_from_derek_to_digger(doc))
Example #29
    def test_no_features(self):
        doc = Document('', ['Go', 'to', 'shop'], [], [])
        token_fe, token_meta = generate_token_feature_extractor([doc], {})
        features = token_fe.extract_features_from_doc(doc, 0, 2)

        self.assertEqual(token_meta.get_precomputed_features(), [])
        self.assertEqual(token_meta.get_embedded_features(), [])
        self.assertEqual(token_meta.get_one_hot_features(), [])
        self.assertEqual(token_meta.get_vectorized_features(), [])
        self.assertEqual(token_meta.get_char_features(), [])

        self.assertEqual(features, {'seq_len': 2})
Example #30
    def test_ne_features(self):
        ents = [Entity("_", 4, 5, "PER"), Entity("_", 6, 7, "PER")]
        doc = Document('',
                       ['Planning', 'of', 'work', 'of', 'Elon', "by", "Elon"],
                       [Sentence(0, 7)], [],
                       extras={'ne': SortedSpansSet(ents)})
        fe, meta = ne_fe_factory([doc], {"ne_emb_size": 10})
        features = fe.extract_features_from_doc(doc, 3, 7)['ne']
        self.assertEqual(len(meta.get_embedded_features()), 1)
        self.assertEqual(len(features), 4)
        self.assertEqual(features[0], features[2])  # O O
        self.assertEqual(features[1], features[3])  # I-PER I-PER
        self.assertNotEqual(features[0], features[1])  # O I-PER