    def test_dataset_path_iterator(self):
        reader = Ontonotes()
        files = list(reader.dataset_path_iterator(CONLL_PATH))
        expected_paths = [
            str(CONLL_PATH / "subdomain" / "example.gold_conll"),
            str(CONLL_PATH / "subdomain2" / "example.gold_conll"),
        ]
        assert len(files) == len(expected_paths)
        assert set(files) == set(expected_paths)
Example #2
    def _ontonotes_subset(
            ontonotes_reader: Ontonotes, file_path: str,
            domain_identifier: str) -> Iterable[OntonotesSentence]:
        """
        Iterates over the Ontonotes 5.0 dataset using an optional domain identifier.
        If the domain identifier is present, only examples which contain the domain
        identifier in the file path are yielded.
        """
        for conll_file in ontonotes_reader.dataset_path_iterator(file_path):
            if domain_identifier is None or f"/{domain_identifier}/" in conll_file:
                yield from ontonotes_reader.sentence_iterator(conll_file)
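A minimal usage sketch of the helper above (the dataset path and the "bc" domain are placeholder assumptions, and the helper is treated as a standalone function for illustration): it simply chains sentence_iterator over every file whose path contains the requested domain.

reader = Ontonotes()
# Hypothetical path and domain; each item is an OntonotesSentence.
for sentence in _ontonotes_subset(reader, "/data/conll-formatted-ontonotes-5.0", "bc"):
    print(sentence.document_id, len(sentence.words))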
Example #3
    def _read(self, file_path: str):

        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        ontonotes_reader = Ontonotes()
        logger.info("Reading SRL instances from dataset files at: %s",
                    file_path)
        if self._domain_identifier is not None:
            logger.info(
                "Filtering to only include file paths containing the %s domain",
                self._domain_identifier)

        for sentence in self._ontonotes_subset(ontonotes_reader, file_path,
                                               self._domain_identifier):

            pos_tags = list(sentence.pos_tags)

            tokens = [
                Token(t, None, None, pos_tags[i])
                for i, t in enumerate(sentence.words)
            ]

            if not sentence.srl_frames:
                # Sentence contains no predicates.
                tags = ["O" for _ in tokens]
                verb_label = [0 for _ in tokens]
                yield self.text_to_instance(tokens, verb_label, tags)
            else:
                for (_, tags) in sentence.srl_frames:
                    verb_indicator = [
                        1 if label[-2:] == "-V" else 0 for label in tags
                    ]
                    verb_indices = np.where(np.array(verb_indicator) == 1)[0]

                    if len(verb_indices) > 0:
                        verb_index = int(verb_indices[0])
                        verb = tokens[verb_index]
                    else:
                        verb_index = -1
                        # This frame has no overt predicate; use an empty
                        # placeholder instead of a Token.
                        verb = ''

                    # Rewrite "B-" prefixes as "I-" and map any tag outside
                    # the configured tag set to "O".
                    for i, tag in enumerate(tags):
                        if tag[0] == 'B':
                            tags[i] = tags[i].replace('B', 'I', 1)
                        if (self.used_tags is not None
                                and tags[i] not in self.used_tags):
                            tags[i] = 'O'

                    instance = self.text_to_instance([verb] + tokens,
                                                     [0] + verb_indicator,
                                                     ['O'] + tags)

                    if self.dependency_parse:
                        doc = self.nlp(' '.join(sentence.words))
                        instance.add_field('dependency', MetadataField(doc))

                    instance.add_field(
                        'verb_index', IndexField(verb_index,
                                                 instance['tokens']))
                    yield instance
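A worked example (made-up labels) of the tag rewrite above: every "B-" prefix becomes "I-", and an extra "O" is prepended to line up with the verb token placed at the front of the sequence.

tags = ["B-ARG0", "B-V", "B-ARG1", "O"]
tags = [t.replace('B', 'I', 1) if t[0] == 'B' else t for t in tags]
assert ['O'] + tags == ['O', 'I-ARG0', 'I-V', 'I-ARG1', 'O']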
Example #4
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        ontonotes_reader = Ontonotes()
        logger.info("Reading SRL instances from dataset files at: %s",
                    file_path)
        if self._domain_identifier is not None:
            logger.info(
                "Filtering to only include file paths containing the %s domain",
                self._domain_identifier,
            )

        for sentence in self._ontonotes_subset(ontonotes_reader, file_path,
                                               self._domain_identifier):
            tokens = [Token(t) for t in sentence.words]
            if not sentence.srl_frames:
                # Sentence contains no predicates.
                tags = ["O" for _ in tokens]
                verb_label = [0 for _ in tokens]
                yield self.text_to_instance(tokens, verb_label, tags)
            else:
                for (_, tags) in sentence.srl_frames:
                    verb_indicator = [
                        1 if label[-2:] == "-V" else 0 for label in tags
                    ]
                    yield self.text_to_instance(tokens, verb_indicator, tags)
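For reference, a minimal sketch (with made-up labels) of how the binary verb indicator lines up with one SRL frame's BIO tags:

tags = ["B-ARG0", "I-ARG0", "B-V", "B-ARG1", "O"]
verb_indicator = [1 if label[-2:] == "-V" else 0 for label in tags]
assert verb_indicator == [0, 0, 1, 0, 0]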
Example #5
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        ontonotes_reader = Ontonotes()
        for sentences in ontonotes_reader.dataset_document_iterator(file_path):
            clusters: DefaultDict[int, List[Tuple[int, int]]] = collections.defaultdict(list)

            total_tokens = 0
            for sentence in sentences:
                for typed_span in sentence.coref_spans:
                    # Coref annotations are on a _per sentence_
                    # basis, so we need to adjust them to be relative
                    # to the length of the document.
                    span_id, (start, end) = typed_span
                    clusters[span_id].append((start + total_tokens, end + total_tokens))
                total_tokens += len(sentence.words)

            yield self.text_to_instance([s.words for s in sentences], list(clusters.values()))
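The offset arithmetic is easiest to see on a toy two-sentence document (invented data, same (cluster_id, (start, end)) span format as above): per-sentence spans are shifted by the number of tokens already seen, so they index into the whole document.

import collections

toy_sentences = [
    (["John", "went", "home", "."], {(0, (0, 0))}),  # "John" is a mention in cluster 0
    (["He", "slept", "."], {(0, (0, 0))}),           # "He" corefers with it
]
clusters = collections.defaultdict(list)
total_tokens = 0
for words, coref_spans in toy_sentences:
    for span_id, (start, end) in coref_spans:
        clusters[span_id].append((start + total_tokens, end + total_tokens))
    total_tokens += len(words)
assert clusters[0] == [(0, 0), (4, 4)]  # "He" is now indexed relative to the document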
Example #6
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        ontonotes_reader = Ontonotes()
        logger.info(
            "Reading Fine-Grained NER instances from dataset files at: %s",
            file_path)
        if self._domain_identifier is not None:
            logger.info(
                "Filtering to only include file paths containing the %s domain",
                self._domain_identifier,
            )

        for sentence in self._ontonotes_subset(ontonotes_reader, file_path,
                                               self._domain_identifier):
            tokens = [Token(_normalize_word(t)) for t in sentence.words]
            yield self.text_to_instance(tokens, sentence.named_entities)
Example #7
    def test_ontonotes_can_read_conll_file_with_multiple_documents(self):
        reader = Ontonotes()
        file_path = FIXTURES_ROOT / "coref" / "coref.gold_conll"
        documents = list(reader.dataset_document_iterator(file_path))
        assert len(documents) == 4
Example #8
    def test_dataset_iterator(self):
        reader = Ontonotes()
        annotated_sentences = list(
            reader.dataset_iterator(CONLL_PATH / "subdomain"))
        annotation = annotated_sentences[0]
        assert annotation.document_id == "test/test/01/test_001"
        assert annotation.sentence_id == 0
        assert annotation.words == [
            "Mali",
            "government",
            "officials",
            "say",
            "the",
            "woman",
            "'s",
            "confession",
            "was",
            "forced",
            ".",
        ]
        assert annotation.pos_tags == [
            "NNP",
            "NN",
            "NNS",
            "VBP",
            "DT",
            "NN",
            "POS",
            "NN",
            "VBD",
            "JJ",
            ".",
        ]
        assert annotation.word_senses == [
            None, None, 1, 1, None, 2, None, None, 1, None, None
        ]
        assert annotation.predicate_framenet_ids == [
            None,
            None,
            None,
            "01",
            None,
            None,
            None,
            None,
            "01",
            None,
            None,
        ]
        assert annotation.srl_frames == [
            (
                "say",
                [
                    "B-ARG0",
                    "I-ARG0",
                    "I-ARG0",
                    "B-V",
                    "B-ARG1",
                    "I-ARG1",
                    "I-ARG1",
                    "I-ARG1",
                    "I-ARG1",
                    "I-ARG1",
                    "O",
                ],
            ),
            (
                "was",
                [
                    "O", "O", "O", "O", "B-ARG1", "I-ARG1", "I-ARG1", "I-ARG1",
                    "B-V", "B-ARG2", "O"
                ],
            ),
        ]
        assert annotation.named_entities == [
            "B-GPE",
            "O",
            "O",
            "O",
            "O",
            "O",
            "O",
            "O",
            "O",
            "O",
            "O",
        ]
        assert annotation.predicate_lemmas == [
            None,
            None,
            "official",
            "say",
            None,
            "man",
            None,
            None,
            "be",
            None,
            None,
        ]
        assert annotation.speakers == [
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
        ]

        assert annotation.parse_tree == Tree.fromstring(
            "(TOP(S(NP(NML (NNP Mali)  (NN government) )"
            " (NNS officials) )(VP (VBP say) (SBAR(S(NP(NP"
            " (DT the)  (NN woman)  (POS 's) ) (NN "
            "confession) )(VP (VBD was) (ADJP (JJ "
            "forced) ))))) (. .) ))")
        assert annotation.coref_spans == {(1, (4, 6)), (3, (4, 7))}

        annotation = annotated_sentences[1]
        assert annotation.document_id == "test/test/02/test_002"
        assert annotation.sentence_id == 0
        assert annotation.words == [
            "The",
            "prosecution",
            "rested",
            "its",
            "case",
            "last",
            "month",
            "after",
            "four",
            "months",
            "of",
            "hearings",
            ".",
        ]
        assert annotation.pos_tags == [
            "DT",
            "NN",
            "VBD",
            "PRP$",
            "NN",
            "JJ",
            "NN",
            "IN",
            "CD",
            "NNS",
            "IN",
            "NNS",
            ".",
        ]
        assert annotation.word_senses == [
            None,
            2,
            5,
            None,
            2,
            None,
            None,
            None,
            None,
            1,
            None,
            1,
            None,
        ]
        assert annotation.predicate_framenet_ids == [
            None,
            None,
            "01",
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            "01",
            None,
        ]
        assert annotation.srl_frames == [
            (
                "rested",
                [
                    "B-ARG0",
                    "I-ARG0",
                    "B-V",
                    "B-ARG1",
                    "I-ARG1",
                    "B-ARGM-TMP",
                    "I-ARGM-TMP",
                    "B-ARGM-TMP",
                    "I-ARGM-TMP",
                    "I-ARGM-TMP",
                    "I-ARGM-TMP",
                    "I-ARGM-TMP",
                    "O",
                ],
            ),
            ("hearings", [
                "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B-V",
                "O"
            ]),
        ]
        assert annotation.named_entities == [
            "O",
            "O",
            "O",
            "O",
            "O",
            "B-DATE",
            "I-DATE",
            "O",
            "B-DATE",
            "I-DATE",
            "O",
            "O",
            "O",
        ]
        assert annotation.predicate_lemmas == [
            None,
            "prosecution",
            "rest",
            None,
            "case",
            None,
            None,
            None,
            None,
            "month",
            None,
            "hearing",
            None,
        ]
        assert annotation.speakers == [
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
        ]
        assert annotation.parse_tree == Tree.fromstring(
            "(TOP(S(NP (DT The)  (NN prosecution) )(VP "
            "(VBD rested) (NP (PRP$ its)  (NN case) )"
            "(NP (JJ last)  (NN month) )(PP (IN after) "
            "(NP(NP (CD four)  (NNS months) )(PP (IN"
            " of) (NP (NNS hearings) ))))) (. .) ))")
        assert annotation.coref_spans == {(2, (0, 1)), (2, (3, 3))}

        # Check we can handle sentences without verbs.
        annotation = annotated_sentences[2]
        assert annotation.document_id == "test/test/03/test_003"
        assert annotation.sentence_id == 0
        assert annotation.words == [
            "Denise", "Dillon", "Headline", "News", "."
        ]
        assert annotation.pos_tags == ["NNP", "NNP", "NNP", "NNP", "."]
        assert annotation.word_senses == [None, None, None, None, None]
        assert annotation.predicate_framenet_ids == [
            None, None, None, None, None
        ]
        assert annotation.srl_frames == []
        assert annotation.named_entities == [
            "B-PERSON",
            "I-PERSON",
            "B-WORK_OF_ART",
            "I-WORK_OF_ART",
            "O",
        ]
        assert annotation.predicate_lemmas == [None, None, None, None, None]
        assert annotation.speakers == [None, None, None, None, None]
        assert annotation.parse_tree == Tree.fromstring(
            "(TOP(FRAG(NP (NNP Denise) "
            " (NNP Dillon) )(NP (NNP Headline)  "
            "(NNP News) ) (. .) ))")
        assert annotation.coref_spans == {(2, (0, 1))}

        # Check we can handle sentences with 2 identical verbs.
        annotation = annotated_sentences[3]
        assert annotation.document_id == "test/test/04/test_004"
        assert annotation.sentence_id == 0
        assert annotation.words == [
            "and",
            "that",
            "wildness",
            "is",
            "still",
            "in",
            "him",
            ",",
            "as",
            "it",
            "is",
            "with",
            "all",
            "children",
            ".",
        ]
        assert annotation.pos_tags == [
            "CC",
            "DT",
            "NN",
            "VBZ",
            "RB",
            "IN",
            "PRP",
            ",",
            "IN",
            "PRP",
            "VBZ",
            "IN",
            "DT",
            "NNS",
            ".",
        ]
        assert annotation.word_senses == [
            None,
            None,
            None,
            4.0,
            None,
            None,
            None,
            None,
            None,
            None,
            5.0,
            None,
            None,
            None,
            None,
        ]
        assert annotation.predicate_framenet_ids == [
            None,
            None,
            None,
            "01",
            None,
            None,
            None,
            None,
            None,
            None,
            "01",
            None,
            None,
            None,
            None,
        ]
        assert annotation.srl_frames == [
            (
                "is",
                [
                    "B-ARGM-DIS",
                    "B-ARG1",
                    "I-ARG1",
                    "B-V",
                    "B-ARGM-TMP",
                    "B-ARG2",
                    "I-ARG2",
                    "O",
                    "B-ARGM-ADV",
                    "I-ARGM-ADV",
                    "I-ARGM-ADV",
                    "I-ARGM-ADV",
                    "I-ARGM-ADV",
                    "I-ARGM-ADV",
                    "O",
                ],
            ),
            (
                "is",
                [
                    "O",
                    "O",
                    "O",
                    "O",
                    "O",
                    "O",
                    "O",
                    "O",
                    "O",
                    "B-ARG1",
                    "B-V",
                    "B-ARG2",
                    "I-ARG2",
                    "I-ARG2",
                    "O",
                ],
            ),
        ]
        assert annotation.named_entities == [
            "O",
            "O",
            "O",
            "O",
            "O",
            "O",
            "O",
            "O",
            "O",
            "O",
            "O",
            "O",
            "O",
            "O",
            "O",
        ]
        assert annotation.predicate_lemmas == [
            None,
            None,
            None,
            "be",
            None,
            None,
            None,
            None,
            None,
            None,
            "be",
            None,
            None,
            None,
            None,
        ]
        assert annotation.speakers == [
            "_Avalon_",
            "_Avalon_",
            "_Avalon_",
            "_Avalon_",
            "_Avalon_",
            "_Avalon_",
            "_Avalon_",
            "_Avalon_",
            "_Avalon_",
            "_Avalon_",
            "_Avalon_",
            "_Avalon_",
            "_Avalon_",
            "_Avalon_",
            "_Avalon_",
        ]
        assert annotation.parse_tree == Tree.fromstring(
            "(TOP (S (CC and) (NP (DT that) (NN wildness)) "
            "(VP (VBZ is) (ADVP (RB still)) (PP (IN in) (NP "
            "(PRP him))) (, ,) (SBAR (IN as) (S (NP (PRP it)) "
            "(VP (VBZ is) (PP (IN with) (NP (DT all) (NNS "
            "children))))))) (. .)))")
        assert annotation.coref_spans == {(14, (6, 6))}
Example #9
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        ontonotes_reader = Ontonotes()
        logger.info("Reading SRL instances from dataset files at: %s",
                    file_path)
        if self._domain_identifier is not None:
            logger.info(
                "Filtering to only include file paths containing the %s domain",
                self._domain_identifier,
            )

        count = 0
        instances = []
        for index, sentence in enumerate(
                self._ontonotes_subset(ontonotes_reader, file_path,
                                       self._domain_identifier)):
            if self._limit > 0 and count >= self._limit and not self._random_sample:
                break
            tokens = [Token(t) for t in sentence.words]
            if not sentence.srl_frames:
                # Sentence contains no predicates.
                tags = ["O" for _ in tokens]
                verb_label = [0 for _ in tokens]
                count += 1
                instance = self.text_to_instance(tokens, verb_label, tags)
                if self._random_sample and self._limit > 0:
                    instances.append(instance)
                else:
                    yield instance
            else:
                for (_, tags) in sentence.srl_frames:
                    if self._limit > 0 and count >= self._limit and not self._random_sample:
                        break
                    count += 1
                    verb_indicator = [
                        1 if label[-2:] == "-V" else 0 for label in tags
                    ]
                    if self._print_violations:
                        # Flag frames where a reference (R-) or continuation (C-)
                        # argument appears without its base argument.
                        violation = False
                        counts = defaultdict(int)
                        for t in tags:
                            counts[t] += 1
                        for key in list(counts.keys()):
                            if key[:4] in {"B-R-", "B-C-"}:
                                if counts["B-" + key[4:]] == 0:
                                    violation = True
                                    break
                        if violation:
                            logger.info(tokens)
                            logger.info(tags)
                    instance = self.text_to_instance(tokens, verb_indicator,
                                                     tags)
                    if self._random_sample and self._limit > 0:
                        instances.append(instance)
                    else:
                        yield instance
        if self._random_sample and self._limit > 0:
            random.seed(self._random_seed)
            sample = random.sample(instances, self._limit)
            for instance in sample:
                yield instance
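When _random_sample is set, the reader buffers every instance and only yields a reproducible sample at the end of the pass. A stripped-down sketch of that pattern (hypothetical helper name and seed value):

import random

def sample_stream(instances, limit, seed=13370):
    # Buffer the whole stream, then yield a reproducible random sample of `limit` items.
    buffered = list(instances)
    random.seed(seed)
    yield from random.sample(buffered, limit)

assert len(list(sample_stream(range(100), 5))) == 5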
Example #10
    def _read(self, file_paths: str):
        read_from_pickle = False
        if self._pickle_path is not None and not self._test_run:
            if os.path.exists(self._pickle_path):
                read_from_pickle = True
                with open(self._pickle_path, 'rb') as f:
                    instances = pickle.load(f)
                for instance in instances:
                    yield instance
        if not read_from_pickle:
            file_paths = file_paths.split(",")
            for file_path in file_paths:
                if "parallel" in file_path:
                    with open(file_path) as f:
                        lines = f.readlines()
                    for i in range(len(lines) // 2):
                        if self._limit > 0 and i >= self._limit:
                            break
                        sentence1 = [lines[2 * i].strip().split()]
                        if self._parallel_tokenizer is None:
                            # sentence2 = self._parallel_stanza(lines[2*i+1].strip())
                            # sentence2 = [[token["text"] for token in sentence] for sentence in sentence2.to_dict()]
                            assert self._parallel_jieba
                            sentence2 = [[
                                token[0]
                                for token in jieba.tokenize(lines[2 * i +
                                                                  1].strip())
                            ]]
                        else:
                            sentence2 = [
                                self._parallel_tokenizer.tokenize(
                                    lines[2 * i + 1].strip())
                            ]
                        if self._parallel_reverse:
                            sentence1, sentence2 = sentence2, sentence1
                        instance = self.text_to_instance(
                            sentences=sentence1,
                            document_id=file_path + "_" + str(i),
                            language="parallel",
                            parallel_sentences=sentence2)
                        yield instance
                else:
                    # if `file_path` is a URL, redirect to the cache
                    file_path = cached_path(file_path)
                    language = file_path.split(".")[-2]

                    ontonotes_reader = Ontonotes(multiple_tags=True)
                    instances = []
                    for sentences in ontonotes_reader.dataset_document_iterator(
                            file_path):
                        if self._limit > 0 and len(instances) >= self._limit:
                            break
                        document_id = sentences[0].document_id + "_" + str(
                            sentences[0].sentence_id)
                        if self._individual_sentences:
                            for sentence in sentences:
                                clusters, srl_frames, named_entities, named_entity_spans = self.process_sentences(
                                    [sentence])
                                instance = self.text_to_instance(
                                    sentences=[sentence.words],
                                    document_id=document_id,
                                    gold_clusters=list(clusters.values()),
                                    srl_frames=srl_frames,
                                    named_entities=named_entities,
                                    language=language,
                                    sentence_objects=[sentence],
                                    named_entity_spans=named_entity_spans)
                                if instance is not None and ("srl_labels"
                                                             in instance.fields
                                                             or not self._srl):
                                    instances.append(instance)
                                    yield instance
                        else:
                            clusters, srl_frames, named_entities, named_entity_spans = self.process_sentences(
                                sentences)
                            instance = self.text_to_instance(
                                sentences=[s.words for s in sentences],
                                document_id=document_id,
                                gold_clusters=list(clusters.values()),
                                srl_frames=srl_frames,
                                named_entities=named_entities,
                                language=language,
                                sentence_objects=sentences,
                                named_entity_spans=named_entity_spans)
                            instances.append(instance)
                            yield instance
                        if self._test_run:
                            break
                    if not self._test_run and self._pickle_path is not None:
                        with open(self._pickle_path, 'wb') as f:
                            pickle.dump(instances, f)
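The pickle branch is a simple on-disk cache: the first run pays the cost of reading OntoNotes and later runs just reload the serialized instances. A generic sketch of the same pattern (hypothetical build_instances callable and cache path):

import os
import pickle

def cached_instances(cache_path, build_instances):
    # Reuse the cached list when it exists; otherwise build it and persist it.
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as f:
            return pickle.load(f)
    instances = list(build_instances())
    with open(cache_path, 'wb') as f:
        pickle.dump(instances, f)
    return instances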