Example #1
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        ontonotes_reader = Ontonotes()
        for sentences in ontonotes_reader.dataset_document_iterator(file_path):
            clusters: DefaultDict[int, List[Tuple[
                int, int]]] = collections.defaultdict(list)

            total_tokens = 0
            for sentence in sentences:
                for typed_span in sentence.coref_spans:
                    # Coref annotations are on a _per sentence_
                    # basis, so we need to adjust them to be relative
                    # to the length of the document.
                    span_id, (start, end) = typed_span
                    clusters[span_id].append(
                        (start + total_tokens, end + total_tokens))
                total_tokens += len(sentence.words)

            canonical_clusters = canonicalize_clusters(clusters)
            new_sentences = [s.words for s in sentences]
            flattened_sentences = [
                self._normalize_word(word) for sentence in new_sentences
                for word in sentence
            ]

            def tokenizer(s: str):
                return self.token_indexer.wordpiece_tokenizer(s)

            flattened_sentences = tokenizer(" ".join(flattened_sentences))
            yield self.text_to_instance([s.words for s in sentences],
                                        canonical_clusters)
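
A note on the pattern shared by the coreference readers in this listing: coref spans are annotated per sentence, so each span is shifted by the number of tokens seen so far to obtain document-level offsets. A minimal sketch of just that step, using a made-up two-sentence document rather than the Ontonotes reader:

import collections
from typing import DefaultDict, List, Tuple

# Toy input: each sentence carries sentence-relative (cluster_id, (start, end)) spans.
sentences = [
    {"words": ["John", "saw", "Mary"], "coref_spans": [(0, (0, 0)), (1, (2, 2))]},
    {"words": ["He", "waved", "."], "coref_spans": [(0, (0, 0))]},
]

clusters: DefaultDict[int, List[Tuple[int, int]]] = collections.defaultdict(list)
total_tokens = 0
for sentence in sentences:
    for span_id, (start, end) in sentence["coref_spans"]:
        # Shift sentence-relative offsets into document-relative offsets.
        clusters[span_id].append((start + total_tokens, end + total_tokens))
    total_tokens += len(sentence["words"])

print(dict(clusters))  # {0: [(0, 0), (3, 3)], 1: [(2, 2)]}
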
Example #2
 def test_dataset_path_iterator(self):
     reader = Ontonotes()
     files = list(reader.dataset_path_iterator('tests/fixtures/conll_2012/'))
     expected_paths = ['tests/fixtures/conll_2012/subdomain/example.gold_conll',
                       'tests/fixtures/conll_2012/subdomain2/example.gold_conll']
     assert len(files) == len(expected_paths)
     assert set(files) == set(expected_paths)
Example #3
    def read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        instances = []
        ontonotes_reader = Ontonotes()
        logger.info("Reading SRL instances from dataset files at: %s",
                    file_path)

        for sentence in ontonotes_reader.dataset_iterator(file_path):
            tokens = [Token(t) for t in sentence.words]
            if not sentence.srl_frames:
                # Sentence contains no predicates.
                tags = ["O" for _ in tokens]
                verb_label = [0 for _ in tokens]
                instances.append(
                    self.text_to_instance(tokens, verb_label, tags))
            else:
                for (_, tags) in sentence.srl_frames:
                    verb_indicator = [
                        1 if label[-2:] == "-V" else 0 for label in tags
                    ]
                    instances.append(
                        self.text_to_instance(tokens, verb_indicator, tags))

        if not instances:
            raise ConfigurationError(
                "No instances were read from the given filepath {}. "
                "Is the path correct?".format(file_path))
        return Dataset(instances)
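
The SRL readers in Examples #2/#3 (and several below) derive a binary verb indicator from each frame's tag sequence by marking the position whose label ends in "-V". A tiny self-contained illustration with made-up tags:

tags = ["B-ARG0", "I-ARG0", "B-V", "B-ARG1", "O"]

# 1 marks the predicate token, 0 everything else.
verb_indicator = [1 if label[-2:] == "-V" else 0 for label in tags]
print(verb_indicator)           # [0, 0, 1, 0, 0]
print(verb_indicator.index(1))  # 2, the predicate's token index (as later examples use it)
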
Example #4
def main(args):
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--ontonotes",
        type=str,
        required=True,
        help="Path to OntoNotes, e.g. /path/to/conll-formatted-ontonotes-5.0",
    )
    parser.add_argument("--tasks",
                        type=str,
                        nargs="+",
                        help="Tasks, one or more of {const, coref, ner, srl}.")
    parser.add_argument(
        "--splits",
        type=str,
        nargs="+",
        default=["train", "development", "test", "conll-2012-test"],
        help=
        "Splits, one or more of {train, development, test, conll-2012-test}.",
    )
    parser.add_argument("-o",
                        dest="output_dir",
                        type=str,
                        default=".",
                        help="Output directory for JSON files.")
    args = parser.parse_args(args)

    if not os.path.isdir(args.output_dir):
        os.mkdir(args.output_dir)

    import pandas as pd

    pd.options.display.float_format = "{:.2f}".format

    # Load OntoNotes reader.
    ontonotes = Ontonotes()
    for split in args.splits:
        for task in args.tasks:
            source_path = os.path.join(args.ontonotes, "data", split)
            print('########### Reading ontonotes split from', source_path)
            ontonotes_reader = ontonotes.dataset_iterator(
                file_path=source_path)

            log.info("Processing split '%s' for task '%s'", split, task)
            task_dir = os.path.join(args.output_dir, task)
            if not os.path.isdir(task_dir):
                os.mkdir(task_dir)
            target_fname = os.path.join(task_dir, f"{split}.json")
            ontonotes_stats = collections.Counter()
            converted_records = process_task_split(tqdm(ontonotes_reader),
                                                   task, ontonotes_stats)

            stats = utils.EdgeProbingDatasetStats()
            converted_records = stats.passthrough(converted_records)
            utils.write_json_data(target_fname, converted_records)
            log.info("Wrote examples to %s", target_fname)
            log.info(stats.format())
            log.info(str(pd.Series(ontonotes_stats, dtype=object)))
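
Because main() takes its argument list explicitly, the extraction script above can be driven directly from Python. A hedged usage sketch; the OntoNotes path and output directory below are placeholders:

# Hypothetical invocation; adjust paths for your local OntoNotes copy.
main([
    "--ontonotes", "/path/to/conll-formatted-ontonotes-5.0",
    "--tasks", "ner", "srl",
    "--splits", "train", "development",
    "-o", "./edge_probing_json",
])
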
Example #5
 def test_dataset_path_iterator(self):
     reader = Ontonotes()
     files = list(reader.dataset_path_iterator(self.FIXTURES_ROOT / 'conll_2012'))
     expected_paths = [str(self.FIXTURES_ROOT / 'conll_2012' / 'subdomain' / 'example.gold_conll'),
                       str(self.FIXTURES_ROOT / 'conll_2012' / 'subdomain2' / 'example.gold_conll')]
     assert len(files) == len(expected_paths)
     assert set(files) == set(expected_paths)
Example #6
File: conll.py  Project: mhrmm/allennlp
    def read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        instances = []
        ontonotes_reader = Ontonotes()
        for sentences in tqdm(ontonotes_reader.dataset_document_iterator(file_path)):
            clusters: DefaultDict[int, List[Tuple[int, int]]] = collections.defaultdict(list)

            total_tokens = 0
            for sentence in sentences:
                for typed_span in sentence.coref_spans:
                    # Coref annotations are on a _per sentence_
                    # basis, so we need to adjust them to be relative
                    # to the length of the document.
                    span_id, (start, end) = typed_span
                    clusters[span_id].append((start + total_tokens,
                                              end + total_tokens))
                total_tokens += len(sentence.words)

            canonical_clusters = canonicalize_clusters(clusters)
            instance = self.text_to_instance([s.words for s in sentences], canonical_clusters)
            instances.append(instance)

        if not instances:
            raise ConfigurationError("No instances were read from the given filepath {}. "
                                     "Is the path correct?".format(file_path))
        return Dataset(instances)
Example #7
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        ontonotes_reader = Ontonotes()
        for sentences in ontonotes_reader.dataset_document_iterator(file_path):
            clusters: DefaultDict[int, List[Tuple[int, int]]] = collections.defaultdict(list)

            total_tokens = 0
            speakers = []
            for sentence in sentences:
                for typed_span in sentence.coref_spans:
                    # Coref annotations are on a _per sentence_
                    # basis, so we need to adjust them to be relative
                    # to the length of the document.
                    span_id, (start, end) = typed_span
                    clusters[span_id].append((start + total_tokens,
                                              end + total_tokens))
                total_tokens += len(sentence.words)

                speakers.append(sentence.speakers)

            doc_key = sentences[0].document_id
            genre = self.genres[doc_key[:2]]

            speakers = self.flatten(speakers)
            assert total_tokens == len(speakers)

            speaker_dict = {s: i for i, s in enumerate(set(speakers))}
            speaker_ids = np.array([speaker_dict[s] for s in speakers])

            canonical_clusters = canonicalize_clusters(clusters)
            yield self.text_to_instance([s.words for s in sentences], canonical_clusters, speaker_ids, genre)
Example #8
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        ontonotes_reader = Ontonotes()
        i = 0
        for sentences in ontonotes_reader.dataset_document_iterator(file_path):
            clusters: DefaultDict[int, List[Tuple[
                int, int]]] = collections.defaultdict(list)

            total_tokens = 0
            for sentence in sentences:
                for typed_span in sentence.coref_spans:
                    # Coref annotations are on a _per sentence_
                    # basis, so we need to adjust them to be relative
                    # to the length of the document.
                    span_id, (start, end) = typed_span
                    clusters[span_id].append(
                        (start + total_tokens, end + total_tokens))
                total_tokens += len(sentence.words)

            canonical_clusters = canonicalize_clusters(clusters)

            percent_user_spans = 0.0
            if self._simulate_user_inputs and i >= self._fully_labelled_threshold:
                percent_user_spans = 1.0

            i += 1

            yield self.text_to_instance([s.words for s in sentences],
                                        sentences[0].document_id,
                                        sentences[0].sentence_id,
                                        canonical_clusters, percent_user_spans)
Example #9
 def test_dataset_path_iterator(self):
     reader = Ontonotes()
     files = list(
         reader.dataset_path_iterator('tests/fixtures/conll_2012/'))
     assert files == [
         'tests/fixtures/conll_2012/subdomain/example.gold_conll',
         'tests/fixtures/conll_2012/subdomain2/example.gold_conll'
     ]
Example #10
 def test_dataset_path_iterator(self):
     reader = Ontonotes()
     files = list(reader.dataset_path_iterator(self.FIXTURES_ROOT / "conll_2012"))
     expected_paths = [
         str(self.FIXTURES_ROOT / "conll_2012" / "subdomain" / "example.gold_conll"),
         str(self.FIXTURES_ROOT / "conll_2012" / "subdomain2" / "example.gold_conll"),
     ]
     assert len(files) == len(expected_paths)
     assert set(files) == set(expected_paths)
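
The tests above expect Ontonotes.dataset_path_iterator to walk a directory tree and yield the *gold_conll annotation files it finds. A rough equivalent for illustration only (this is not AllenNLP's implementation):

import os
from typing import Iterator

def gold_conll_paths(root: str) -> Iterator[str]:
    # Walk the directory tree and yield CoNLL-2012 annotation files.
    for dirpath, _, filenames in os.walk(root):
        for filename in filenames:
            if filename.endswith("gold_conll"):
                yield os.path.join(dirpath, filename)

# list(gold_conll_paths('tests/fixtures/conll_2012/')) would then contain the
# subdomain/example.gold_conll and subdomain2/example.gold_conll fixtures used above.
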
Example #11
 def test_dataset_path_iterator(self):
     reader = Ontonotes()
     files = list(
         reader.dataset_path_iterator('tests/fixtures/conll_2012/'))
     expected_paths = [
         'tests/fixtures/conll_2012/subdomain/example.gold_conll',
         'tests/fixtures/conll_2012/subdomain2/example.gold_conll'
     ]
     assert len(files) == len(expected_paths)
     assert set(files) == set(expected_paths)
Example #12
 def _ontonotes_subset(ontonotes_reader: Ontonotes,
                       file_path: str,
                       domain_identifier: str) -> Iterable[OntonotesSentence]:
     """
     Iterates over the Ontonotes 5.0 dataset using an optional domain identifier.
     If the domain identifier is present, only examples which contain the domain
     identifier in the file path are yielded.
     """
     for conll_file in ontonotes_reader.dataset_path_iterator(file_path):
         if (domain_identifier is None or f"/{domain_identifier}/" in conll_file) and "/pt/" not in conll_file:
             yield from ontonotes_reader.sentence_iterator(conll_file)
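
The domain filter in Example #12 is a plain substring test on each CoNLL file path. A small illustration with made-up paths modelled on the CoNLL-2012 layout:

paths = [
    "conll-2012/data/train/data/english/annotations/bc/cctv/00/cctv_0001.gold_conll",
    "conll-2012/data/train/data/english/annotations/nw/wsj/00/wsj_0001.gold_conll",
    "conll-2012/data/train/data/english/annotations/pt/nt/40/nt_4001.gold_conll",
]
domain_identifier = "nw"
kept = [p for p in paths
        if (domain_identifier is None or f"/{domain_identifier}/" in p) and "/pt/" not in p]
print(kept)  # only the nw/wsj path survives; the pt portion is always dropped, mirroring Example #12
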
Example #13
 def _ontonotes_subset(
         ontonotes_reader: Ontonotes, file_path: str,
         domain_identifier: str) -> Iterable[OntonotesSentence]:
     """
     Iterates over the Ontonotes 5.0 dataset using an optional domain identifier.
     If the domain identifier is present, only examples which contain the domain
     identifier in the file path are yielded.
     """
     for conll_file in ontonotes_reader.dataset_path_iterator(file_path):
         if domain_identifier is None or f"/{domain_identifier}/" in conll_file:
             yield from ontonotes_reader.sentence_iterator(conll_file)
Example #14
    def _read(self, file_path: str):
        """OntoNotes custom reader to load spans from the dependency parse tree as well"""
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        ontonotes_reader = Ontonotes()
        logger.info("Reading SRL instances from dataset files at: %s",
                    file_path)
        if self._domain_identifier is not None:
            logger.info(
                "Filtering to only include file paths containing the %s domain",
                self._domain_identifier)

        for sentence in self._ontonotes_subset(ontonotes_reader, file_path,
                                               self._domain_identifier):

            # skip samples without dep' parse tree
            if not sentence.parse_tree:
                continue

            # extract dep' parse tree spans
            spans = set()
            for subtree in sentence.parse_tree.subtrees():
                if subtree.height() > 0:
                    # TODO: check how to output indices instead of words
                    #  (for extreme cases where different tuples could match)
                    spans.add(tuple(subtree.leaves()))

            tokens = [Token(t) for t in sentence.words]
            if sentence.srl_frames:
                for (_, tags) in sentence.srl_frames:
                    verb_indicator = [
                        1 if label[-2:] == "-V" else 0 for label in tags
                    ]
                    yield self.text_to_instance_with_spans(
                        tokens, verb_indicator, tags, spans)
Example #15
    def _read(self, file_path: str):

        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        ontonotes_reader = Ontonotes()
        logger.info("Reading SRL instances from dataset files at: %s",
                    file_path)
        if self._domain_identifier is not None:
            logger.info(
                "Filtering to only include file paths containing the %s domain",
                self._domain_identifier)

        for sentence in self._ontonotes_subset(ontonotes_reader, file_path,
                                               self._domain_identifier):

            pos_tags = [t for t in sentence.pos_tags]

            tokens = [
                Token(t, None, None, pos_tags[i])
                for i, t in enumerate(sentence.words)
            ]

            if not sentence.srl_frames:
                # Sentence contains no predicates.
                tags = ["O" for _ in tokens]
                verb_label = [0 for _ in tokens]
                yield self.text_to_instance(tokens, verb_label, tags)
            else:
                for (_, tags) in sentence.srl_frames:
                    verb_indicator = [
                        1 if label[-2:] == "-V" else 0 for label in tags
                    ]
                    verb_indices = np.where(np.array(verb_indicator) == 1)[0]

                    if len(verb_indices) > 0:
                        verb_index = int(verb_indices[0])
                        verb = tokens[verb_index]
                    else:
                        verb_index = -1
                        verb = ''

                    for i, tag in enumerate(tags):
                        if tag[0] == 'B':
                            tags[i] = tags[i].replace('B', 'I', 1)
                        if self.used_tags is not None and tags[
                                i] not in self.used_tags:
                            tags[i] = 'O'

                    instance = self.text_to_instance([verb] + tokens,
                                                     [0] + verb_indicator,
                                                     ['O'] + tags)

                    if self.dependency_parse:
                        doc = self.nlp(' '.join(sentence.words))
                        instance.add_field('dependency', MetadataField(doc))

                    instance.add_field(
                        'verb_index', IndexField(verb_index,
                                                 instance['tokens']))
                    yield instance
Example #16
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        ontonotes_reader = Ontonotes()
        logger.info("Reading SRL instances from dataset files at: %s",
                    file_path)
        if self._domain_identifier is not None:
            logger.info(
                "Filtering to only include file paths containing the %s domain",
                self._domain_identifier)

        for sentence in self._ontonotes_subset(ontonotes_reader, file_path,
                                               self._domain_identifier):
            tokens = [Token(t) for t in sentence.words]
            if not sentence.srl_frames:
                # Sentence contains no predicates.
                tags = ["O" for _ in tokens]
                verb_label = [0 for _ in tokens]
                yield self.text_to_instance(tokens, verb_label, tags)
            else:
                for (_, tags) in sentence.srl_frames:
                    verb_indicator = [
                        1 if label[-2:] == "-V" else 0 for label in tags
                    ]
                    # for i in range(len(tags)):
                    #     if tags[i] != 'O':
                    # tags[i] = 'I-ARG1'
                    yield self.text_to_instance(tokens, verb_indicator, tags)
Example #17
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        ontonotes_reader = Ontonotes()
        logger.info("Reading SRL instances from dataset files at: %s", file_path)

        for sentence in ontonotes_reader.dataset_iterator(file_path):
            tokens = [Token(t) for t in sentence.words]
            if not sentence.srl_frames:
                # Sentence contains no predicates.
                tags = ["O" for _ in tokens]
                verb_label = [0 for _ in tokens]
                yield self.text_to_instance(tokens, verb_label, tags)
            else:
                for (_, tags) in sentence.srl_frames:
                    verb_indicator = [1 if label[-2:] == "-V" else 0 for label in tags]
                    yield self.text_to_instance(tokens, verb_indicator, tags)
Example #18
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        #Pdb().set_trace()
        data_split = os.path.basename(os.path.normpath(file_path))

        file_path = cached_path(file_path)
        ontonotes_reader = Ontonotes()
        logger.info("Reading SRL instances from dataset files at: %s",
                    file_path)
        if self._domain_identifier is not None:
            logger.info(
                "Filtering to only include file paths containing the %s domain",
                self._domain_identifier)

        # Set random seed if percent is not 100
        if (self.percent_data < 100):
            random.seed(self.random_data_seed)

        # Write sentence, parse tree, span matrix to file
        # fout = open(f"srl_spans_{data_split}.pkl", "wb")

        print(f"return_labels: {self.return_labels}")

        for sentence in self._ontonotes_subset(ontonotes_reader, file_path,
                                               self._domain_identifier):
            if (self.percent_data < 100 and data_split == "train"):
                select_data = random.randint(1, 101)
                if (select_data > self.percent_data):
                    continue
            tokens = [Token(t) for t in sentence.words]
            parseTree = sentence.parse_tree

            # Convert tree to span list

            if not sentence.srl_frames:
                # Sentence contains no predicates.
                verb_label = [0 for _ in tokens]
                if self.return_labels:
                    tags = ["O" for _ in tokens]
                    yield self.text_to_instance(tokens, verb_label, parseTree,
                                                tags)
                else:
                    yield self.text_to_instance(tokens, verb_label, parseTree,
                                                None)
            else:
                for (_, tags) in sentence.srl_frames:
                    verb_indicator = [
                        1 if label[-2:] == "-V" else 0 for label in tags
                    ]
                    if self.return_labels:
                        yield self.text_to_instance(tokens, verb_indicator,
                                                    parseTree, tags)
                    else:
                        yield self.text_to_instance(tokens, verb_indicator,
                                                    parseTree, None)
Example #19
File: conll.py  Project: pyknife/allennlp
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        ontonotes_reader = Ontonotes()
        for sentences in ontonotes_reader.dataset_document_iterator(file_path):
            clusters: DefaultDict[int, List[Tuple[int, int]]] = collections.defaultdict(list)

            total_tokens = 0
            for sentence in sentences:
                for typed_span in sentence.coref_spans:
                    # Coref annotations are on a _per sentence_
                    # basis, so we need to adjust them to be relative
                    # to the length of the document.
                    span_id, (start, end) = typed_span
                    clusters[span_id].append((start + total_tokens,
                                              end + total_tokens))
                total_tokens += len(sentence.words)

            canonical_clusters = canonicalize_clusters(clusters)
            yield self.text_to_instance([s.words for s in sentences], canonical_clusters)
Example #20
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        ontonotes_reader = Ontonotes()
        logger.info("Reading SRL instances from dataset files at: %s",
                    file_path)
        if self._domain_identifier is not None:
            logger.info(
                "Filtering to only include file paths containing the %s domain",
                self._domain_identifier)

        for sentence in self._ontonotes_subset(ontonotes_reader, file_path,
                                               self._domain_identifier):
            tokens = [Token(t) for t in sentence.words]

            ##########################
            result = self.dependency_tree_predictor.predict(
                sentence=" ".join(sentence.words))
            # print(result['words'])
            root_dict = result['hierplane_tree']['root']
            adj = {}
            self.traverse_tree(adj, root_dict['word'], root_dict)
            predicte_adj = {}
            #########################
            if not sentence.srl_frames:
                # Sentence contains no predicates.
                tags = ["O" for _ in tokens]
                verb_label = [0 for _ in tokens]
                yield self.text_to_instance(tokens, verb_label, adj, tags)
            else:
                for (_, tags) in sentence.srl_frames:
                    verb_indicator = [
                        1 if label[-2:] == "-V" else 0 for label in tags
                    ]
                    #############################################
                    verb_index = verb_indicator.index(1)
                    predicte = sentence.words[verb_index]

                    if predicte in adj:
                        predicte_adj[predicte] = adj[predicte]
                        # This could cause an infinite loop here
                        for i in predicte_adj[predicte]:
                            if i in adj:
                                for j in adj[i]:
                                    predicte_adj[predicte].append(j)
                        yield self.text_to_instance(tokens, verb_indicator,
                                                    predicte_adj, tags)
                    else:
                        # print(" ".join(sentence.words))
                        # print(adj)
                        yield self.text_to_instance(tokens, verb_indicator,
                                                    adj, tags)
Example #21
    def _read(self, file_path):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        ontonotes_reader = Ontonotes()
        for sentences in ontonotes_reader.dataset_document_iterator(file_path):
            clusters = collections.defaultdict(list)

            total_tokens = 0
            for sentence in sentences:
                for typed_span in sentence.coref_spans:
                    # Coref annotations are on a _per sentence_
                    # basis, so we need to adjust them to be relative
                    # to the length of the document.
                    span_id, (start, end) = typed_span
                    clusters[span_id].append(
                        (start + total_tokens, end + total_tokens))
                total_tokens += len(sentence.words)

            canonical_clusters = canonicalize_clusters(clusters)
            yield self.text_to_instance([s.words for s in sentences],
                                        canonical_clusters)
Example #22
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        ontonotes_reader = Ontonotes()
        logger.info(
            "Reading Fine-Grained NER instances from dataset files at: %s",
            file_path)
        if self._domain_identifier is not None:
            logger.info(
                "Filtering to only include file paths containing the %s domain",
                self._domain_identifier)

        for sentence in self._ontonotes_subset(ontonotes_reader, file_path,
                                               self._domain_identifier):
            tokens = [Token(_normalize_word(t)) for t in sentence.words]
            yield self.text_to_instance(tokens, sentence.named_entities)
Example #23
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        ontonotes_reader = Ontonotes()
        logger.info("Reading SRL instances from dataset files at: %s",
                    file_path)
        if self._domain_identifier is not None:
            logger.info(
                "Filtering to only include file paths containing the %s domain",
                self._domain_identifier)

        for sentence in self._ontonotes_subset(ontonotes_reader, file_path,
                                               self._domain_identifier):
            tokens = [Token(t) for t in sentence.words]

            ##########################
            result = self.dependency_tree_predictor.predict(
                sentence=" ".join(sentence.words))
            predicted_heads = result["predicted_heads"]
            #########################
            if not sentence.srl_frames:
                # Sentence contains no predicates.
                tags = ["O" for _ in tokens]
                verb_label = [0 for _ in tokens]
                adj = {}  # no predicate here, so there is no head adjacency to attach
                yield self.text_to_instance(tokens, verb_label, adj, tags)
            else:
                for (_, tags) in sentence.srl_frames:
                    verb_indicator = [
                        1 if label[-2:] == "-V" else 0 for label in tags
                    ]
                    verb_index = verb_indicator.index(1)
                    # #############################################
                    adj = {}
                    self.traverse_predicted_heads(adj, predicted_heads,
                                                  verb_index + 1)
                    # Some verbs have no relations; guard against errors in the downstream ListField
                    adj[verb_index + 1].append(verb_index + 1)
                    ##############################################
                    # verb_index = verb_indicator.index(1)
                    # for i in range(len(tags)):
                    #     if '0' in tags[i]:
                    #         tags[i] = 'B-ARG0'
                    #     elif tags[i] != 'O' and i != verb_index:
                    #         tags[i] = 'B-ARG1'
                    yield self.text_to_instance(tokens, verb_indicator, adj,
                                                tags)
Example #24
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        ontonotes_reader = Ontonotes()
        logger.info(
            "Reading SRL instances along with constituent parse from data files at: %s",
            file_path)

        if self._domain_identifier is not None:
            logger.info(
                "Filtering to only include file paths containing the %s domain",
                self._domain_identifier)

        for sentence in self._ontonotes_subset(ontonotes_reader, file_path,
                                               self._domain_identifier):
            tokens = [Token(t) for t in sentence.words]

            parse = sentence.parse_tree
            if parse:
                pos_tags = [x[1] for x in parse.pos()]
                # yield self.text_to_instance(parse.leaves(), [x[1] for x in parse.pos()], parse)
            else:
                # parse information is missing for this sentence
                parse = None
                pos_tags = None

            if not sentence.srl_frames:
                # Sentence contains no predicates.
                tags = ["O" for _ in tokens]
                verb_label = [0 for _ in tokens]
                yield self.text_to_instance(tokens, verb_label, tags, pos_tags,
                                            parse)
            else:
                for (_, tags) in sentence.srl_frames:
                    verb_indicator = [
                        1 if label[-2:] == "-V" else 0 for label in tags
                    ]
                    yield self.text_to_instance(tokens, verb_indicator, tags,
                                                pos_tags, parse)
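
Example #24 pulls POS tags out of the constituency parse with parse.pos(). A quick illustration of that NLTK Tree call on a made-up sentence:

from nltk import Tree

parse = Tree.fromstring("(S (NP (DT The) (NN dog)) (VP (VBZ barks)))")
print(parse.leaves())               # ['The', 'dog', 'barks']
print([x[1] for x in parse.pos()])  # ['DT', 'NN', 'VBZ']
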
Example #25
    def _read(self, file_path: str):
        file_path = cached_path(
            file_path)  # if `file_path` is a URL, redirect to the cache
        ontonotes_reader = Ontonotes()
        logger.info("Reading NER instances from dataset files at: %s",
                    file_path)
        if self._domain_identifier is not None:
            logger.info(
                "Filtering to only include file paths containing the %s domain",
                self._domain_identifier)

        for sentence in self._ontonotes_subset(ontonotes_reader, file_path,
                                               self._domain_identifier):
            tokens = [Token(t) for t in sentence.words]
            if not sentence.named_entities:
                tags = ["O" for _ in tokens]
            else:
                tags = sentence.named_entities

            if self._coding_scheme == "BIOUL":
                tags = iob1_to_bioul(tags)

            yield self.text_to_instance(tokens, tags)
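
Example #25 optionally re-encodes the NER tags into the BIOUL scheme via iob1_to_bioul. As a sketch of what that scheme looks like, here is a simplified BIO-to-BIOUL converter for illustration only (it is not AllenNLP's iob1_to_bioul, which also handles the IOB1 input variant):

def bio_to_bioul(tags):
    """Simplified BIO -> BIOUL conversion, for illustration only."""
    bioul = list(tags)
    i = 0
    while i < len(tags):
        if tags[i].startswith("B-"):
            label = tags[i][2:]
            j = i
            while j + 1 < len(tags) and tags[j + 1] == "I-" + label:
                j += 1
            if j == i:
                bioul[i] = "U-" + label   # single-token span
            else:
                bioul[j] = "L-" + label   # close the multi-token span
            i = j + 1
        else:
            i += 1
    return bioul

print(bio_to_bioul(["B-GPE", "O", "B-DATE", "I-DATE", "O"]))
# ['U-GPE', 'O', 'B-DATE', 'L-DATE', 'O']
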
Example #26
    def test_dataset_iterator(self):
        reader = Ontonotes()
        annotated_sentences = list(
            reader.dataset_iterator('tests/fixtures/conll_2012/subdomain/'))
        annotation = annotated_sentences[0]
        assert annotation.document_id == "test/test/01/test_001"
        assert annotation.sentence_id == 0
        assert annotation.words == [
            'Mali', 'government', 'officials', 'say', 'the', 'woman', "'s",
            'confession', 'was', 'forced', '.'
        ]
        assert annotation.pos_tags == [
            'NNP', 'NN', 'NNS', 'VBP', 'DT', 'NN', 'POS', 'NN', 'VBD', 'JJ',
            '.'
        ]
        assert annotation.word_senses == [
            None, None, 1, 1, None, 2, None, None, 1, None, None
        ]
        assert annotation.predicate_framenet_ids == [
            None, None, None, '01', None, None, None, None, '01', None, None
        ]
        assert annotation.srl_frames == [("say", [
            'B-ARG0', 'I-ARG0', 'I-ARG0', 'B-V', 'B-ARG1', 'I-ARG1', 'I-ARG1',
            'I-ARG1', 'I-ARG1', 'I-ARG1', 'O'
        ]),
                                         ("was", [
                                             'O', 'O', 'O', 'O', 'B-ARG1',
                                             'I-ARG1', 'I-ARG1', 'I-ARG1',
                                             'B-V', 'B-ARG2', 'O'
                                         ])]
        assert annotation.named_entities == [
            'B-GPE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'
        ]
        assert annotation.predicate_lemmas == [
            None, None, 'official', 'say', None, 'man', None, None, 'be', None,
            None
        ]
        assert annotation.speakers == [
            None, None, None, None, None, None, None, None, None, None, None
        ]

        assert annotation.parse_tree == Tree.fromstring(
            "(TOP(S(NP(NML (NNP Mali)  (NN government) )"
            " (NNS officials) )(VP (VBP say) (SBAR(S(NP(NP"
            " (DT the)  (NN woman)  (POS 's) ) (NN "
            "confession) )(VP (VBD was) (ADJP (JJ "
            "forced) ))))) (. .) ))")
        assert annotation.coref_spans == {(1, (4, 6)), (3, (4, 7))}

        annotation = annotated_sentences[1]
        assert annotation.document_id == "test/test/02/test_002"
        assert annotation.sentence_id == 0
        assert annotation.words == [
            'The', 'prosecution', 'rested', 'its', 'case', 'last', 'month',
            'after', 'four', 'months', 'of', 'hearings', '.'
        ]
        assert annotation.pos_tags == [
            'DT', 'NN', 'VBD', 'PRP$', 'NN', 'JJ', 'NN', 'IN', 'CD', 'NNS',
            'IN', 'NNS', '.'
        ]
        assert annotation.word_senses == [
            None, 2, 5, None, 2, None, None, None, None, 1, None, 1, None
        ]
        assert annotation.predicate_framenet_ids == [
            None, None, '01', None, None, None, None, None, None, None, None,
            '01', None
        ]
        assert annotation.srl_frames == [('rested', [
            'B-ARG0', 'I-ARG0', 'B-V', 'B-ARG1', 'I-ARG1', 'B-ARGM-TMP',
            'I-ARGM-TMP', 'B-ARGM-TMP', 'I-ARGM-TMP', 'I-ARGM-TMP',
            'I-ARGM-TMP', 'I-ARGM-TMP', 'O'
        ]),
                                         ('hearings', [
                                             'O', 'O', 'O', 'O', 'O', 'O', 'O',
                                             'O', 'O', 'O', 'O', 'B-V', 'O'
                                         ])]
        assert annotation.named_entities == [
            'O', 'O', 'O', 'O', 'O', 'B-DATE', 'I-DATE', 'O', 'B-DATE',
            'I-DATE', 'O', 'O', 'O'
        ]
        assert annotation.predicate_lemmas == [
            None, 'prosecution', 'rest', None, 'case', None, None, None, None,
            'month', None, 'hearing', None
        ]
        assert annotation.speakers == [
            None, None, None, None, None, None, None, None, None, None, None,
            None, None
        ]
        assert annotation.parse_tree == Tree.fromstring(
            "(TOP(S(NP (DT The)  (NN prosecution) )(VP "
            "(VBD rested) (NP (PRP$ its)  (NN case) )"
            "(NP (JJ last)  (NN month) )(PP (IN after) "
            "(NP(NP (CD four)  (NNS months) )(PP (IN"
            " of) (NP (NNS hearings) ))))) (. .) ))")
        assert annotation.coref_spans == {(2, (0, 1)), (2, (3, 3))}

        # Check we can handle sentences without verbs.
        annotation = annotated_sentences[2]
        assert annotation.document_id == 'test/test/03/test_003'
        assert annotation.sentence_id == 0
        assert annotation.words == [
            'Denise', 'Dillon', 'Headline', 'News', '.'
        ]
        assert annotation.pos_tags == ['NNP', 'NNP', 'NNP', 'NNP', '.']
        assert annotation.word_senses == [None, None, None, None, None]
        assert annotation.predicate_framenet_ids == [
            None, None, None, None, None
        ]
        assert annotation.srl_frames == []
        assert annotation.named_entities == [
            'B-PERSON', 'I-PERSON', 'B-WORK_OF_ART', 'I-WORK_OF_ART', 'O'
        ]
        assert annotation.predicate_lemmas == [None, None, None, None, None]
        assert annotation.speakers == [None, None, None, None, None]
        assert annotation.parse_tree == Tree.fromstring(
            "(TOP(FRAG(NP (NNP Denise) "
            " (NNP Dillon) )(NP (NNP Headline)  "
            "(NNP News) ) (. .) ))")
        assert annotation.coref_spans == {(2, (0, 1))}

        # Check we can handle sentences with 2 identical verbs.
        annotation = annotated_sentences[3]
        assert annotation.document_id == 'test/test/04/test_004'
        assert annotation.sentence_id == 0
        assert annotation.words == [
            'and', 'that', 'wildness', 'is', 'still', 'in', 'him', ',', 'as',
            'it', 'is', 'with', 'all', 'children', '.'
        ]
        assert annotation.pos_tags == [
            'CC', 'DT', 'NN', 'VBZ', 'RB', 'IN', 'PRP', ',', 'IN', 'PRP',
            'VBZ', 'IN', 'DT', 'NNS', '.'
        ]
        assert annotation.word_senses == [
            None, None, None, 4.0, None, None, None, None, None, None, 5.0,
            None, None, None, None
        ]
        assert annotation.predicate_framenet_ids == [
            None, None, None, '01', None, None, None, None, None, None, '01',
            None, None, None, None
        ]
        assert annotation.srl_frames == [('is', [
            'B-ARGM-DIS', 'B-ARG1', 'I-ARG1', 'B-V', 'B-ARGM-TMP', 'B-ARG2',
            'I-ARG2', 'O', 'B-ARGM-ADV', 'I-ARGM-ADV', 'I-ARGM-ADV',
            'I-ARGM-ADV', 'I-ARGM-ADV', 'I-ARGM-ADV', 'O'
        ]),
                                         ('is', [
                                             'O', 'O', 'O', 'O', 'O', 'O', 'O',
                                             'O', 'O', 'B-ARG1', 'B-V',
                                             'B-ARG2', 'I-ARG2', 'I-ARG2', 'O'
                                         ])]
        assert annotation.named_entities == [
            'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
            'O', 'O'
        ]
        assert annotation.predicate_lemmas == [
            None, None, None, 'be', None, None, None, None, None, None, 'be',
            None, None, None, None
        ]
        assert annotation.speakers == [
            '_Avalon_', '_Avalon_', '_Avalon_', '_Avalon_', '_Avalon_',
            '_Avalon_', '_Avalon_', '_Avalon_', '_Avalon_', '_Avalon_',
            '_Avalon_', '_Avalon_', '_Avalon_', '_Avalon_', '_Avalon_'
        ]
        assert annotation.parse_tree == Tree.fromstring(
            "(TOP (S (CC and) (NP (DT that) (NN wildness)) "
            "(VP (VBZ is) (ADVP (RB still)) (PP (IN in) (NP "
            "(PRP him))) (, ,) (SBAR (IN as) (S (NP (PRP it)) "
            "(VP (VBZ is) (PP (IN with) (NP (DT all) (NNS "
            "children))))))) (. .)))")
        assert annotation.coref_spans == {(14, (6, 6))}
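
The coref_spans asserted above are (cluster_id, (start, end)) tuples with inclusive, sentence-relative token offsets. A quick sanity check of what the first sentence's spans cover:

words = ['Mali', 'government', 'officials', 'say', 'the', 'woman', "'s",
         'confession', 'was', 'forced', '.']
for cluster_id, (start, end) in sorted({(1, (4, 6)), (3, (4, 7))}):
    print(cluster_id, words[start:end + 1])
# 1 ['the', 'woman', "'s"]
# 3 ['the', 'woman', "'s", 'confession']
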
Example #27
 def test_ontonotes_can_read_conll_file_with_multiple_documents(self):
     reader = Ontonotes()
     file_path = 'tests/fixtures/coref/coref.gold_conll'
     documents = list(reader.dataset_document_iterator(file_path))
     assert len(documents) == 2
Example #28
 def _ontonotes_subset(
         ontonotes_reader: Ontonotes, file_path: str,
         domain_identifier: str) -> Iterable[OntonotesSentence]:
     for conll_file in ontonotes_reader.dataset_path_iterator(file_path):
         yield from ontonotes_reader.sentence_iterator(conll_file)
Example #29
 def test_ontonotes_can_read_conll_file_with_multiple_documents(self):
     reader = Ontonotes()
     file_path = 'tests/fixtures/coref/coref.gold_conll'
     documents = list(reader.dataset_document_iterator(file_path))
     assert len(documents) == 2
Example #30
 def test_ontonotes_can_read_conll_file_with_multiple_documents(self):
     reader = Ontonotes()
     file_path = self.FIXTURES_ROOT / 'coref' / 'coref.gold_conll'
     documents = list(reader.dataset_document_iterator(file_path))
     assert len(documents) == 2
Example #31
    def test_dataset_iterator(self):
        reader = Ontonotes()
        annotated_sentences = list(reader.dataset_iterator('tests/fixtures/conll_2012/subdomain/'))
        annotation = annotated_sentences[0]
        assert annotation.document_id == "test/test/01/test_001"
        assert annotation.sentence_id == 0
        assert annotation.words == ['Mali', 'government', 'officials', 'say', 'the', 'woman',
                                    "'s", 'confession', 'was', 'forced', '.']
        assert annotation.pos_tags == ['NNP', 'NN', 'NNS', 'VBP', 'DT',
                                       'NN', 'POS', 'NN', 'VBD', 'JJ', '.']
        assert annotation.word_senses == [None, None, 1, 1, None, 2, None, None, 1, None, None]
        assert annotation.predicate_framenet_ids == [None, None, None, '01', None,
                                                     None, None, None, '01', None, None]
        assert annotation.srl_frames == [("say", ['B-ARG0', 'I-ARG0', 'I-ARG0', 'B-V', 'B-ARG1',
                                                  'I-ARG1', 'I-ARG1', 'I-ARG1', 'I-ARG1', 'I-ARG1', 'O']),
                                         ("was", ['O', 'O', 'O', 'O', 'B-ARG1', 'I-ARG1', 'I-ARG1',
                                                  'I-ARG1', 'B-V', 'B-ARG2', 'O'])]
        assert annotation.named_entities == ['B-GPE', 'O', 'O', 'O', 'O', 'O',
                                             'O', 'O', 'O', 'O', 'O']
        assert annotation.predicate_lemmas == [None, None, 'official', 'say', None,
                                               'man', None, None, 'be', None, None]
        assert annotation.speakers == [None, None, None, None, None, None,
                                       None, None, None, None, None]

        assert annotation.parse_tree == Tree.fromstring("(TOP(S(NP(NML (NNP Mali)  (NN government) )"
                                                        " (NNS officials) )(VP (VBP say) (SBAR(S(NP(NP"
                                                        " (DT the)  (NN woman)  (POS 's) ) (NN "
                                                        "confession) )(VP (VBD was) (ADJP (JJ "
                                                        "forced) ))))) (. .) ))")
        assert annotation.coref_spans == {(1, (4, 6)), (3, (4, 7))}

        annotation = annotated_sentences[1]
        assert annotation.document_id == "test/test/02/test_002"
        assert annotation.sentence_id == 0
        assert annotation.words == ['The', 'prosecution', 'rested', 'its', 'case', 'last', 'month',
                                    'after', 'four', 'months', 'of', 'hearings', '.']
        assert annotation.pos_tags == ['DT', 'NN', 'VBD', 'PRP$', 'NN', 'JJ', 'NN',
                                       'IN', 'CD', 'NNS', 'IN', 'NNS', '.']
        assert annotation.word_senses == [None, 2, 5, None, 2, None, None,
                                          None, None, 1, None, 1, None]
        assert annotation.predicate_framenet_ids == [None, None, '01', None, None, None,
                                                     None, None, None, None, None, '01', None]
        assert annotation.srl_frames == [('rested', ['B-ARG0', 'I-ARG0', 'B-V', 'B-ARG1',
                                                     'I-ARG1', 'B-ARGM-TMP', 'I-ARGM-TMP',
                                                     'B-ARGM-TMP', 'I-ARGM-TMP', 'I-ARGM-TMP',
                                                     'I-ARGM-TMP', 'I-ARGM-TMP', 'O']),
                                         ('hearings', ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
                                                       'O', 'O', 'O', 'B-V', 'O'])]
        assert annotation.named_entities == ['O', 'O', 'O', 'O', 'O', 'B-DATE', 'I-DATE',
                                             'O', 'B-DATE', 'I-DATE', 'O', 'O', 'O']
        assert annotation.predicate_lemmas == [None, 'prosecution', 'rest', None, 'case',
                                               None, None, None, None, 'month', None, 'hearing', None]
        assert annotation.speakers == [None, None, None, None, None, None,
                                       None, None, None, None, None, None, None]
        assert annotation.parse_tree == Tree.fromstring("(TOP(S(NP (DT The)  (NN prosecution) )(VP "
                                                        "(VBD rested) (NP (PRP$ its)  (NN case) )"
                                                        "(NP (JJ last)  (NN month) )(PP (IN after) "
                                                        "(NP(NP (CD four)  (NNS months) )(PP (IN"
                                                        " of) (NP (NNS hearings) ))))) (. .) ))")
        assert annotation.coref_spans == {(2, (0, 1)), (2, (3, 3))}

        # Check we can handle sentences without verbs.
        annotation = annotated_sentences[2]
        assert annotation.document_id == 'test/test/03/test_003'
        assert annotation.sentence_id == 0
        assert annotation.words == ['Denise', 'Dillon', 'Headline', 'News', '.']
        assert annotation.pos_tags == ['NNP', 'NNP', 'NNP', 'NNP', '.']
        assert annotation.word_senses == [None, None, None, None, None]
        assert annotation.predicate_framenet_ids == [None, None, None, None, None]
        assert annotation.srl_frames == []
        assert annotation.named_entities == ['B-PERSON', 'I-PERSON',
                                             'B-WORK_OF_ART', 'I-WORK_OF_ART', 'O']
        assert annotation.predicate_lemmas == [None, None, None, None, None]
        assert annotation.speakers == [None, None, None, None, None]
        assert annotation.parse_tree == Tree.fromstring("(TOP(FRAG(NP (NNP Denise) "
                                                        " (NNP Dillon) )(NP (NNP Headline)  "
                                                        "(NNP News) ) (. .) ))")
        assert annotation.coref_spans == {(2, (0, 1))}

        # Check we can handle sentences with 2 identical verbs.
        annotation = annotated_sentences[3]
        assert annotation.document_id == 'test/test/04/test_004'
        assert annotation.sentence_id == 0
        assert annotation.words == ['and', 'that', 'wildness', 'is', 'still', 'in', 'him', ',',
                                    'as', 'it', 'is', 'with', 'all', 'children', '.']
        assert annotation.pos_tags == ['CC', 'DT', 'NN', 'VBZ', 'RB', 'IN', 'PRP', ',',
                                       'IN', 'PRP', 'VBZ', 'IN', 'DT', 'NNS', '.']
        assert annotation.word_senses == [None, None, None, 4.0, None, None, None, None,
                                          None, None, 5.0, None, None, None, None]
        assert annotation.predicate_framenet_ids == [None, None, None, '01', None, None,
                                                     None, None, None, None, '01', None, None, None, None]
        assert annotation.srl_frames == [('is', ['B-ARGM-DIS', 'B-ARG1', 'I-ARG1',
                                                 'B-V', 'B-ARGM-TMP', 'B-ARG2', 'I-ARG2',
                                                 'O', 'B-ARGM-ADV', 'I-ARGM-ADV', 'I-ARGM-ADV',
                                                 'I-ARGM-ADV', 'I-ARGM-ADV', 'I-ARGM-ADV', 'O']),
                                         ('is', ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
                                                 'B-ARG1', 'B-V', 'B-ARG2', 'I-ARG2', 'I-ARG2', 'O'])]
        assert annotation.named_entities == ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
                                             'O', 'O', 'O', 'O', 'O', 'O', 'O']
        assert annotation.predicate_lemmas == [None, None, None, 'be', None, None, None,
                                               None, None, None, 'be', None, None, None, None]
        assert annotation.speakers == ['_Avalon_', '_Avalon_', '_Avalon_', '_Avalon_', '_Avalon_',
                                       '_Avalon_', '_Avalon_', '_Avalon_', '_Avalon_', '_Avalon_',
                                       '_Avalon_', '_Avalon_', '_Avalon_', '_Avalon_', '_Avalon_']
        assert annotation.parse_tree == Tree.fromstring("(TOP (S (CC and) (NP (DT that) (NN wildness)) "
                                                        "(VP (VBZ is) (ADVP (RB still)) (PP (IN in) (NP "
                                                        "(PRP him))) (, ,) (SBAR (IN as) (S (NP (PRP it)) "
                                                        "(VP (VBZ is) (PP (IN with) (NP (DT all) (NNS "
                                                        "children))))))) (. .)))")
        assert annotation.coref_spans == {(14, (6, 6))}
Example #32
                j_ind = spans[cluster[j]]
                span_pairs.add((j_ind, i_ind))
    return doc_str, spans, span_pairs


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        'convert conll 2012 format into brat format')
    parser.add_argument('--inp', type=str, required=True, help='input dir')
    parser.add_argument('--out', type=str, required=True, help='output dir')
    args = parser.parse_args()

    print('reading coref instances from dataset files at: {}'.format(args.inp))

    avg_cluster_size = []
    ontonotes_reader = Ontonotes()
    for docid, doc in tqdm(
            enumerate(ontonotes_reader.dataset_document_iterator(args.inp))):
        docid += 1
        clusters: DefaultDict[int, List[Tuple[int, int]]] = defaultdict(list)

        total_tokens = 0
        for sentence in doc:
            for typed_span in sentence.coref_spans:
                span_id, (start,
                          end) = typed_span  # both start and end are inclusive
                clusters[span_id].append(
                    (start + total_tokens, end + total_tokens))
            total_tokens += len(sentence.words)

        canonical_clusters = canonicalize_clusters(clusters)
Example #33
    def _read_dataset(self,
                      file_path: str,
                      count_only: bool = False,
                      keep_idx: Optional[Set[int]] = None):
        """
        Yield instances from the file_path.

        Parameters
        ----------
        file_path: str, required
            The path to the data file.
        count_only: bool, optional (default=``False``)
            If True, no instances are returned and instead a dummy object is
            returned. This is useful for quickly counting the number of instances
            in the data file, since creating instances is relatively expensive.
        keep_idx: Set[int], optional (default=``None``)
            If not None, only yield instances whose index is in this set.
        """
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        # Reseed for reproducibility
        self._reseed(seed=self._seed)

        index = 0
        ontonotes_reader = Ontonotes()
        for sentences in ontonotes_reader.dataset_document_iterator(file_path):
            clusters: DefaultDict[int, List[Tuple[
                int, int]]] = collections.defaultdict(list)

            total_tokens = 0
            for sentence in sentences:
                for typed_span in sentence.coref_spans:
                    # Coref annotations are on a _per sentence_
                    # basis, so we need to adjust them to be relative
                    # to the length of the document.
                    span_id, (start, end) = typed_span
                    clusters[span_id].append(
                        (start + total_tokens, end + total_tokens))
                total_tokens += len(sentence.words)

            canonical_clusters = canonicalize_clusters(clusters)

            text_sentences: List[List[str]] = [s.words for s in sentences]
            flattened_text_sentences: List[str] = [
                self._normalize_word(word) for text_sentence in text_sentences
                for word in text_sentence
            ]
            sentence_arc_indices: List[Tuple[int, int]] = []
            sentence_labels: List[str] = []

            # Filter the clusters to only have single-token entities
            # TODO(nfliu): How do we handle spans here?
            filtered_clusters = filter_clusters(canonical_clusters,
                                                max_span_size=1)

            # Check if there are at least two clusters, each of which has at least 2 different items.
            # If not, then skip creating examples from this passage.
            counter = 0
            all_cluster_words = []
            all_cluster_unique_words = []
            for cluster in filtered_clusters:
                # Get the words that show up in the cluster
                cluster_words = list(
                    tuple(flattened_text_sentences[index]
                          for index in range(item[0], item[1] + 1))
                    for item in cluster)
                all_cluster_words.append(cluster_words)

                cluster_unique_words = set(cluster_words)
                all_cluster_unique_words.append(cluster_unique_words)
                if len(set(cluster_words)) >= 2:
                    counter += 1
            if counter < 2:
                continue

            if keep_idx is not None and index not in keep_idx:
                index += 1
                continue
            if count_only:
                yield 1
                continue

            # Contextualize the tokens if a Contextualizer was provided.
            # TODO (nfliu): How can we make this batched?
            # Would make contextualizers that use the GPU much faster.
            if self._contextualizer:
                token_representations = self._contextualizer(
                    [flattened_text_sentences])[0]
            else:
                token_representations = None

            # For each cluster with 2+ different items, make positive examples between each of the different items
            # that are different strings and make negative examples between each of the different items and a
            # random token from another cluster.
            assert ((len(filtered_clusters) == len(all_cluster_words)) &
                    (len(all_cluster_words) == len(all_cluster_unique_words)))

            for cluster_index, (cluster_spans, cluster_words,
                                cluster_unique_words) in enumerate(
                                    zip(filtered_clusters, all_cluster_words,
                                        all_cluster_unique_words)):
                # Don't make examples from this if there is only 1 unique item.
                if len(cluster_unique_words) < 2:
                    continue
                # Get all combinations of cluster spans (a, b), where a occurs
                # in the text before b.
                all_coreferring_spans = []
                for parent_cluster_span in cluster_spans:
                    for child_cluster_span in cluster_spans:
                        # Skip child_cluster_span if it occurs before the parent_span.
                        # TODO (nfliu): this is single-word specific
                        if child_cluster_span[0] < parent_cluster_span[0]:
                            continue

                        # Skip this (child_cluster_span, parent_cluster_span) pair if the words are identical
                        if (flattened_text_sentences[
                                child_cluster_span[0]:child_cluster_span[1] +
                                1] == flattened_text_sentences[
                                    parent_cluster_span[0]:
                                    parent_cluster_span[1] + 1]):
                            continue
                        # Add to the set of coreference candidates
                        all_coreferring_spans.append(
                            (child_cluster_span, parent_cluster_span))

                # Take the coreferring span pairs and generate positive and negative examples.
                for (child_span, parent_span) in all_coreferring_spans:
                    # TODO (nfliu): This is single-word specific, will have to change
                    # if we generalize to spans
                    sentence_arc_indices.append(
                        (child_span[0], parent_span[0]))
                    sentence_labels.append("1")

                    # Generate a negative example for the child.
                    other_clusters = [
                        cluster for i, cluster in enumerate(filtered_clusters)
                        if i != cluster_index
                    ]
                    negative_coreferent = self._sample_negative_coreferent(
                        other_clusters, child_span[0])
                    if negative_coreferent:
                        sentence_arc_indices.append(
                            (child_span[0], negative_coreferent[0]))
                        sentence_labels.append("0")
            yield self.text_to_instance(
                tokens=flattened_text_sentences,
                arc_indices=sentence_arc_indices,
                token_representations=token_representations,
                labels=sentence_labels)
            index += 1
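
The reader above also calls a `_sample_negative_coreferent` method whose implementation is not shown in this snippet. Purely as a rough sketch, assuming it returns a random span drawn from the other clusters (or `None` when no candidate exists), a free-function version might look like this; any details beyond what the call site shows are guesses:

import random
from typing import List, Optional, Tuple


def sample_negative_coreferent(other_clusters: List[List[Tuple[int, int]]],
                               child_index: int) -> Optional[Tuple[int, int]]:
    # Collect every span from the other clusters whose first token differs
    # from the child token we are building a negative example for.
    candidates = [span
                  for cluster in other_clusters
                  for span in cluster
                  if span[0] != child_index]
    if not candidates:
        return None
    return random.choice(candidates)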
Example #34
0
    def test_dataset_iterator(self):
        reader = Ontonotes()
        annotated_sentences = list(reader.dataset_iterator('tests/fixtures/conll_2012/'))
        annotation = annotated_sentences[0]
        assert annotation.document_id == "test/test/01/test_001"
        assert annotation.sentence_id == 0
        assert annotation.words == ['Mali', 'government', 'officials', 'say', 'the', 'woman',
                                    "'s", 'confession', 'was', 'forced', '.']
        assert annotation.pos_tags == ['NNP', 'NN', 'NNS', 'VBP', 'DT',
                                       'NN', 'POS', 'NN', 'VBD', 'JJ', '.']
        assert annotation.word_senses == [None, None, 1, 1, None, 2, None, None, 1, None, None]
        assert annotation.predicate_framenet_ids == [None, None, None, '01', None,
                                                     None, None, None, '01', None, None]
        assert annotation.srl_frames == {"say": ['B-ARG0', 'I-ARG0', 'I-ARG0', 'B-V', 'B-ARG1',
                                                 'I-ARG1', 'I-ARG1', 'I-ARG1', 'I-ARG1', 'I-ARG1', 'O'],
                                         "was": ['O', 'O', 'O', 'O', 'B-ARG1', 'I-ARG1', 'I-ARG1',
                                                 'I-ARG1', 'B-V', 'B-ARG2', 'O']}
        assert annotation.named_entities == ['B-GPE', 'O', 'O', 'O', 'O', 'O',
                                             'O', 'O', 'O', 'O', 'O']
        assert annotation.predicate_lemmas == [None, None, 'official', 'say', None,
                                               'man', None, None, 'be', None, None]
        assert annotation.speakers == [None, None, None, None, None, None,
                                       None, None, None, None, None]

        assert annotation.parse_tree == Tree.fromstring("(TOP(S(NP(NML (NNP Mali)  (NN government) )"
                                                        " (NNS officials) )(VP (VBP say) (SBAR(S(NP(NP"
                                                        " (DT the)  (NN woman)  (POS 's) ) (NN "
                                                        "confession) )(VP (VBD was) (ADJP (JJ "
                                                        "forced) ))))) (. .) ))")
        assert annotation.coref_spans == {(1, (4, 6)), (3, (4, 7))}

        annotation = annotated_sentences[1]
        assert annotation.document_id == "test/test/02/test_002"
        assert annotation.sentence_id == 0
        assert annotation.words == ['The', 'prosecution', 'rested', 'its', 'case', 'last', 'month',
                                    'after', 'four', 'months', 'of', 'hearings', '.']
        assert annotation.pos_tags == ['DT', 'NN', 'VBD', 'PRP$', 'NN', 'JJ', 'NN',
                                       'IN', 'CD', 'NNS', 'IN', 'NNS', '.']
        assert annotation.word_senses == [None, 2, 5, None, 2, None, None,
                                          None, None, 1, None, 1, None]
        assert annotation.predicate_framenet_ids == [None, None, '01', None, None, None,
                                                     None, None, None, None, None, '01', None]
        assert annotation.srl_frames == {'rested': ['B-ARG0', 'I-ARG0', 'B-V', 'B-ARG1',
                                                    'I-ARG1', 'B-ARGM-TMP', 'I-ARGM-TMP',
                                                    'B-ARGM-TMP', 'I-ARGM-TMP', 'I-ARGM-TMP',
                                                    'I-ARGM-TMP', 'I-ARGM-TMP', 'O'],
                                         'hearings': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
                                                      'O', 'O', 'O', 'B-V', 'O']}
        assert annotation.named_entities == ['O', 'O', 'O', 'O', 'O', 'B-DATE', 'I-DATE',
                                             'O', 'B-DATE', 'I-DATE', 'O', 'O', 'O']
        assert annotation.predicate_lemmas == [None, 'prosecution', 'rest', None, 'case',
                                               None, None, None, None, 'month', None, 'hearing', None]
        assert annotation.speakers == [None, None, None, None, None, None,
                                       None, None, None, None, None, None, None]
        assert annotation.parse_tree == Tree.fromstring("(TOP(S(NP (DT The)  (NN prosecution) )(VP "
                                                        "(VBD rested) (NP (PRP$ its)  (NN case) )"
                                                        "(NP (JJ last)  (NN month) )(PP (IN after) "
                                                        "(NP(NP (CD four)  (NNS months) )(PP (IN"
                                                        " of) (NP (NNS hearings) ))))) (. .) ))")
        assert annotation.coref_spans == {(2, (0, 1)), (2, (3, 3))}

        annotation = annotated_sentences[2]
        assert annotation.document_id == 'test/test/03/test_003'
        assert annotation.sentence_id == 0
        assert annotation.words == ['Denise', 'Dillon', 'Headline', 'News', '.']
        assert annotation.pos_tags == ['NNP', 'NNP', 'NNP', 'NNP', '.']
        assert annotation.word_senses == [None, None, None, None, None]
        assert annotation.predicate_framenet_ids == [None, None, None, None, None]
        assert annotation.srl_frames == {}
        assert annotation.named_entities == ['B-PERSON', 'I-PERSON',
                                             'B-WORK_OF_ART', 'I-WORK_OF_ART', 'O']
        assert annotation.predicate_lemmas == [None, None, None, None, None]
        assert annotation.speakers == [None, None, None, None, None]
        assert annotation.parse_tree == Tree.fromstring("(TOP(FRAG(NP (NNP Denise) "
                                                        " (NNP Dillon) )(NP (NNP Headline)  "
                                                        "(NNP News) ) (. .) ))")
        assert annotation.coref_spans == {(2, (0, 1))}
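
The `named_entities` and SRL assertions above use BIO-encoded tag sequences. Purely as an illustration (this helper is not part of the test above and is written from scratch, not taken from AllenNLP), spans can be recovered from such a sequence like this:

from typing import List, Tuple


def bio_tags_to_spans(tags: List[str]) -> List[Tuple[str, int, int]]:
    # Convert a BIO tag sequence (e.g. OntonotesSentence.named_entities)
    # into a list of (label, start, end) token spans, inclusive on both ends.
    spans = []
    start, label = None, None
    for i, tag in enumerate(tags):
        # Close the currently open span on "O", on a new "B-", or on an
        # "I-" whose label does not match the open span.
        if tag == "O" or tag.startswith("B-") or (
                tag.startswith("I-") and label != tag[2:]):
            if label is not None:
                spans.append((label, start, i - 1))
                start, label = None, None
        if tag.startswith("B-"):
            start, label = i, tag[2:]
        elif tag.startswith("I-") and label is None:
            # Tolerate an I- tag without a preceding B- by starting a new span.
            start, label = i, tag[2:]
    if label is not None:
        spans.append((label, start, len(tags) - 1))
    return spans

# For the third fixture sentence above:
# bio_tags_to_spans(['B-PERSON', 'I-PERSON', 'B-WORK_OF_ART', 'I-WORK_OF_ART', 'O'])
# -> [('PERSON', 0, 1), ('WORK_OF_ART', 2, 3)]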
Example #35
0
                brat_span_pairs[(predicate, arg_key)] = arg_label

    return ' '.join(tokens), brat_spans, brat_span_pairs
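

# Hedged sketch, not part of the original script: one plausible way to write the
# values returned above to a brat standoff ``.ann`` file. The shapes assumed here
# are hypothetical -- ``spans`` as a dict of id -> (label, char_start, char_end)
# and ``pairs`` as a dict of (head_id, arg_id) -> relation label.
def write_brat_ann(ann_path, text, spans, pairs):
    with open(ann_path, 'w') as fout:
        # Text-bound annotations: "T<id>\t<label> <start> <end>\t<covered text>"
        for span_id, (label, start, end) in spans.items():
            fout.write('T{}\t{} {} {}\t{}\n'.format(
                span_id, label, start, end, text[start:end]))
        # Binary relations: "R<id>\t<label> Arg1:T<head> Arg2:T<arg>"
        for rel_id, ((head_id, arg_id), rel_label) in enumerate(pairs.items(), 1):
            fout.write('R{}\t{} Arg1:T{} Arg2:T{}\n'.format(
                rel_id, rel_label, head_id, arg_id))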


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Convert CoNLL-2012 format into brat format')
    parser.add_argument('--inp', type=str, required=True, help='input directory')
    parser.add_argument('--out', type=str, required=True, help='output directory')
    parser.add_argument('--merge',
                        action='store_true',
                        help='merge adjacent identical sentences')
    args = parser.parse_args()

    ontonotes_reader = Ontonotes()

    print('Reading OpenIE instances from dataset files at: {}. '
          'Identical sentences must appear consecutively.'.format(args.inp))

    def doc_iter(n_sent):
        # Treat every `n_sent` sentences as one document for OpenIE,
        # to reduce the number of output files.
        doc: List[OntonotesSentence] = []
        for conll_file in ontonotes_reader.dataset_path_iterator(args.inp):
            for sent in ontonotes_reader.sentence_iterator(conll_file):
                same_as_last = False
                if args.merge and len(doc) > 0 and ' '.join(
                        sent.words) == ' '.join(doc[-1].words):
                    same_as_last = True