# Example 1
    def test_read_from_file(self, lazy):
        """Smoke-test ConllCorefReader on the coref fixture.

        Checks the tokenised text, the enumerated candidate spans, and the
        gold cluster ids for both documents in the fixture.

        NOTE(review): assumes ``lazy`` is supplied by pytest parametrisation
        and ``self.span_width`` by the test-class setup — confirm in the
        surrounding harness.
        """
        conll_reader = ConllCorefReader(max_span_width=self.span_width, lazy=lazy)
        instances = ensure_list(conll_reader.read(str(AllenNlpTestCase.FIXTURES_ROOT /
                                                      'coref' / 'coref.gold_conll')))

        assert len(instances) == 2

        fields = instances[0].fields
        text = [x.text for x in fields["text"].tokens]

        assert text == ['In', 'the', 'summer', 'of', '2005', ',', 'a', 'picture', 'that',
                        'people', 'have', 'long', 'been', 'looking', 'forward', 'to',
                        'started', 'emerging', 'with', 'frequency', 'in', 'various', 'major',
                        'Hong', 'Kong', 'media', '.', 'With', 'their', 'unique', 'charm', ',',
                        'these', 'well', '-', 'known', 'cartoon', 'images', 'once', 'again',
                        'caused', 'Hong', 'Kong', 'to', 'be', 'a', 'focus', 'of', 'worldwide',
                        'attention', '.', 'The', 'world', "'s", 'fifth', 'Disney', 'park',
                        'will', 'soon', 'open', 'to', 'the', 'public', 'here', '.']

        spans = fields["spans"].field_list
        span_starts, span_ends = zip(*[(field.span_start, field.span_end) for field in spans])

        candidate_mentions = self.check_candidate_mentions_are_well_defined(span_starts, span_ends, text)

        gold_span_labels = fields["span_labels"]
        # A label of -1 marks a candidate span that is not a gold mention.
        gold_indices_with_ids = [(i, x) for i, x in enumerate(gold_span_labels.labels) if x != -1]
        gold_mentions_with_ids: List[Tuple[List[str], int]] = [(candidate_mentions[i], x)
                                                               for i, x in gold_indices_with_ids]

        assert (["Hong", "Kong"], 0) in gold_mentions_with_ids
        # "Hong Kong" occurs twice in cluster 0 in this document; removing one
        # occurrence must still leave another behind.
        gold_mentions_with_ids.remove((["Hong", "Kong"], 0))
        assert (["Hong", "Kong"], 0) in gold_mentions_with_ids
        assert (["their"], 1) in gold_mentions_with_ids
        # This is a span which exceeds our max_span_width, so it should not be considered.
        # BUG FIX: "well-known" tokenises as 'well', '-', 'known' (see the
        # expected text above); the previous assertion omitted the '-' token
        # and was therefore vacuously true for any reader output.
        assert (["these", "well", "-", "known", "cartoon", "images"], 1) not in gold_mentions_with_ids

        fields = instances[1].fields
        text = [x.text for x in fields["text"].tokens]
        assert text == ['The', 'area', 'of', 'Hong', 'Kong', 'is', 'only', 'one', 'thousand', '-', 'plus',
                        'square', 'kilometers', '.', 'The', 'population', 'is', 'dense', '.', 'Natural',
                        'resources', 'are', 'relatively', 'scarce', '.', 'However', ',', 'the', 'clever',
                        'Hong', 'Kong', 'people', 'will', 'utilize', 'all', 'resources', 'they', 'have',
                        'created', 'for', 'developing', 'the', 'Hong', 'Kong', 'tourism', 'industry', '.']

        spans = fields["spans"].field_list
        span_starts, span_ends = zip(*[(field.span_start, field.span_end) for field in spans])

        candidate_mentions = self.check_candidate_mentions_are_well_defined(span_starts, span_ends, text)

        gold_span_labels = fields["span_labels"]
        gold_indices_with_ids = [(i, x) for i, x in enumerate(gold_span_labels.labels) if x != -1]
        gold_mentions_with_ids: List[Tuple[List[str], int]] = [(candidate_mentions[i], x)
                                                               for i, x in gold_indices_with_ids]

        assert (["Hong", "Kong"], 0) in gold_mentions_with_ids
        gold_mentions_with_ids.remove((["Hong", "Kong"], 0))
        assert (["Hong", "Kong"], 0) in gold_mentions_with_ids
        assert (["they"], 1) in gold_mentions_with_ids
        assert (['the', 'clever', 'Hong', 'Kong', 'people'], 1) in gold_mentions_with_ids
    def test_read_from_file(self, lazy):
        """Smoke-test ConllCorefReader on the coref fixture (relative-path variant).

        Checks the tokenised text, the enumerated candidate spans, and the
        gold cluster ids for both documents in the fixture.

        NOTE(review): assumes the test runs from the repository root so the
        relative fixture path resolves — confirm against the test runner.
        """
        conll_reader = ConllCorefReader(max_span_width=self.span_width, lazy=lazy)
        instances = ensure_list(conll_reader.read('tests/fixtures/coref/coref.gold_conll'))

        assert len(instances) == 2

        fields = instances[0].fields
        text = [x.text for x in fields["text"].tokens]

        assert text == ['In', 'the', 'summer', 'of', '2005', ',', 'a', 'picture', 'that',
                        'people', 'have', 'long', 'been', 'looking', 'forward', 'to',
                        'started', 'emerging', 'with', 'frequency', 'in', 'various', 'major',
                        'Hong', 'Kong', 'media', '.', 'With', 'their', 'unique', 'charm', ',',
                        'these', 'well', '-', 'known', 'cartoon', 'images', 'once', 'again',
                        'caused', 'Hong', 'Kong', 'to', 'be', 'a', 'focus', 'of', 'worldwide',
                        'attention', '.', 'The', 'world', "'s", 'fifth', 'Disney', 'park',
                        'will', 'soon', 'open', 'to', 'the', 'public', 'here', '.']

        spans = fields["spans"].field_list
        span_starts, span_ends = zip(*[(field.span_start, field.span_end) for field in spans])

        candidate_mentions = self.check_candidate_mentions_are_well_defined(span_starts, span_ends, text)

        gold_span_labels = fields["span_labels"]
        # A label of -1 marks a candidate span that is not a gold mention.
        gold_indices_with_ids = [(i, x) for i, x in enumerate(gold_span_labels.labels) if x != -1]
        gold_mentions_with_ids: List[Tuple[List[str], int]] = [(candidate_mentions[i], x)
                                                               for i, x in gold_indices_with_ids]

        assert (["Hong", "Kong"], 0) in gold_mentions_with_ids
        # "Hong Kong" occurs twice in cluster 0 in this document; removing one
        # occurrence must still leave another behind.
        gold_mentions_with_ids.remove((["Hong", "Kong"], 0))
        assert (["Hong", "Kong"], 0) in gold_mentions_with_ids
        assert (["their"], 1) in gold_mentions_with_ids
        # This is a span which exceeds our max_span_width, so it should not be considered.
        # BUG FIX: "well-known" tokenises as 'well', '-', 'known' (see the
        # expected text above); the previous assertion omitted the '-' token
        # and was therefore vacuously true for any reader output.
        assert (["these", "well", "-", "known", "cartoon", "images"], 1) not in gold_mentions_with_ids

        fields = instances[1].fields
        text = [x.text for x in fields["text"].tokens]
        assert text == ['The', 'area', 'of', 'Hong', 'Kong', 'is', 'only', 'one', 'thousand', '-', 'plus',
                        'square', 'kilometers', '.', 'The', 'population', 'is', 'dense', '.', 'Natural',
                        'resources', 'are', 'relatively', 'scarce', '.', 'However', ',', 'the', 'clever',
                        'Hong', 'Kong', 'people', 'will', 'utilize', 'all', 'resources', 'they', 'have',
                        'created', 'for', 'developing', 'the', 'Hong', 'Kong', 'tourism', 'industry', '.']

        spans = fields["spans"].field_list
        span_starts, span_ends = zip(*[(field.span_start, field.span_end) for field in spans])

        candidate_mentions = self.check_candidate_mentions_are_well_defined(span_starts, span_ends, text)

        gold_span_labels = fields["span_labels"]
        gold_indices_with_ids = [(i, x) for i, x in enumerate(gold_span_labels.labels) if x != -1]
        gold_mentions_with_ids: List[Tuple[List[str], int]] = [(candidate_mentions[i], x)
                                                               for i, x in gold_indices_with_ids]

        assert (["Hong", "Kong"], 0) in gold_mentions_with_ids
        gold_mentions_with_ids.remove((["Hong", "Kong"], 0))
        assert (["Hong", "Kong"], 0) in gold_mentions_with_ids
        assert (["they"], 1) in gold_mentions_with_ids
        assert (['the', 'clever', 'Hong', 'Kong', 'people'], 1) in gold_mentions_with_ids
# Example 3
    def test_max_sentences(self):
        """Reading with ``max_sentences=2`` must truncate long documents to a
        prefix of their tokens while leaving short documents untouched."""
        fixture_path = str(AllenNlpTestCase.FIXTURES_ROOT / "coref" /
                           "coref.gold_conll")

        full_reader = ConllCorefReader(max_span_width=self.span_width)
        full_instances = ensure_list(full_reader.read(fixture_path))

        truncating_reader = ConllCorefReader(max_span_width=self.span_width,
                                             max_sentences=2)
        truncated_instances = ensure_list(truncating_reader.read(fixture_path))

        assert len(truncated_instances) == len(full_instances) == 4

        def doc_tokens(instance):
            return instance.fields["text"].tokens

        def as_text(tokens):
            return [token.text for token in tokens]

        full_docs = [doc_tokens(instance) for instance in full_instances]
        short_docs = [doc_tokens(instance) for instance in truncated_instances]

        # Documents 1 and 3 already fit within two sentences: unchanged.
        assert short_docs[1] == full_docs[1]
        assert short_docs[3] == full_docs[3]

        # Documents 0 and 2 are longer and must have been cut down, dropping
        # tokens that only appear in their later sentences.
        assert len(short_docs[0]) < len(full_docs[0])
        assert len(short_docs[2]) < len(full_docs[2])
        assert "Disney" in as_text(full_docs[0]) and "Disney" not in as_text(
            short_docs[0])
        assert "tourism" in as_text(full_docs[2]) and "tourism" not in as_text(
            short_docs[2])

        # Whatever survives truncation is an exact prefix of the full document.
        assert short_docs[0] == full_docs[0][:len(short_docs[0])]
        assert short_docs[2] == full_docs[2][:len(short_docs[2])]
# Example 4
    def test_read_from_file(self):
        """Check ConllCorefReader output on the conll_2012 fixture.

        Verifies the tokenised text, the candidate spans, and the exact list of
        gold mentions with their cluster ids; then re-reads with
        ``max_span_width=2`` and verifies that wider gold spans are dropped.

        NOTE(review): this variant uses an older reader API — ``read`` returns
        a dataset object with an ``.instances`` attribute, and span endpoints
        live in separate "span_starts"/"span_ends" fields — confirm it matches
        the library version under test.
        """

        conll_reader = ConllCorefReader(max_span_width=self.span_width)
        dataset = conll_reader.read('tests/fixtures/conll_2012/')

        assert len(dataset.instances) == 1

        instances = dataset.instances
        fields = instances[0].fields
        text = [x.text for x in fields["text"].tokens]

        assert text == [
            'Mali', 'government', 'officials', 'say', 'the', 'woman', "'s",
            'confession', 'was', 'forced', '.', 'The', 'prosecution', 'rested',
            'its', 'case', 'last', 'month', 'after', 'four', 'months', 'of',
            'hearings', '.', 'Denise', 'Dillon', 'Headline', 'News', '.'
        ]

        span_starts = fields["span_starts"].field_list
        span_ends = fields["span_ends"].field_list

        candidate_mentions = self.check_candidate_mentions_are_well_defined(
            span_starts, span_ends, text)

        gold_span_labels = fields["span_labels"]
        # A label of -1 marks a candidate span that is not a gold mention.
        gold_indices_with_ids = [(i, x)
                                 for i, x in enumerate(gold_span_labels.labels)
                                 if x != -1]
        gold_mentions_with_ids: List[Tuple[List[str], int]] = [
            (candidate_mentions[i], x) for i, x in gold_indices_with_ids
        ]

        assert gold_mentions_with_ids == [
            (['the', 'woman', "'s"], 0),
            (['the', 'woman', "'s", 'confession'], 1),
            (['The', 'prosecution'], 2), (['its'], 2), (['Denise',
                                                         'Dillon'], 2)
        ]

        # Now check that we don't collect spans greater than the max width.
        conll_reader = ConllCorefReader(max_span_width=2)
        dataset = conll_reader.read('tests/fixtures/conll_2012/')

        instances = dataset.instances
        fields = instances[0].fields
        text = [x.text for x in fields["text"].tokens]
        span_starts = fields["span_starts"].field_list
        span_ends = fields["span_ends"].field_list

        candidate_mentions = self.check_candidate_mentions_are_well_defined(
            span_starts, span_ends, text)

        gold_span_labels = fields["span_labels"]
        gold_indices_with_ids = [(i, x)
                                 for i, x in enumerate(gold_span_labels.labels)
                                 if x != -1]
        gold_mentions_with_ids: List[Tuple[List[str], int]] = [
            (candidate_mentions[i], x) for i, x in gold_indices_with_ids
        ]

        # The three-token mentions from clusters 0 and 1 exceed width 2, so
        # only the two-token-or-shorter mentions of cluster 2 remain.
        assert gold_mentions_with_ids == [(['The', 'prosecution'], 2),
                                          (['its'], 2),
                                          (['Denise', 'Dillon'], 2)]
# Example 5
    def test_wordpiece_modeling(self):
        """Check ConllCorefReader when a wordpiece tokenizer re-splits the text.

        With a BERT tokenizer supplied as ``wordpiece_modeling_tokenizer``, the
        reader emits wordpiece tokens (with [CLS]/[SEP]) and all span indices
        refer to wordpiece positions; spans that grow past ``max_span_width``
        after wordpiece splitting must be dropped.
        """
        tokenizer = PretrainedTransformerTokenizer("bert-base-cased")
        conll_reader = ConllCorefReader(
            max_span_width=self.span_width, wordpiece_modeling_tokenizer=tokenizer
        )
        instances = ensure_list(
            conll_reader.read(str(AllenNlpTestCase.FIXTURES_ROOT / "coref" / "coref.gold_conll"))
        )

        assert len(instances) == 4

        fields = instances[3].fields
        text = [x.text for x in fields["text"].tokens]

        # Expected wordpiece sequence, including the special tokens and
        # sub-word pieces such as "Wet"/"##land" and "SA"/"##R".
        assert text == [
            "[CLS]",
            "Hong",
            "Kong",
            "Wet",
            "##land",
            "Park",
            ",",
            "which",
            "is",
            "currently",
            "under",
            "construction",
            ",",
            "is",
            "also",
            "one",
            "of",
            "the",
            "designated",
            "new",
            "projects",
            "of",
            "the",
            "Hong",
            "Kong",
            "SA",
            "##R",
            "government",
            "for",
            "advancing",
            "the",
            "Hong",
            "Kong",
            "tourism",
            "industry",
            ".",
            "[SEP]",
        ]

        spans = fields["spans"].field_list
        span_starts, span_ends = zip(*[(field.span_start, field.span_end) for field in spans])

        candidate_mentions = self.check_candidate_mentions_are_well_defined(
            span_starts, span_ends, text
        )

        # Asserts special tokens aren't included in the spans
        assert all(span_start > 0 for span_start in span_starts)
        assert all(span_end < len(text) - 1 for span_end in span_ends)

        gold_span_labels = fields["span_labels"]
        # A label of -1 marks a candidate span that is not a gold mention.
        gold_indices_with_ids = [(i, x) for i, x in enumerate(gold_span_labels.labels) if x != -1]
        gold_mentions_with_ids: List[Tuple[List[str], int]] = [
            (candidate_mentions[i], x) for i, x in gold_indices_with_ids
        ]

        assert (["Hong", "Kong"], 0) in gold_mentions_with_ids
        # Within span_width before wordpiece splitting but exceeds afterwards
        assert (["the", "Hong", "Kong", "SA", "##R", "government"], 0) not in gold_mentions_with_ids

        fields = instances[1].fields
        text = [x.text for x in fields["text"].tokens]
        spans = fields["spans"].field_list
        span_starts, span_ends = zip(*[(field.span_start, field.span_end) for field in spans])
        candidate_mentions = self.check_candidate_mentions_are_well_defined(
            span_starts, span_ends, text
        )

        gold_span_labels = fields["span_labels"]
        gold_indices_with_ids = [(i, x) for i, x in enumerate(gold_span_labels.labels) if x != -1]
        gold_mentions_with_ids: List[Tuple[List[str], int]] = [
            (candidate_mentions[i], x) for i, x in gold_indices_with_ids
        ]

        # Prior to wordpiece tokenization, 's was one token; wordpiece tokenization splits it into 2
        assert (["the", "city", "'", "s"], 0) in gold_mentions_with_ids
# Example 6
    def test_read_from_file(self, lazy):
        """Smoke-test ConllCorefReader on the 4-document coref fixture.

        Checks the tokenised text, the enumerated candidate spans, and the
        gold cluster ids for the first and third documents.

        NOTE(review): assumes ``lazy`` is supplied by pytest parametrisation
        and ``self.span_width`` by the test-class setup — confirm in the
        surrounding harness.
        """
        conll_reader = ConllCorefReader(max_span_width=self.span_width, lazy=lazy)
        instances = ensure_list(
            conll_reader.read(str(AllenNlpTestCase.FIXTURES_ROOT / "coref" / "coref.gold_conll"))
        )

        assert len(instances) == 4

        fields = instances[0].fields
        text = [x.text for x in fields["text"].tokens]

        assert text == [
            "In",
            "the",
            "summer",
            "of",
            "2005",
            ",",
            "a",
            "picture",
            "that",
            "people",
            "have",
            "long",
            "been",
            "looking",
            "forward",
            "to",
            "started",
            "emerging",
            "with",
            "frequency",
            "in",
            "various",
            "major",
            "Hong",
            "Kong",
            "media",
            ".",
            "With",
            "their",
            "unique",
            "charm",
            ",",
            "these",
            "well",
            "-",
            "known",
            "cartoon",
            "images",
            "once",
            "again",
            "caused",
            "Hong",
            "Kong",
            "to",
            "be",
            "a",
            "focus",
            "of",
            "worldwide",
            "attention",
            ".",
            "The",
            "world",
            "'s",
            "fifth",
            "Disney",
            "park",
            "will",
            "soon",
            "open",
            "to",
            "the",
            "public",
            "here",
            ".",
        ]

        spans = fields["spans"].field_list
        span_starts, span_ends = zip(*[(field.span_start, field.span_end) for field in spans])

        candidate_mentions = self.check_candidate_mentions_are_well_defined(
            span_starts, span_ends, text
        )

        gold_span_labels = fields["span_labels"]
        # A label of -1 marks a candidate span that is not a gold mention.
        gold_indices_with_ids = [(i, x) for i, x in enumerate(gold_span_labels.labels) if x != -1]
        gold_mentions_with_ids: List[Tuple[List[str], int]] = [
            (candidate_mentions[i], x) for i, x in gold_indices_with_ids
        ]

        assert (["Hong", "Kong"], 0) in gold_mentions_with_ids
        # "Hong Kong" occurs twice in cluster 0 in this document; removing one
        # occurrence must still leave another behind.
        gold_mentions_with_ids.remove((["Hong", "Kong"], 0))
        assert (["Hong", "Kong"], 0) in gold_mentions_with_ids
        assert (["their"], 1) in gold_mentions_with_ids
        # This is a span which exceeds our max_span_width, so it should not be considered.
        assert (
            ["these", "well", "-", "known", "cartoon", "images"],
            1,
        ) not in gold_mentions_with_ids

        fields = instances[2].fields
        text = [x.text for x in fields["text"].tokens]
        assert text == [
            "The",
            "area",
            "of",
            "Hong",
            "Kong",
            "is",
            "only",
            "one",
            "thousand",
            "-",
            "plus",
            "square",
            "kilometers",
            ".",
            "The",
            "population",
            "is",
            "dense",
            ".",
            "Natural",
            "resources",
            "are",
            "relatively",
            "scarce",
            ".",
            "However",
            ",",
            "the",
            "clever",
            "Hong",
            "Kong",
            "people",
            "will",
            "utilize",
            "all",
            "resources",
            "they",
            "have",
            "created",
            "for",
            "developing",
            "the",
            "Hong",
            "Kong",
            "tourism",
            "industry",
            ".",
        ]

        spans = fields["spans"].field_list
        span_starts, span_ends = zip(*[(field.span_start, field.span_end) for field in spans])

        candidate_mentions = self.check_candidate_mentions_are_well_defined(
            span_starts, span_ends, text
        )

        gold_span_labels = fields["span_labels"]
        gold_indices_with_ids = [(i, x) for i, x in enumerate(gold_span_labels.labels) if x != -1]
        gold_mentions_with_ids: List[Tuple[List[str], int]] = [
            (candidate_mentions[i], x) for i, x in gold_indices_with_ids
        ]

        assert (["Hong", "Kong"], 0) in gold_mentions_with_ids
        gold_mentions_with_ids.remove((["Hong", "Kong"], 0))
        assert (["Hong", "Kong"], 0) in gold_mentions_with_ids
        assert (["they"], 1) in gold_mentions_with_ids
        assert (["the", "clever", "Hong", "Kong", "people"], 1) in gold_mentions_with_ids