示例#1
0
    def test_spacy_tokenizer(self, lang: str):
        '''
        Checks the tokens produced by the spaCy tokenizer for the given 
        language on the fixture sentences of this test class.

        :param lang: Language code given to `spacy_tokenizer`. The code `nn` 
                     is expected to make `spacy_tokenizer` raise a ValueError.
        '''
        # Language expected to be rejected: only the construction is tested.
        if lang == 'nn':
            with pytest.raises(ValueError):
                spacy_tokenizer(lang=lang)
            return

        tokenise = spacy_tokenizer(lang=lang)

        # Only the English tokenizer is expected to split the possessive 's
        # from the word it is attached to.
        if lang == 'en':
            expected_emoji = [
                'Hello', 'how', 'are', 'you', ',', 'with', "other", "'s", ':)'
            ]
        else:
            expected_emoji = [
                'Hello', 'how', 'are', 'you', ',', 'with', "other's", ':)'
            ]
        assert tokenise(self._emoji_sentence()) == expected_emoji

        # An empty string should produce no tokens
        assert tokenise(self._no_sentence()) == []

        # Single and repeated whitespace should produce the same tokens
        expected_words = ['another', 'day', 'is', 'today']
        assert tokenise(self._whitespace_sentence()) == expected_words
        assert tokenise(self._serveral_whitespace()) == expected_words

        # The comma should become a token of its own
        expected_comma = ['today', 'Is', 'a', 'great', ',', 'day', 'I', 'think']
        assert tokenise(self._comma_sentence()) == expected_comma
示例#2
0
    def test_sequence_labels(self):
        '''
        Checks the `sequence_labels` stored on the samples of a collection 
        after it has been tokenized with the spaCy tokenizer, for a 
        collection built from one sample and one built from several samples.
        '''
        # Collection made up of a single TargetText
        single_collection = TargetTextCollection([self._target_text_example()])
        single_collection.tokenize(spacy_tokenizer())
        single_collection.sequence_labels()
        expected_single = ['O', 'B', 'I', 'O', 'O', 'O', 'B', 'O', 'O']
        assert single_collection['2']['sequence_labels'] == expected_single

        # Collection made up of multiple TargetText instances
        multi_collection = TargetTextCollection(self._target_text_examples())
        multi_collection.tokenize(spacy_tokenizer())
        multi_collection.sequence_labels()
        expected_multi = ['O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'O']
        labelled_sample = multi_collection['another_id']
        assert labelled_sample['sequence_labels'] == expected_multi
示例#3
0
def dataset_target_sentiment_statistics(collections: List[TargetTextCollection],
                                        lower_target: bool = True,
                                        target_key: str = 'targets',
                                        tokeniser: Callable[[str], List[str]]=spacy_tokenizer(),
                                        sentiment_key: str = 'target_sentiments',
                                        dataframe_format: bool = False,
                                        incl_sentence_statistics: bool = True
                                        ) -> Union[List[Dict[str, Union[str,int,float]]], 
                                                   pd.DataFrame]:
    '''
    Adds the count and percentage of positive, neutral and negative targets 
    to the statistics created by 
    :py:func:`dataset_target_extraction_statistics` for each collection.

    :param collections: A list of collections
    :param lower_target: Whether to lower case the targets before counting them
    :param target_key: The key within each sample in each collection that 
                       contains the list of targets to be analysed. This can 
                       also be the predicted target key, which might be 
                       useful for error analysis.
    :param tokeniser: The tokenizer to use to split the target(s) into 
                      tokens. See :py:mod:`target_extraction.tokenizers` for 
                      a module of compatible tokenizers. This is required 
                      to give statistics on target length.
    :param sentiment_key: The key in each TargetText within each collection 
                          that contains the True sentiment value.
    :param dataframe_format: If True instead of a list of dictionaries the 
                             return will be a pandas dataframe
    :param incl_sentence_statistics: If False statistics about the sentence
                                     will not be included. This is so that 
                                     the statistics can still be created for 
                                     datasets that have been anonymised.
    :returns: A list of dictionaries each containing the statistics for the 
              associated collection. Each dictionary will have the keys from 
              :py:func:`dataset_target_extraction_statistics` and the 
              following in addition:
              1. POS (%) -- Number (Percentage) of positive targets
              2. NEU (%) -- Number (Percentage) of neutral targets
              3. NEG (%) -- Number (Percentage) of Negative targets
    '''
    # (statistic name, sentiment label) in the order the statistics are added
    statistic_labels = (('POS (%)', 'positive'),
                        ('NEU (%)', 'neutral'),
                        ('NEG (%)', 'negative'))
    # Always requested as a list of dictionaries so that the sentiment
    # statistics can be added before any dataframe conversion.
    extraction_stats = dataset_target_extraction_statistics(
        collections,
        lower_target=lower_target,
        target_key=target_key,
        tokeniser=tokeniser,
        dataframe_format=False,
        incl_sentence_statistics=incl_sentence_statistics)

    dataset_stats = []
    for collection, stats in zip(collections, extraction_stats):
        fractions = get_sentiment_counts(collection, normalised=True,
                                         sentiment_key=sentiment_key)
        counts = get_sentiment_counts(collection, normalised=False,
                                      sentiment_key=sentiment_key)
        for statistic_name, label in statistic_labels:
            percentage = round((fractions[label] * 100), 2)
            stats[statistic_name] = f'{counts[label]} ({percentage})'
        dataset_stats.append(stats)

    if not dataframe_format:
        return dataset_stats
    return _statistics_to_dataframe(dataset_stats)
示例#4
0
    def test_tokenize(self):
        '''
        Checks that `TargetTextCollection.tokenize` stores the tokens of each 
        sample under its `tokenized_text` key, for a collection with one 
        sample and for a collection with multiple samples.
        '''
        # Test the normal case with one TargetText Instance in the collection
        test_collection = TargetTextCollection([self._target_text_example()])
        test_collection.tokenize(str.split)
        tokenized_answer = [
            'The', 'laptop', 'case', 'was', 'great', 'and', 'cover', 'was',
            'rubbish'
        ]
        # This has to be an assertion: assigning the answer to the key would
        # overwrite the tokenizer output and the test could never fail.
        assert test_collection['2']['tokenized_text'] == tokenized_answer

        # Test the normal case with multiple TargetText Instance in the
        # collection
        test_collection = TargetTextCollection(self._target_text_examples())
        test_collection.tokenize(spacy_tokenizer())
        assert test_collection['2']['tokenized_text'] == tokenized_answer
示例#5
0
def multi_word_targets(targets: List[str],
                       lower: bool = True,
                       string_delimiter: str = '_') -> Dict[str, str]:
    '''
    Maps each target to a single word version of itself, created by joining 
    the target's spaCy tokens with `string_delimiter`.

    :param targets: A list of targets where multi word targets will have 
                    their whitespace replaced with `string_delimiter` to 
                    create a single word target. The spaCy tokenizer 
                    determines the words of a target. The tokenisation 
                    happens before lower casing the target when applicable. 
                    Furthermore if a target when tokenised is the same as 
                    an earlier target, the later target is not included, to 
                    avoid multiple targets mapping to one multi word target.
    :param lower: Whether to lower case the target words.
    :param string_delimiter: The string to be used to join the target words 
                             together after they have been tokenised by the 
                             spaCy tokenizer.
    :returns: A dictionary of the original targets (lower cased when `lower` 
              is True) mapped to their single word version e.g. 
              {`tesco supermarket`: `tesco_supermarket`}
    '''
    tokenizer = spacy_tokenizer()
    target_mapper = {}
    unique_targets = set()
    seen_joined_targets = set()
    for target in targets:
        joined_target = string_delimiter.join(tokenizer(target))
        mapper_key = target
        if lower:
            joined_target = joined_target.lower()
            mapper_key = target.lower()
        # Targets that only differ until they are tokenized are skipped so
        # that each joined target is mapped to by one target only.
        if joined_target in seen_joined_targets:
            continue
        seen_joined_targets.add(joined_target)
        unique_targets.add(mapper_key)
        target_mapper[mapper_key] = joined_target

    assert_err = ('The length of the multi word targets is not the same '
                  'as the non-multi-word targets')
    assert len(unique_targets) == len(target_mapper), assert_err
    return target_mapper
示例#6
0
        temp_election_directory = Path('.', 'data', 'twitter_election_dataset')
        train_data = wang_2017_election_twitter_train(temp_election_directory)
        test_data = wang_2017_election_twitter_test(temp_election_directory)

    if not args.model_save_dir.is_dir():
        # Use the same size validation as the test data
        test_size = len(test_data)
        # Create the train and validation splits
        train_data = list(train_data.values())
        train_data, val_data = train_test_split(train_data,
                                                test_size=test_size)
        train_data = TargetTextCollection(train_data)
        val_data = TargetTextCollection(val_data)
        # Tokenize the data
        datasets = [train_data, val_data, test_data]
        tokenizer = spacy_tokenizer()

        sizes = []
        target_sizes = []
        for dataset in datasets:
            dataset.tokenize(tokenizer)
            returned_errors = dataset.sequence_labels(return_errors=True)
            if returned_errors:
                for error in returned_errors:
                    error_id = error['text_id']
                    del dataset[error_id]
            returned_errors = dataset.sequence_labels(return_errors=True)
            if returned_errors:
                raise ValueError('Sequence label errors are still persisting')
            sizes.append(len(dataset))
            dataset: TargetTextCollection
示例#7
0
class TestTokenizers:
    '''
    Tests for the whitespace, spaCy, Ark Twokenize and Stanford tokenizers 
    as well as for `is_character_preserving` and `token_index_alignment`.
    '''

    def _emoji_sentence(self) -> str:
        '''
        :returns: A sentence that contains a comma, a possessive and an 
                  emoticon.
        '''
        return "Hello how are you, with other's :)"

    def _no_sentence(self) -> str:
        '''
        :returns: The empty string.
        '''
        return ''

    def _whitespace_sentence(self) -> str:
        '''
        :returns: A sentence whose words are separated by single spaces only.
        '''
        return 'another day is today'

    def _serveral_whitespace(self) -> str:
        '''
        :returns: A sentence with leading, trailing and repeated spaces.
        '''
        return '   another    day is today   '

    def _comma_sentence(self) -> str:
        '''
        :returns: A sentence with a comma attached to one of its words.
        '''
        return 'today Is a great, day I think'

    def _difficult_tokenizer_sentence(self) -> str:
        '''
        :returns: A sentence with a double space and an unclosed bracket.
        '''
        return "But guess what?  (you have to buy an external dvd drive."

    def not_char_preserving_tokenizer(self, text: str) -> List[str]:
        '''
        :param text: The text to tokenize.
        :returns: The whitespace tokens of the text where the token 
                  `other's` is replaced by `other`, thus the tokens no 
                  longer contain all of the characters of the text.
        '''
        return ['other' if token == "other's" else token
                for token in text.split()]

    # This is bad coding practice: for the str.split entry, which carries the
    # False value, the test method replaces str.split with
    # not_char_preserving_tokenizer
    @pytest.mark.parametrize("tokenizer_pass",
                             ((whitespace(), True), (spacy_tokenizer(), True),
                              (ark_twokenize(), True), (stanford(), True),
                              (str.split, False)))
    def test_is_character_preserving(
            self, tokenizer_pass: Tuple[Callable[[str], List[str]], bool]):
        '''
        :param tokenizer_pass: Tuple of a tokenizer and whether or not the 
                               tokens checked for that tokenizer are expected 
                               to be character preserving.
        '''
        tokenizer, should_preserve = tokenizer_pass
        emoji_text = self._emoji_sentence()
        emoji_tokens = tokenizer(emoji_text)
        if not should_preserve:
            emoji_tokens = self.not_char_preserving_tokenizer(emoji_text)
        preserved = is_character_preserving(emoji_text, emoji_tokens)
        assert preserved == should_preserve

        if not should_preserve:
            return
        difficult_text = self._difficult_tokenizer_sentence()
        difficult_tokens = tokenizer(difficult_text)
        assert is_character_preserving(difficult_text,
                                       difficult_tokens) == True

    def test_whitespace(self):
        '''
        Checks the tokens of the whitespace tokenizer on the fixture 
        sentences.
        '''
        whitespace_tokenizer = whitespace()
        expected_words = ['another', 'day', 'is', 'today']
        # (sentence, expected tokens)
        cases = [
            (self._emoji_sentence(),
             ['Hello', 'how', 'are', 'you,', 'with', "other's", ':)']),
            (self._no_sentence(), []),
            (self._whitespace_sentence(), expected_words),
            (self._comma_sentence(),
             ['today', 'Is', 'a', 'great,', 'day', 'I', 'think']),
            (self._serveral_whitespace(), expected_words),
        ]
        for sentence, expected_tokens in cases:
            assert whitespace_tokenizer(sentence) == expected_tokens

    def test_ark_twokenizer(self):
        '''
        Checks the tokens of the Ark Twokenize tokenizer on the fixture 
        sentences.
        '''
        tokenizer = ark_twokenize()
        expected_words = ['another', 'day', 'is', 'today']
        # (sentence, expected tokens)
        cases = [
            (self._emoji_sentence(),
             ['Hello', 'how', 'are', 'you', ',', 'with', "other's", ':)']),
            (self._no_sentence(), []),
            (self._whitespace_sentence(), expected_words),
            (self._comma_sentence(),
             ['today', 'Is', 'a', 'great', ',', 'day', 'I', 'think']),
            (self._serveral_whitespace(), expected_words),
        ]
        for sentence, expected_tokens in cases:
            assert tokenizer(sentence) == expected_tokens

    @pytest.mark.parametrize("lang", ('en', 'de', 'nn'))
    def test_spacy_tokenizer(self, lang: str):
        '''
        :param lang: Language code given to `spacy_tokenizer`. The code `nn` 
                     is expected to make `spacy_tokenizer` raise a ValueError.
        '''
        if lang == 'nn':
            with pytest.raises(ValueError):
                spacy_tokenizer(lang=lang)
            return

        spacy_tok = spacy_tokenizer(lang=lang)
        # Only the English tokenizer is expected to split the possessive 's
        # from the word it is attached to.
        if lang == 'en':
            expected_emoji = [
                'Hello', 'how', 'are', 'you', ',', 'with', "other", "'s", ':)'
            ]
        else:
            expected_emoji = [
                'Hello', 'how', 'are', 'you', ',', 'with', "other's", ':)'
            ]
        expected_words = ['another', 'day', 'is', 'today']
        # (sentence, expected tokens)
        cases = [
            (self._emoji_sentence(), expected_emoji),
            (self._no_sentence(), []),
            (self._whitespace_sentence(), expected_words),
            (self._serveral_whitespace(), expected_words),
            (self._comma_sentence(),
             ['today', 'Is', 'a', 'great', ',', 'day', 'I', 'think']),
        ]
        for sentence, expected_tokens in cases:
            assert spacy_tok(sentence) == expected_tokens

    @pytest.mark.parametrize("lang", ('en', 'de'))
    @pytest.mark.parametrize("treebank", (None, 'ewt', 'gum'))
    def test_stanford_tokenizer(self, lang: str, treebank: str):
        '''
        This does not really currently test if the treebanks perform as they 
        should i.e. we do not currently test that the English EWT treebank
        tokeniser is any different to the English GUM tokeniser.

        :param lang: Language code given to `stanford`.
        :param treebank: Treebank name given to `stanford`, or None.
        '''
        # The named treebanks are only tested with English, a German
        # tokenizer with a named treebank is not constructed.
        if lang == 'de' and treebank is not None:
            return

        tokenizer = stanford(lang=lang, treebank=treebank)

        emoji_tokens = tokenizer(self._emoji_sentence())
        emoji_ans = ['Hello', 'how', 'are', 'you', ',', 'with', "other", "'s"]
        # The German tokenizer is expected to split the emoticon in two
        emoji_ans += [':', ')'] if lang == 'de' else [':)']
        assert emoji_tokens == emoji_ans

        expected_words = ['another', 'day', 'is', 'today']
        # (sentence, expected tokens)
        cases = [
            (self._no_sentence(), []),
            (self._whitespace_sentence(), expected_words),
            (self._serveral_whitespace(), expected_words),
            (self._comma_sentence(),
             ['today', 'Is', 'a', 'great', ',', 'day', 'I', 'think']),
        ]
        for sentence, expected_tokens in cases:
            assert tokenizer(sentence) == expected_tokens

    @pytest.mark.parametrize(
        "tokenizer",
        (whitespace(), spacy_tokenizer(), stanford(), ark_twokenize()))
    def test_token_index_alignment(self, tokenizer: Callable[[str],
                                                             List[str]]):
        '''
        :param tokenizer: The tokenizer whose tokens are aligned back to the 
                          text they came from.
        '''

        def assert_alignment(text: str,
                             expected_indexs: List[Tuple[int, int]]) -> None:
            # Tokenizes the text and compares the aligned character offsets
            aligned_indexs = token_index_alignment(text, tokenizer(text))
            assert expected_indexs == aligned_indexs

        # Test a sentence where whitespace will be the only factor
        assert_alignment(self._whitespace_sentence(),
                         [(0, 7), (8, 11), (12, 14), (15, 20)])

        # The whitespace tokenizer keeps punctuation attached to the words,
        # therefore it has its own expected offsets below.
        is_whitespace = tokenizer == whitespace()

        # Test a sentence where we have a comma which will cause extra
        # whitespace on the tokenization side
        if is_whitespace:
            comma_indexs = [(0, 5), (6, 8), (9, 10), (11, 17), (18, 21),
                            (22, 23), (24, 29)]
        else:
            comma_indexs = [(0, 5), (6, 8), (9, 10), (11, 16), (16, 17),
                            (18, 21), (22, 23), (24, 29)]
        assert_alignment(self._comma_sentence(), comma_indexs)

        # Test a sentence where we have multiple spaces in the text at the
        # start, end and in between tokens
        if is_whitespace:
            spaced_indexs = [(2, 3), (4, 8), (11, 16), (17, 20)]
        else:
            spaced_indexs = [(2, 3), (4, 7), (7, 8), (11, 16), (17, 20)]
        assert_alignment('  I had,   great day  ', spaced_indexs)

        # Test a sentence that has multiple space commas hyphens etc.
        if tokenizer == ark_twokenize():
            mixed_indexs = [(2, 3), (4, 7), (7, 8), (10, 15), (17, 22),
                            (23, 26), (28, 35), (35, 36)]
        elif is_whitespace:
            mixed_indexs = [(2, 3), (4, 8), (10, 15), (17, 22), (23, 26),
                            (28, 36)]
        else:
            mixed_indexs = [(2, 3), (4, 7), (7, 8), (10, 12), (12, 15),
                            (17, 22), (23, 26), (28, 32), (32, 35), (35, 36)]
        assert_alignment("  I had,  isn't  great day  doesn't'", mixed_indexs)
# Directory that all of the English main task datasets are written below
sentiment_data_dir = Path('.') / 'data' / 'main_task' / 'en'
laptop_data_dir = sentiment_data_dir / 'laptop'
restaurant_data_dir = sentiment_data_dir / 'restaurant'

# The URLs of each dataset are paired, in order, with these file names
common_file_names = ['train.conll', 'dev.conll', 'test.conll']
data_dir_urls = [(restaurant_data_dir, restaurant_urls),
                 (laptop_data_dir, laptop_urls)]
# Fetch each split (through `cached_path`) and write it converted from BIOSE
# to BIOUL into the data directory of its dataset.
for data_dir, urls in data_dir_urls:
    for url, file_name in zip(urls, common_file_names):
        downloaded_fp = cached_path(url)
        new_fp = data_dir / file_name
        new_fp.parent.mkdir(parents=True, exist_ok=True)
        utils.from_biose_to_bioul(Path(downloaded_fp), new_fp)

mams_data_dir = sentiment_data_dir / 'MAMS'
mams_data_dir.mkdir(parents=True, exist_ok=True)
split_names = ['train', 'val', 'test']
for split_name, file_name in zip(split_names, common_file_names):
    # Only the train split is loaded with `original=False`
    if split_name != 'train':
        collection = multi_aspect_multi_sentiment_atsa(split_name)
    else:
        collection = multi_aspect_multi_sentiment_atsa(split_name,
                                                       original=False)
    collection.tokenize(spacy_tokenizer())
    collection.sequence_labels(label_key='target_sentiments')
    conll_fp = mams_data_dir / file_name
    # The collection is first written as a BIO CoNLL file to a temporary
    # directory and then converted to BIOUL at its final location.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_fp = Path(temp_dir) / 'temp_file.conll'
        collection.to_conll_file(temp_fp, gold_label_key='sequence_labels')
        utils.from_bio_to_bioul(temp_fp, conll_fp)
示例#9
0
            temp_election_directory = Path('/tmp/election_dataset_dir')
            train_data = wang_2017_election_twitter_train(
                temp_election_directory)
            test_data = wang_2017_election_twitter_test(
                temp_election_directory)
        # Use the same size validation as the test data
        test_size = len(test_data)
        # Create the train and validation splits
        train_data = list(train_data.values())
        train_data, val_data = train_test_split(train_data,
                                                test_size=test_size)
        train_data = TargetTextCollection(train_data)
        val_data = TargetTextCollection(val_data)
        # Tokenize the data
        datasets = [train_data, val_data, test_data]
        tokenizer = spacy_tokenizer()

        sizes = []
        for dataset in datasets:
            dataset.tokenize(tokenizer)
            returned_errors = dataset.sequence_labels(return_errors=True)
            if returned_errors:
                for error in returned_errors:
                    error_id = error['text_id']
                    del dataset[error_id]
            returned_errors = dataset.sequence_labels(return_errors=True)
            if returned_errors:
                raise ValueError('Sequence label errors are still persisting')
            sizes.append(len(dataset))
        print(
            f'Lengths Train: {sizes[0]}, Validation: {sizes[1]}, Test: {sizes[2]}'
示例#10
0
    def test_exact_match_score(self):
        '''
        Tests `TargetTextCollection.exact_match_score`, which in this test is 
        unpacked as (recall, precision, f1, error_analysis) where 
        error_analysis is indexed by the keys `FP`, `FN` and `TP`, each a 
        list of (text id, Span) tuples.

        The scenarios are run in sequence and several of them re-use and 
        mutate the same collection, so their order matters.
        '''
        # Simple case where it should get perfect score
        test_collection = TargetTextCollection([self._target_text_example()])
        test_collection.tokenize(spacy_tokenizer())
        test_collection.sequence_labels()
        measures = test_collection.exact_match_score('sequence_labels')
        # Index 3 is the error analysis dictionary, the first three values
        # are the scores.
        for index, measure in enumerate(measures):
            if index == 3:
                assert measure['FP'] == []
                assert measure['FN'] == []
                assert measure['TP'] == [('2', Span(4, 15)),
                                         ('2', Span(30, 35))]
            else:
                assert measure == 1.0

        # Something that has perfect precision but misses one therefore does
        # not have perfect recall nor f1
        test_collection = TargetTextCollection(
            self._target_text_measure_examples())
        test_collection.tokenize(str.split)
        # text = 'The laptop case was great and cover was rubbish'
        sequence_labels_0 = ['O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'O']
        test_collection['0']['sequence_labels'] = sequence_labels_0
        # text = 'The laptop price was awful'
        sequence_labels_1 = ['O', 'B', 'I', 'O', 'O']
        test_collection['1']['sequence_labels'] = sequence_labels_1
        recall, precision, f1, error_analysis = test_collection.exact_match_score(
            'sequence_labels')
        assert precision == 1.0
        assert recall == 2.0 / 3.0
        assert f1 == 0.8
        assert error_analysis['FP'] == []
        assert error_analysis['FN'] == [('0', Span(4, 15))]
        assert error_analysis['TP'] == [('0', Span(30, 35)), ('1', Span(4,
                                                                        16))]

        # Something that has perfect recall but not precision as it over
        # predicts
        sequence_labels_0 = ['O', 'B', 'I', 'B', 'O', 'O', 'B', 'O', 'O']
        test_collection['0']['sequence_labels'] = sequence_labels_0
        sequence_labels_1 = ['O', 'B', 'I', 'O', 'O']
        test_collection['1']['sequence_labels'] = sequence_labels_1
        recall, precision, f1, error_analysis = test_collection.exact_match_score(
            'sequence_labels')
        assert precision == 3 / 4
        assert recall == 1.0
        assert round(f1, 3) == 0.857
        assert error_analysis['FP'] == [('0', Span(16, 19))]
        assert error_analysis['FN'] == []
        assert error_analysis['TP'] == [('0', Span(4, 15)), ('0', Span(30,
                                                                       35)),
                                        ('1', Span(4, 16))]

        # Does not predict anything for a whole sentence therefore will have
        # perfect precision but bad recall (mainly testing whether not
        # getting anything for a sentence matters)
        sequence_labels_0 = ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
        test_collection['0']['sequence_labels'] = sequence_labels_0
        sequence_labels_1 = ['O', 'B', 'I', 'O', 'O']
        test_collection['1']['sequence_labels'] = sequence_labels_1
        recall, precision, f1, error_analysis = test_collection.exact_match_score(
            'sequence_labels')
        assert precision == 1.0
        assert recall == 1 / 3
        assert f1 == 0.5
        assert error_analysis['FP'] == []
        # Sorted before comparing so the assertion does not depend on the
        # order of the returned list.
        fn_error = sorted(error_analysis['FN'], key=lambda x: x[1].start)
        assert fn_error == [('0', Span(4, 15)), ('0', Span(30, 35))]
        assert error_analysis['TP'] == [('1', Span(4, 16))]

        # Handle the edge case of not getting anything
        sequence_labels_0 = ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
        test_collection['0']['sequence_labels'] = sequence_labels_0
        sequence_labels_1 = ['O', 'O', 'O', 'O', 'O']
        test_collection['1']['sequence_labels'] = sequence_labels_1
        recall, precision, f1, error_analysis = test_collection.exact_match_score(
            'sequence_labels')
        assert precision == 0.0
        assert recall == 0.0
        assert f1 == 0.0
        assert error_analysis['FP'] == []
        fn_error = sorted(error_analysis['FN'], key=lambda x: x[1].start)
        assert fn_error == [('0', Span(4, 15)), ('1', Span(4, 16)),
                            ('0', Span(30, 35))]
        assert error_analysis['TP'] == []

        # The case where the tokens and the text do not align
        not_align_example = self._target_text_not_align_example()
        # text = 'The laptop case; was awful'
        sequence_labels_align = ['O', 'B', 'I', 'O', 'O']
        test_collection.add(not_align_example)
        test_collection.tokenize(str.split)
        test_collection['inf']['sequence_labels'] = sequence_labels_align
        sequence_labels_0 = ['O', 'B', 'I', 'O', 'O', 'O', 'B', 'O', 'O']
        test_collection['0']['sequence_labels'] = sequence_labels_0
        sequence_labels_1 = ['O', 'B', 'I', 'O', 'O']
        test_collection['1']['sequence_labels'] = sequence_labels_1
        recall, precision, f1, error_analysis = test_collection.exact_match_score(
            'sequence_labels')
        assert recall == 3 / 4
        assert precision == 3 / 4
        assert f1 == 0.75
        assert error_analysis['FP'] == [('inf', Span(4, 16))]
        assert error_analysis['FN'] == [('inf', Span(4, 15))]
        tp_error = sorted(error_analysis['TP'], key=lambda x: x[1].start)
        assert tp_error == [('0', Span(4, 15)), ('1', Span(4, 16)),
                            ('0', Span(30, 35))]

        # This time it can get a perfect score as the token alignment will be
        # perfect
        test_collection.tokenize(spacy_tokenizer())
        sequence_labels_align = ['O', 'B', 'I', 'O', 'O', 'O']
        test_collection['inf']['sequence_labels'] = sequence_labels_align
        recall, precision, f1, error_analysis = test_collection.exact_match_score(
            'sequence_labels')
        assert recall == 1.0
        assert precision == 1.0
        assert f1 == 1.0
        assert error_analysis['FP'] == []
        assert error_analysis['FN'] == []
        tp_error = sorted(error_analysis['TP'], key=lambda x: x[1].end)
        assert tp_error == [('0', Span(4, 15)), ('inf', Span(4, 15)),
                            ('1', Span(4, 16)), ('0', Span(30, 35))]

        # Handle the case where one of the samples has no spans
        test_example = TargetText(text="I've had a bad day", text_id='50')
        other_examples = self._target_text_measure_examples()
        other_examples.append(test_example)
        test_collection = TargetTextCollection(other_examples)
        test_collection.tokenize(str.split)
        test_collection.sequence_labels()
        measures = test_collection.exact_match_score('sequence_labels')
        for index, measure in enumerate(measures):
            if index == 3:
                assert measure['FP'] == []
                assert measure['FN'] == []
                tp_error = sorted(measure['TP'], key=lambda x: x[1].end)
                assert tp_error == [('0', Span(4, 15)), ('1', Span(4, 16)),
                                    ('0', Span(30, 35))]
            else:
                assert measure == 1.0
        # Handle the case where one of the samples has no spans but it has
        # been predicted that there is a span there
        test_collection['50']['sequence_labels'] = ['B', 'I', 'O', 'O', 'O']
        recall, precision, f1, error_analysis = test_collection.exact_match_score(
            'sequence_labels')
        assert recall == 1.0
        assert precision == 3 / 4
        assert round(f1, 3) == 0.857
        assert error_analysis['FP'] == [('50', Span(start=0, end=8))]
        assert error_analysis['FN'] == []
        tp_error = sorted(error_analysis['TP'], key=lambda x: x[1].end)
        assert tp_error == [('0', Span(4, 15)), ('1', Span(4, 16)),
                            ('0', Span(30, 35))]
        # See if it can handle a collection that only contains no spans
        test_example = TargetText(text="I've had a bad day", text_id='50')
        test_collection = TargetTextCollection([test_example])
        test_collection.tokenize(str.split)
        test_collection.sequence_labels()
        measures = test_collection.exact_match_score('sequence_labels')
        for index, measure in enumerate(measures):
            if index == 3:
                assert measure['FP'] == []
                assert measure['FN'] == []
                assert measure['TP'] == []
            else:
                assert measure == 0.0
        # Handle the case where the collection contains no true spans but one
        # span has been predicted by mistake
        test_collection['50']['sequence_labels'] = ['B', 'I', 'O', 'O', 'O']
        measures = test_collection.exact_match_score('sequence_labels')
        for index, measure in enumerate(measures):
            if index == 3:
                assert measure['FP'] == [('50', Span(0, 8))]
                assert measure['FN'] == []
                assert measure['TP'] == []
            else:
                assert measure == 0.0
        # Should raise a KeyError if one of the TargetText instances does
        # not have a Span key
        del test_collection['50']._storage['spans']
        with pytest.raises(KeyError):
            test_collection.exact_match_score('sequence_labels')
        # Should raise a KeyError if one of the TargetText instances does
        # not have a predicted sequence key
        test_collection = TargetTextCollection([self._target_text_example()])
        test_collection.tokenize(spacy_tokenizer())
        test_collection.sequence_labels()
        with pytest.raises(KeyError):
            measures = test_collection.exact_match_score('nothing')

        # Should raise a ValueError if there are multiple same true spans
        a = TargetText(text='hello how are you I am good',
                       text_id='1',
                       targets=['hello', 'hello'],
                       spans=[Span(0, 5), Span(0, 5)])
        test_collection = TargetTextCollection([a])
        test_collection.tokenize(str.split)
        test_collection['1']['sequence_labels'] = [
            'B', 'O', 'O', 'O', 'O', 'O', 'O'
        ]
        with pytest.raises(ValueError):
            test_collection.exact_match_score('sequence_labels')
示例#11
0
def dataset_target_extraction_statistics(collections: List[TargetTextCollection],
                                         lower_target: bool = True,
                                         target_key: str = 'targets',
                                         tokeniser: Union[Callable[[str], List[str]], None] = None,
                                         dataframe_format: bool = False,
                                         incl_sentence_statistics: bool = True
                                         ) -> List[Dict[str, Union[str,int,float]]]:
    '''
    Computes target extraction statistics for each of the given collections.

    :param collections: A list of collections
    :param lower_target: Whether to lower case the targets before counting 
                         the unique targets.
    :param target_key: The key within each sample in each collection that contains 
                       the list of targets to be analysed. This can also be the 
                       predicted target key, which might be useful for error 
                       analysis. Within this function it is only passed on 
                       for the target length (`TL`) statistics.
    :param tokeniser: The tokeniser to use to split the target(s) into tokens. See 
                      for a module of compatible tokenisers 
                      :py:mod:`target_extraction.tokenizers`. This is required 
                      to give statistics on target length. If `None` (default) 
                      a `spacy_tokenizer()` is created when this function is 
                      called, rather than when the module is imported.
    :param dataframe_format: If True instead of a list of dictionaries the 
                             return will be a pandas dataframe
    :param incl_sentence_statistics: If False statistics about the sentence
                                     will not be included. This is so that 
                                     the statistics can still be created for 
                                     datasets that have been anonymised.
    :returns: A list of dictionaries each containing the statistics for the 
              associated collection (or, when `dataframe_format` is True, 
              the result of converting that list to a dataframe). Each 
              dictionary will have the following keys, all floats are 
              rounded to 2 decimal places:
              1. Name -- this comes from the collection's name attribute
              2. No. Sentences -- number of sentences in the collection
              3. No. Sentences(t) -- number of sentence that contain 
                 targets.
              4. No. Targets -- number of targets
              5. No. Uniq Targets -- number of unique targets
              6. ATS -- Average Target per Sentence (ATS)
              7. ATS(t) -- ATS but where all sentences in the collection must 
                 contain at least one target.
              8. TL 1 % -- Percentage of targets that are length 1 based on the 
                 number of tokens.
              9. TL 2 % -- Percentage of targets that are length 2 based on the 
                 number of tokens.
              10. TL 3+ % -- Percentage of targets that are length 3+ based on the 
                  number of tokens.
              11. Mean Sentence Length -- Mean sentence length based on the 
                  tokens provided by the `tokenized_text` key in each 
                  TargetText within the collections. If this key does not 
                  exist then the collection will be tokenized using the 
                  given tokeniser argument. Only present when 
                  `incl_sentence_statistics` is True.
              12. Mean Sentence Length(t) -- `Mean Sentence Length` but where 
                  all sentences in the collection must contain at least one 
                  target. Only present when `incl_sentence_statistics` is 
                  True.
    '''
    # Resolve the default here so that defining/importing this function does
    # not construct a tokeniser that the caller may never use.
    if tokeniser is None:
        tokeniser = spacy_tokenizer()

    dataset_stats: List[Dict[str, Union[str,int,float]]] = []
    for collection in collections:
        collection_stats: Dict[str, Union[str,int,float]] = {}
        collection_stats['Name'] = collection.name
        collection_stats['No. Sentences'] = len(collection)
        collection_stats['No. Sentences(t)'] = len(collection.samples_with_targets())
        collection_stats['No. Targets'] = collection.number_targets()
        collection_stats['No. Uniq Targets'] = len(collection.target_count(lower=lower_target))
        collection_stats['ATS'] = round(average_target_per_sentences(collection, False), 2)
        collection_stats['ATS(t)'] = round(average_target_per_sentences(collection, True), 2)

        # Normalised counts are fractions, hence the scaling by 100 below.
        target_lengths = tokens_per_target(collection, target_key, tokeniser, normalise=True)
        collection_stats['TL 1 %'] = round(target_lengths[1] * 100, 2)
        collection_stats['TL 2 %'] = round(target_lengths[2] * 100, 2)
        three_plus = sum(fraction for token_length, fraction in target_lengths.items()
                         if token_length > 2)
        collection_stats['TL 3+ %'] = round(three_plus * 100, 2)

        if incl_sentence_statistics:
            # The whole collection is processed before the targets only
            # subset, which keeps the key order of the returned dictionary.
            sentence_stat_names = [('Mean Sentence Length', False),
                                   ('Mean Sentence Length(t)', True)]
            for stat_name, samples_with_targets_only in sentence_stat_names:
                if samples_with_targets_only:
                    samples = collection.samples_with_targets()
                else:
                    samples = collection
                sentence_lengths = tokens_per_sentence(samples, tokeniser)
                # Expand the {length: count} mapping into one entry per
                # sentence so that the mean is weighted by the counts.
                sentence_lengths_flattened = []
                for length, count in sentence_lengths.items():
                    sentence_lengths_flattened.extend([length] * count)
                collection_stats[stat_name] = round(
                    statistics.mean(sentence_lengths_flattened), 2)
        dataset_stats.append(collection_stats)
    if dataframe_format:
        return _statistics_to_dataframe(dataset_stats)
    return dataset_stats
    def test_read_from_file(self, lazy: bool, left_right_contexts: bool,
                            reverse_right_context: bool, incl_target: bool,
                            target_sequences: bool, position_embeddings: bool,
                            max_position_distance: int,
                            position_weights: bool):
        '''
        Checks the construction of `TargetSentimentDatasetReader` and the 
        instances it reads from the JSON fixture files found in the 
        `data/allen/dataset_readers/target_sentiment` directory, which is 
        resolved relative to this test file.

        The test is made up of the following sections, in order:

        1. Invalid constructor argument combinations raise a ValueError.
        2. Target only data (`target_sentiments.json`), with the optional 
           left and right contexts.
        3. Category only data (`category_sentiments.json`).
        4. Target and category data (`target_category_sentiments.json`).
        5. Data read from `just_text.json`, which is expected to raise a 
           ValueError.
        6. The `target_sequences`, `position_embeddings`, `position_weights` 
           and `max_position_distance` options 
           (`target_sentiment_target_sequences.json`).
        7. A `max_position_distance` of 1 raises a ValueError.
        8. Target and category sentiment data 
           (`target_sentiments_category_sentiments.json`).

        NOTE(review): the arguments are presumably supplied through pytest 
        parametrisation that is not visible in this method -- confirm.

        :param lazy: Passed to every reader that is constructed.
        :param left_right_contexts: Reader option; when True the expected 
                                    left and right contexts are also compared.
        :param reverse_right_context: Reader option; changes the expected 
                                      right contexts.
        :param incl_target: Reader option; only used in the target only 
                            section, the other readers are created with 
                            `incl_target=False` or without the argument.
        :param target_sequences: Only used in sections 6 and 7.
        :param position_embeddings: Only used in sections 6 and 7.
        :param max_position_distance: Only used in section 6.
        :param position_weights: Only used in sections 6 and 7.
        '''
        # Test that a ValueError is raised if left_right_contexts is False
        # and incl_target is True
        with pytest.raises(ValueError):
            TargetSentimentDatasetReader(lazy=lazy,
                                         incl_target=True,
                                         left_right_contexts=False,
                                         use_categories=True)
        # Test that a ValueError is raised if left_right_contexts is False
        # and reverse_right_context is True
        with pytest.raises(ValueError):
            TargetSentimentDatasetReader(lazy=lazy,
                                         reverse_right_context=True,
                                         left_right_contexts=False,
                                         use_categories=True)
        # Stop ValueErrors from being raised: these are the two invalid
        # combinations checked above, so the rest of the test is skipped for
        # them.
        if reverse_right_context and not left_right_contexts:
            return
        if incl_target and not left_right_contexts:
            return
        reader = TargetSentimentDatasetReader(
            lazy=lazy,
            incl_target=incl_target,
            left_right_contexts=left_right_contexts,
            reverse_right_context=reverse_right_context,
            use_categories=True)
        data_dir = Path(__file__, '..', '..', '..', 'data', 'allen',
                        'dataset_readers', 'target_sentiment').resolve()
        tokenizer = spacy_tokenizer()

        # Test the targets case and the include target case with respect to the
        # left and right contexts
        text1 = "I charge it at night and skip taking the cord with me "\
                "because of the good battery life"
        tokens1 = tokenizer(text1)
        targets1 = ["cord", "battery life"]
        target_words1 = [tokenizer(target) for target in targets1]
        instance1 = {
            'text': text1,
            'text words': tokens1,
            'targets': targets1,
            'target words': target_words1,
            'target_sentiments': ["neutral", "positive"]
        }
        if left_right_contexts:
            # One expected left and right context text per target; the
            # defaults below are overridden by the incl_target and
            # reverse_right_context options.
            left_texts = [
                "I charge it at night and skip taking the ",
                "I charge it at night and skip taking the cord with me because of the good "
            ]
            right_texts = [" with me because of the good battery life", ""]
            if incl_target:
                left_texts = [
                    "I charge it at night and skip taking the cord",
                    "I charge it at night and skip taking the cord with me because of the good battery life"
                ]
                right_texts = [
                    "cord with me because of the good battery life",
                    "battery life"
                ]
            if reverse_right_context:
                right_texts = ["life battery good the of because me with", ""]
                if incl_target:
                    right_texts = [
                        "life battery good the of because me with cord",
                        "life battery"
                    ]
            instance1['left_contexts'] = [
                tokenizer(text) for text in left_texts
            ]
            instance1['right_contexts'] = [
                tokenizer(text) for text in right_texts
            ]

        text2 = "it is of high quality, has a killer GUI, is extremely stable, "\
                "is highly expandable, is bundled with lots of very good "\
                "applications, is easy to use, and is absolutely gorgeous."
        tokens2 = tokenizer(text2)
        targets2 = ["quality", "GUI", "applications", "use"]
        target_words2 = [tokenizer(target) for target in targets2]
        # No expected left or right contexts are defined for the second
        # instance, see the `continue` in the comparison loop below.
        instance2 = {
            'text': text2,
            'text words': tokens2,
            'targets': targets2,
            'target words': target_words2,
            'target_sentiments':
            ["positive", "positive", "positive", "positive"]
        }

        test_target_fp = Path(data_dir, 'target_sentiments.json').resolve()
        instances = ensure_list(reader.read(str(test_target_fp)))

        assert len(instances) == 2
        true_instances = [instance1, instance2]
        for i, instance in enumerate(instances):
            # Only look at the left and right context of the first instance
            if left_right_contexts and i == 1:
                continue
            fields = instance.fields
            true_instance = true_instances[i]
            assert true_instance["text words"] == [
                x.text for x in fields['tokens']
            ]
            for index, target_field in enumerate(fields['targets']):
                assert true_instance["target words"][index] == [
                    x.text for x in target_field
                ]
            assert true_instance['target_sentiments'] == fields[
                'target_sentiments'].labels

            assert true_instance["text"] == fields['metadata']["text"]
            assert true_instance["text words"] == fields['metadata'][
                "text words"]
            assert true_instance["targets"] == fields['metadata']["targets"]
            assert true_instance["target words"] == fields['metadata'][
                "target words"]
            if left_right_contexts:
                for index, left_field in enumerate(fields['left_contexts']):
                    assert true_instance["left_contexts"][index] == [
                        x.text for x in left_field
                    ]
                for index, right_field in enumerate(fields['right_contexts']):
                    assert true_instance["right_contexts"][index] == [
                        x.text for x in right_field
                    ]
                # Two more fields than the case without contexts
                # (`left_contexts` and `right_contexts`).
                assert 6 == len(fields)
            else:
                assert 4 == len(fields)

        # Test the categories case
        reader = TargetSentimentDatasetReader(lazy=lazy,
                                              incl_target=False,
                                              left_right_contexts=False,
                                              use_categories=True)
        text1 = "Not only was the food outstanding, but the little perks were great."
        tokens1 = tokenizer(text1)
        instance1 = {
            'text': text1,
            'text words': tokens1,
            'categories': ["food", "service"],
            'category_sentiments': ["positive", "positive"]
        }

        text2 = "To be completely fair, the only redeeming factor was the food, "\
                "which was above average, but couldnt make up for all the other "\
                "deficiencies of Teodora."
        tokens2 = tokenizer(text2)
        instance2 = {
            'text': text2,
            'text words': tokens2,
            'categories': ["food", "anecdotes/miscellaneous"],
            'category_sentiments': ["positive", "negative"]
        }

        test_category_fp = Path(data_dir, 'category_sentiments.json').resolve()
        instances = ensure_list(reader.read(str(test_category_fp)))

        assert len(instances) == 2
        true_instances = [instance1, instance2]
        for i, instance in enumerate(instances):
            fields = instance.fields
            true_instance = true_instances[i]
            assert true_instance["text words"] == [
                x.text for x in fields['tokens']
            ]
            assert true_instance["categories"] == [
                x.text for x in fields['categories']
            ]
            assert true_instance['category_sentiments'] == fields[
                'category_sentiments'].labels

            assert true_instance["text"] == fields['metadata']["text"]
            assert true_instance["text words"] == fields['metadata'][
                "text words"]
            assert true_instance["categories"] == fields['metadata'][
                "categories"]
            assert 4 == len(fields)

        # Test the categories and target case
        reader = TargetSentimentDatasetReader(
            lazy=lazy,
            incl_target=False,
            left_right_contexts=left_right_contexts,
            reverse_right_context=reverse_right_context,
            use_categories=True)
        text1 = "We, there were four of us, arrived at noon - the place was "\
                "empty - and the staff acted like we were imposing on them and "\
                "they were very rude."
        tokens1 = tokenizer(text1)
        targets1 = ["staff"]
        target_words1 = [tokenizer(target) for target in targets1]
        instance1 = {
            'text': text1,
            'text words': tokens1,
            'targets': targets1,
            'target words': target_words1,
            'categories': ["SERVICE#GENERAL", "SOMETHING"],
            'target_sentiments': ["negative"]
        }
        if left_right_contexts:
            left_texts = [
                "We, there were four of us, arrived at noon - the place was empty - and the "
            ]
            right_texts = [
                " acted like we were imposing on them and they were very rude."
            ]
            if reverse_right_context:
                right_texts = [
                    ". rude very were they and them on imposing were we like acted"
                ]
            instance1['left_contexts'] = [
                tokenizer(text) for text in left_texts
            ]
            instance1['right_contexts'] = [
                tokenizer(text) for text in right_texts
            ]

        text2 = "The food was lousy - too sweet or too salty and the portions tiny."
        tokens2 = tokenizer(text2)
        targets2 = ["food", "portions"]
        target_words2 = [tokenizer(target) for target in targets2]
        instance2 = {
            'text': text2,
            'text words': tokens2,
            'targets': targets2,
            'target words': target_words2,
            'categories': ["FOOD#QUALITY", "FOOD#STYLE_OPTIONS"],
            'target_sentiments': ["negative", "negative"]
        }

        test_target_fp = Path(data_dir,
                              'target_category_sentiments.json').resolve()
        instances = ensure_list(reader.read(str(test_target_fp)))

        assert len(instances) == 2
        true_instances = [instance1, instance2]
        for i, instance in enumerate(instances):
            # Only look at the left and right context of the first instance
            if left_right_contexts and i == 1:
                continue
            fields = instance.fields
            true_instance = true_instances[i]
            assert true_instance["text words"] == [
                x.text for x in fields['tokens']
            ]
            for index, target_field in enumerate(fields['targets']):
                assert true_instance["target words"][index] == [
                    x.text for x in target_field
                ]
            assert true_instance['target_sentiments'] == fields[
                'target_sentiments'].labels
            assert true_instance["categories"] == [
                x.text for x in fields['categories']
            ]

            assert true_instance["text"] == fields['metadata']["text"]
            assert true_instance["text words"] == fields['metadata'][
                "text words"]
            assert true_instance["targets"] == fields['metadata']["targets"]
            assert true_instance["target words"] == fields['metadata'][
                "target words"]
            assert true_instance["categories"] == fields['metadata'][
                "categories"]
            if left_right_contexts:
                # NOTE(review): the inner check repeats the outer condition
                # and is therefore redundant.
                if left_right_contexts:
                    for index, left_field in enumerate(
                            fields['left_contexts']):
                        assert true_instance["left_contexts"][index] == [
                            x.text for x in left_field
                        ]
                    for index, right_field in enumerate(
                            fields['right_contexts']):
                        assert true_instance["right_contexts"][index] == [
                            x.text for x in right_field
                        ]
                # Two more fields than the case without contexts
                # (`left_contexts` and `right_contexts`).
                assert 7 == len(fields)
            else:
                assert 5 == len(fields)
        # Test the case for the Left right contexts case where the spans are not
        # given
        reader = TargetSentimentDatasetReader(
            lazy=lazy,
            incl_target=False,
            left_right_contexts=left_right_contexts,
            reverse_right_context=reverse_right_context,
            use_categories=True)
        text_fp = Path(data_dir, 'just_text.json')
        with pytest.raises(ValueError):
            instances = ensure_list(reader.read(str(text_fp)))

        # Test the case for when we are not using the left right contexts
        # and no targets or categories are given
        reader = TargetSentimentDatasetReader(lazy=lazy,
                                              incl_target=False,
                                              left_right_contexts=False,
                                              reverse_right_context=False,
                                              use_categories=True)
        with pytest.raises(ValueError):
            instances = ensure_list(reader.read(str(text_fp)))

        # Test the target_sequences argument
        # This section is skipped entirely when left_right_contexts is True.
        if left_right_contexts == True:
            pass
        elif (max_position_distance is not None
              and (not position_embeddings and not position_weights)):
            # A max_position_distance without position embeddings or position
            # weights is expected to be rejected by the constructor.
            with pytest.raises(ValueError):
                reader = TargetSentimentDatasetReader(
                    lazy=lazy,
                    max_position_distance=max_position_distance,
                    position_embeddings=position_embeddings,
                    position_weights=position_weights)
        else:
            # Tests raises an error if the left_right_contexts is True
            with pytest.raises(ValueError):
                reader = TargetSentimentDatasetReader(lazy=lazy,
                                                      left_right_contexts=True,
                                                      target_sequences=True)
            text1 = 'The laptop case was great and awfulcover'
            targets1 = ['laptop case', 'case was great']
            spans1 = [[4, 15], [11, 25]]
            # NOTE(review): a different text, targets and spans are expected
            # when none of the three options is enabled; presumably the
            # reader only re-spaces the text around the targets when one of
            # the options is used -- confirm against the reader.
            if not target_sequences and not position_embeddings and not position_weights:
                text1 = "Thelaptopcasewas great and awfulcover"
                targets1 = ["laptopcase", "casewas great"]
                spans1 = [[3, 13], [9, 22]]

            tokens1 = tokenizer(text1)
            target_words1 = [tokenizer(target) for target in targets1]
            # Expected values for the three options, one entry per target.
            # NOTE(review): each target sequence row presumably marks the
            # position of one target token within the text -- confirm.
            target_sequences1 = [[[0, 1, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0,
                                                          0]],
                                 [[0, 0, 1, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0],
                                  [0, 0, 0, 0, 1, 0, 0]]]
            position_embedding_seq1 = [['2', '1', '1', '2', '3', '4', '5'],
                                       ['3', '2', '1', '1', '1', '2', '3']]
            position_weights_seq1 = [[2, 1, 1, 2, 3, 4, 5],
                                     [3, 2, 1, 1, 1, 2, 3]]

            instance1 = {
                'text': text1,
                'text words': tokens1,
                'targets': targets1,
                'target words': target_words1,
                'spans': spans1,
                'target_sequences': target_sequences1,
                'target_sentiments': ["neutral", "positive"],
                'position_weights': position_weights_seq1,
                'position_embeddings': position_embedding_seq1
            }

            text2 = "it is of high quality , has a killer GUI"
            targets2 = ["quality", "GUI"]
            spans2 = [[14, 21], [37, 40]]
            if not target_sequences and not position_embeddings and not position_weights:
                text2 = "it is of high quality, has a killer GUI"
                spans2 = [[14, 21], [36, 39]]

            tokens2 = tokenizer(text2)
            target_words2 = [tokenizer(target) for target in targets2]
            target_sequences2 = [[[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]],
                                 [[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]]]
            position_embedding_seq2 = [[
                '5', '4', '3', '2', '1', '2', '3', '4', '5', '6'
            ], ['10', '9', '8', '7', '6', '5', '4', '3', '2', '1']]
            position_weights_seq2 = [[5, 4, 3, 2, 1, 2, 3, 4, 5, 6],
                                     [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]]
            # With a maximum distance the expected positions never exceed 5.
            # NOTE(review): this presumably means the max_position_distance
            # supplied to this test is 5 -- confirm.
            if max_position_distance is not None:
                position_embedding_seq2 = [[
                    '5', '4', '3', '2', '1', '2', '3', '4', '5', '5'
                ], ['5', '5', '5', '5', '5', '5', '4', '3', '2', '1']]
                position_weights_seq2 = [[5, 4, 3, 2, 1, 2, 3, 4, 5, 5],
                                         [5, 5, 5, 5, 5, 5, 4, 3, 2, 1]]

            instance2 = {
                'text': text2,
                'text words': tokens2,
                'targets': targets2,
                'target words': target_words2,
                'spans': spans2,
                'target_sequences': target_sequences2,
                'target_sentiments': ["positive", "positive"],
                'position_weights': position_weights_seq2,
                'position_embeddings': position_embedding_seq2
            }
            # Remove the expected values of the options that are not enabled,
            # they are only compared below when the option is enabled.
            if not target_sequences:
                del instance1['target_sequences']
                del instance2['target_sequences']
            if not position_embeddings:
                del instance1['position_embeddings']
                del instance2['position_embeddings']
            if not position_weights:
                del instance1['position_weights']
                del instance2['position_weights']

            reader = TargetSentimentDatasetReader(
                lazy=lazy,
                target_sequences=target_sequences,
                position_embeddings=position_embeddings,
                position_weights=position_weights,
                max_position_distance=max_position_distance)
            test_target_fp = Path(
                data_dir, 'target_sentiment_target_sequences.json').resolve()
            instances = ensure_list(reader.read(str(test_target_fp)))

            assert len(instances) == 2
            true_instances = [instance1, instance2]
            for i, instance in enumerate(instances):
                fields = instance.fields
                true_instance = true_instances[i]
                assert true_instance["text words"] == [
                    x.text for x in fields['tokens']
                ]
                for index, target_field in enumerate(fields['targets']):
                    assert true_instance["target words"][index] == [
                        x.text for x in target_field
                    ]
                assert true_instance['target_sentiments'] == fields[
                    'target_sentiments'].labels
                assert true_instance["text"] == fields['metadata']["text"]
                assert true_instance["text words"] == fields['metadata'][
                    "text words"]
                assert true_instance["targets"] == fields['metadata'][
                    "targets"]
                assert true_instance["target words"] == fields['metadata'][
                    "target words"]
                # Four fields are always expected, plus one field for each
                # enabled option.
                number_fields = 4
                if position_embeddings:
                    number_fields += 1
                if target_sequences:
                    number_fields += 1
                if position_weights:
                    number_fields += 1
                assert number_fields == len(fields)
                if target_sequences:
                    for index, target_sequence in enumerate(
                            fields['target_sequences']):
                        true_array = true_instance["target_sequences"][index]
                        true_array = np.array(true_array)
                        assert np.array_equal(true_array,
                                              target_sequence.array)
                if position_embeddings:
                    for index, position_embedding_field in enumerate(
                            fields['position_embeddings']):
                        assert true_instance["position_embeddings"][index] == [
                            x.text for x in position_embedding_field
                        ]
                if position_weights:
                    position_weight_array = np.array(
                        true_instance["position_weights"])
                    assert np.array_equal(position_weight_array,
                                          fields['position_weights'].array)

        # Ensure raises error if the max_position_distance is less than 2
        # NOTE(review): pytest.raises exits at the first statement that
        # raises, so if the constructor raises the ValueError the read below
        # is never executed.
        with pytest.raises(ValueError):
            reader = TargetSentimentDatasetReader(
                lazy=lazy,
                target_sequences=target_sequences,
                position_embeddings=position_embeddings,
                position_weights=position_weights,
                max_position_distance=1)
            test_target_fp = Path(
                data_dir, 'target_sentiment_target_sequences.json').resolve()
            instances = ensure_list(reader.read(str(test_target_fp)))

        # Test the case of both target and category sentiments.
        reader = TargetSentimentDatasetReader(
            lazy=lazy,
            incl_target=False,
            left_right_contexts=left_right_contexts,
            reverse_right_context=reverse_right_context,
            use_categories=True)
        text1 = "We, there were four of us, arrived at noon - the place was "\
                "empty - and the staff acted like we were imposing on them and "\
                "they were very rude."
        tokens1 = tokenizer(text1)
        targets1 = ["staff"]
        target_words1 = [tokenizer(target) for target in targets1]
        instance1 = {
            'text': text1,
            'text words': tokens1,
            'targets': targets1,
            'target words': target_words1,
            'categories': ["SERVICE#GENERAL", "SOMETHING", "ANOTHER"],
            'target_sentiments': ["negative"],
            'category_sentiments': ["positive", "positive", "negative"]
        }
        if left_right_contexts:
            left_texts = [
                "We, there were four of us, arrived at noon - the place was empty - and the "
            ]
            right_texts = [
                " acted like we were imposing on them and they were very rude."
            ]
            if reverse_right_context:
                right_texts = [
                    ". rude very were they and them on imposing were we like acted"
                ]
            instance1['left_contexts'] = [
                tokenizer(text) for text in left_texts
            ]
            instance1['right_contexts'] = [
                tokenizer(text) for text in right_texts
            ]

        text2 = "The food was lousy - too sweet or too salty and the portions tiny."
        tokens2 = tokenizer(text2)
        targets2 = ["food", "portions"]
        target_words2 = [tokenizer(target) for target in targets2]
        instance2 = {
            'text': text2,
            'text words': tokens2,
            'targets': targets2,
            'target words': target_words2,
            'categories': ["FOOD#QUALITY", "FOOD#STYLE_OPTIONS"],
            'target_sentiments': ["negative", "negative"],
            'category_sentiments': ["positive", "neutral"]
        }

        test_target_fp = Path(
            data_dir, 'target_sentiments_category_sentiments.json').resolve()
        instances = ensure_list(reader.read(str(test_target_fp)))

        assert len(instances) == 2
        true_instances = [instance1, instance2]
        for i, instance in enumerate(instances):
            # Only look at the left and right context of the first instance
            if left_right_contexts and i == 1:
                continue
            fields = instance.fields
            true_instance = true_instances[i]
            assert true_instance["text words"] == [
                x.text for x in fields['tokens']
            ]
            for index, target_field in enumerate(fields['targets']):
                assert true_instance["target words"][index] == [
                    x.text for x in target_field
                ]
            assert true_instance['target_sentiments'] == fields[
                'target_sentiments'].labels
            assert true_instance["categories"] == [
                x.text for x in fields['categories']
            ]
            assert true_instance["category_sentiments"] == fields[
                'category_sentiments'].labels

            assert true_instance["text"] == fields['metadata']["text"]
            assert true_instance["text words"] == fields['metadata'][
                "text words"]
            assert true_instance["targets"] == fields['metadata']["targets"]
            assert true_instance["target words"] == fields['metadata'][
                "target words"]
            assert true_instance["categories"] == fields['metadata'][
                "categories"]
            if left_right_contexts:
                # NOTE(review): the inner check repeats the outer condition
                # and is therefore redundant.
                if left_right_contexts:
                    for index, left_field in enumerate(
                            fields['left_contexts']):
                        assert true_instance["left_contexts"][index] == [
                            x.text for x in left_field
                        ]
                    for index, right_field in enumerate(
                            fields['right_contexts']):
                        assert true_instance["right_contexts"][index] == [
                            x.text for x in right_field
                        ]
                # Two more fields than the case without contexts
                # (`left_contexts` and `right_contexts`).
                assert 8 == len(fields)
            else:
                assert 6 == len(fields)