Example #1
    def test_copying(self):
        input_texts = ['Turing was born in 1912 in London .']
        tag_strs = ['KEEP'] * 8
        tags = [tagging.Tag(s) for s in tag_strs]
        task = tagging.EditingTask(input_texts)
        self.assertEqual(task.realize_output(tags), input_texts[0])

        # With multiple inputs.
        input_texts = ['a B', 'c D e', 'f g']
        tag_strs = ['KEEP'] * 7
        tags = [tagging.Tag(s) for s in tag_strs]
        task = tagging.EditingTask(input_texts)
        self.assertEqual(task.realize_output(tags), 'a B c D e f g')
Example #2
    def test_casing(self):
        input_texts = ['A b .', 'Cc dd .']
        # Test lowercasing after a period has been removed.
        tag_strs = ['KEEP', 'KEEP', 'DELETE', 'KEEP', 'KEEP', 'KEEP']
        tags = [tagging.Tag(s) for s in tag_strs]
        task = tagging.EditingTask(input_texts)
        self.assertEqual(task.realize_output(tags), 'A b cc dd .')

        # Test capitalization after the first capitalized token has been removed.
        tag_strs = ['KEEP', 'KEEP', 'KEEP', 'DELETE', 'KEEP', 'KEEP']
        tags = [tagging.Tag(s) for s in tag_strs]
        task = tagging.EditingTask(input_texts)
        self.assertEqual(task.realize_output(tags), 'A b . Dd .')
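
Note how the two assertions above pin down realize_output's casing repair at edit boundaries: deleting a sentence-final period lowercases the token that follows it, and deleting a sentence-initial capitalized token capitalizes the token that takes its place. A minimal standalone sketch of those two rules, inferred from the test rather than taken from the library code:

    # Hypothetical helpers mirroring the behavior the test asserts.
    def after_deleted_period(next_token):
        # 'Cc' -> 'cc': the sentence continues, so drop the initial capital.
        return next_token.lower()

    def after_deleted_sentence_start(next_token):
        # 'dd' -> 'Dd': the next kept token now opens the sentence.
        return next_token.capitalize()

    print(after_deleted_period('Cc'))          # cc
    print(after_deleted_sentence_start('dd'))  # Dd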
Example #3
    def compute_tags(self, task, target):
        """Computes tags needed for converting the source into the target.

    Args:
      task: tagging.EditingTask that specifies the input.
      target: Target text.

    Returns:
      List of tagging.Tag objects. If the source couldn't be converted into the
      target via tagging, returns an empty list.
    """
        target_tokens = utils.get_token_list(target.lower())
        tags = self._compute_tags_fixed_order(task.source_tokens,
                                              target_tokens)
        # If conversion fails, try to obtain the target after swapping the source
        # order.
        if not tags and len(task.sources) == 2 and self._do_swap:
            swapped_task = tagging.EditingTask(task.sources[::-1])
            tags = self._compute_tags_fixed_order(swapped_task.source_tokens,
                                                  target_tokens)
            if tags:
                tags = (tags[swapped_task.first_tokens[1]:] +
                        tags[:swapped_task.first_tokens[1]])
                # We assume that the last token (typically a period) is never deleted,
                # so we can overwrite the tag_type with SWAP (which keeps the token,
                # moving it and the sentence it's part of to the end).
                tags[task.first_tokens[1] - 1].tag_type = tagging.TagType.SWAP
        return tags
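
A minimal usage sketch for compute_tags, assuming the LaserTagger modules tagging and tagging_converter are importable and that the phrase vocabulary contains the phrase to be inserted:

    import tagging
    import tagging_converter

    task = tagging.EditingTask(
        ['Turing was born in 1912 .', 'Turing died in 1954 .'])
    converter = tagging_converter.TaggingConverter(['and'])
    tags = converter.compute_tags(
        task, 'Turing was born in 1912 and died in 1954 .')
    # An empty list signals that the target is unreachable; otherwise there is
    # exactly one tag per source token.
    print([str(tag) for tag in tags])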
Example #4
 def test_deletion(self):
     input_texts = ['Turing was born in 1912 in London .']
     tag_strs = [
         'KEEP', 'DELETE', 'KEEP', 'KEEP', 'KEEP', 'KEEP', 'KEEP', 'DELETE'
     ]
     tags = [tagging.Tag(s) for s in tag_strs]
     task = tagging.EditingTask(input_texts)
     # "was" and "." should have been removed.
     self.assertEqual(task.realize_output(tags),
                      'Turing born in 1912 in London')
Example #5
 def test_no_match(self):
     input_texts = ['Turing was born in 1912 .', 'Turing died in 1954 .']
     target = 'Turing was born in 1912 and died in 1954 .'
     task = tagging.EditingTask(input_texts)
     phrase_vocabulary = ['but']
     converter = tagging_converter.TaggingConverter(phrase_vocabulary)
     tags = converter.compute_tags(task, target)
     # Vocabulary doesn't contain "and" so the inputs can't be converted to the
     # target.
     self.assertFalse(tags)
Example #6
 def test_phrase_adding(self):
     input_texts = ['Turing was born in 1912 in London .']
     tag_strs = [
         'KEEP', 'DELETE|, a pioneer in TCS ,', 'KEEP', 'KEEP', 'KEEP',
         'KEEP', 'KEEP', 'KEEP'
     ]
     tags = [tagging.Tag(s) for s in tag_strs]
     task = tagging.EditingTask(input_texts)
     self.assertEqual(
         task.realize_output(tags),
         'Turing , a pioneer in TCS , born in 1912 in London .')
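
The second tag string above combines a deletion with an insertion. A small sketch of how such a string splits into a tag type and an added phrase, assuming the 'TYPE|phrase' convention visible in these tests (this is not the library's actual parser):

    tag_str = 'DELETE|, a pioneer in TCS ,'
    tag_type, _, added_phrase = tag_str.partition('|')
    print(tag_type)      # DELETE
    print(added_phrase)  # , a pioneer in TCS ,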
Example #7
 def test_swapping_complex(self):
     input_texts = [
         'Dylan won Nobel prize .', 'Dylan is an American musician .'
     ]
     tag_strs = [
         'DELETE', 'KEEP', 'KEEP', 'KEEP', 'SWAP', 'KEEP', 'DELETE|,',
         'KEEP', 'KEEP', 'KEEP', 'DELETE|,'
     ]
     tags = [tagging.Tag(s) for s in tag_strs]
     task = tagging.EditingTask(input_texts)
     self.assertEqual(task.realize_output(tags),
                      'Dylan , an American musician , won Nobel prize .')
Example #8
 def test_swapping(self):
     input_texts = [
         'Turing was born in 1912 in London .', 'Turing died in 1954 .'
     ]
     tag_strs = [
         'KEEP', 'KEEP', 'KEEP', 'KEEP', 'KEEP', 'KEEP', 'KEEP', 'SWAP',
         'KEEP', 'KEEP', 'KEEP', 'KEEP', 'KEEP'
     ]
     tags = [tagging.Tag(s) for s in tag_strs]
     task = tagging.EditingTask(input_texts)
     self.assertEqual(
         task.realize_output(tags),
         'Turing died in 1954 . Turing was born in 1912 in London .')
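
A SWAP on the final token of the first source keeps that token and moves the whole first sentence behind the second one. A standalone sketch of the rotation, where split plays the role of task.first_tokens[1] from Example #3 (the index of the second source's first token):

    tokens = ['Turing', 'was', 'born', 'in', '1912', 'in', 'London', '.',
              'Turing', 'died', 'in', '1954', '.']
    split = 8  # first token of the second source
    rotated = tokens[split:] + tokens[:split]
    print(' '.join(rotated))
    # Turing died in 1954 . Turing was born in 1912 in London .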
Example #9
 def test_invalid_swapping(self):
     # When the SWAP tag is assigned to a token other than the last token of
     # the first of two sentences, it should be treated as KEEP.
     input_texts = [
         'Turing was born in 1912 in London .', 'Turing died in 1954 .'
     ]
     tag_strs = [
         'KEEP', 'KEEP', 'KEEP', 'KEEP', 'KEEP', 'KEEP', 'SWAP', 'KEEP',
         'KEEP', 'KEEP', 'KEEP', 'KEEP', 'KEEP'
     ]
     tags = [tagging.Tag(s) for s in tag_strs]
     task = tagging.EditingTask(input_texts)
     self.assertEqual(
         task.realize_output(tags),
         'Turing was born in 1912 in London . Turing died in 1954 .')
Example #10
    def test_realize_output_in_order(self):
        """Test for when source tokens occur in the same relative order in
        the target string."""
        editing_task = tagging.EditingTask(["word1 word2 <::::> word3 "])

        tags_str = ['KEEP|0', 'KEEP|1', 'KEEP|and', 'DELETE', 'KEEP|3']
        tags = [tagging.Tag(tag) for tag in tags_str]

        result = editing_task.realize_output(tags)

        expected = "word1 word2 and word3 "

        self.assertEqual(expected, result)
Example #11
    def test_compute_tags_out_of_order(self):
        """
            Test for when the source tokens
            do not occur in the same relative order
        """
        dummy_phrase_vocabulary = ['and']
        editing_task = tagging.EditingTask([" word1 word2 <::::> word3 "])
        converter = TaggingConverter(dummy_phrase_vocabulary)

        result = [
            str(tag) for tag in converter.compute_tags(
                editing_task, "word2 word1 and word3 ")
        ]

        expected = ['KEEP|1', 'KEEP|0', 'KEEP|and', 'DELETE', 'KEEP|3']

        self.assertEqual(expected, result)
Example #12
    def test_compute_tags_infeasible(self):
        """
            Test for when the target cannot
            be constructed by the given
            edit vocab and source tokens
        """
        dummy_phrase_vocabulary = ['and']
        editing_task = tagging.EditingTask([" word1 word2 <::::> word3 "])
        converter = TaggingConverter(dummy_phrase_vocabulary)

        result = [
            str(tag) for tag in converter.compute_tags(
                editing_task, "word2 word1 but word3 ")
        ]

        expected = []

        self.assertEqual(expected, result)
Example #13
    def _get_embeddings(self, text):
        """Get BERT embeddings for input text.
    
    Args:
      text: List of input texts.
    
    Returns:
      4-tuple of input_ids, input_mask, segment_ids, and 
      token_start_indices
    """
        tokens, token_start_indices = self._split_to_wordpieces(
            tagging.EditingTask(text).source_tokens)
        tokens = self._truncate_list(tokens)

        input_tokens = ['[CLS]'] + tokens + ['[SEP]']
        input_ids = self._tokenizer.convert_tokens_to_ids(input_tokens)
        input_mask = [1] * len(input_ids)
        segment_ids = [0] * len(input_ids)

        return input_ids, input_mask, segment_ids, token_start_indices
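
The truncation step must leave room for the two special tokens added on the following line. A minimal sketch of that bookkeeping, an assumption about _truncate_list (whose body is not shown here):

    def truncate_list(xs, max_seq_length):
        # Keep at most max_seq_length - 2 items so that prepending [CLS] and
        # appending [SEP] still fits within max_seq_length positions.
        return xs[:max_seq_length - 2]

    print(truncate_list(['a', 'b', 'c', 'd', 'e', 'f'], 6))  # ['a', 'b', 'c', 'd']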
Example #14
 def test_wrong_number_of_tags(self):
     input_texts = ['1 2']
     tags = [tagging.Tag('KEEP')]
     task = tagging.EditingTask(input_texts)
     with self.assertRaises(ValueError):
         task.realize_output(tags)
Example #15
 def test_matching_conversion(self, input_texts, target, phrase_vocabulary,
                              target_tags):
     task = tagging.EditingTask(input_texts)
     converter = tagging_converter.TaggingConverter(phrase_vocabulary)
     tags = converter.compute_tags(task, target)
     self.assertEqual(tags_to_str(tags), tags_to_str(target_tags))
Example #16
    def build_bert_example(
            self,
            sources,
            target=None,
            use_arbitrary_target_ids_for_infeasible_examples=False):
        """Constructs a BERT Example.

    Args:
      sources: List of source texts.
      target: Target text or None when building an example during inference.
      use_arbitrary_target_ids_for_infeasible_examples: Whether to build an
        example with arbitrary target ids even if the target can't be obtained
        via tagging.

    Returns:
      BertExample, or None if the conversion from text to tags was infeasible
      and use_arbitrary_target_ids_for_infeasible_examples == False.
    """
        # Compute target labels.
        task = tagging.EditingTask(sources)
        if target is not None:
            tags = self._converter.compute_tags(task, target)
            if not tags:
                if use_arbitrary_target_ids_for_infeasible_examples:
                    # Create a tag sequence [KEEP, DELETE, KEEP, DELETE, ...] which is
                    # unlikely to be predicted by chance.
                    tags = [
                        tagging.Tag('KEEP') if i % 2 == 0
                        else tagging.Tag('DELETE')
                        for i, _ in enumerate(task.source_tokens)
                    ]
                else:
                    return None
        else:
            # If target is not provided, we set all target labels to KEEP.
            tags = [tagging.Tag('KEEP') for _ in task.source_tokens]
        labels = [self._label_map[str(tag)] for tag in tags]

        tokens, labels, token_start_indices = self._split_to_wordpieces(
            task.source_tokens, labels)

        tokens = self._truncate_list(tokens)
        labels = self._truncate_list(labels)

        input_tokens = ['[CLS]'] + tokens + ['[SEP]']
        labels_mask = [0] + [1] * len(labels) + [0]
        labels = [0] + labels + [0]

        input_ids = self._tokenizer.convert_tokens_to_ids(input_tokens)
        input_mask = [1] * len(input_ids)
        segment_ids = [0] * len(input_ids)

        example = BertExample(input_ids=input_ids,
                              input_mask=input_mask,
                              segment_ids=segment_ids,
                              labels=labels,
                              labels_mask=labels_mask,
                              token_start_indices=token_start_indices,
                              task=task,
                              default_label=self._keep_tag_id)
        example.pad_to_max_length(self._max_seq_length, self._pad_id)
        return example
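
A minimal usage sketch for build_bert_example; the builder construction below is an assumption (a label map from tag strings to ids, a BERT wordpiece vocabulary file, and the converter from Example #3), not code shown on this page:

    label_map = {'KEEP': 0, 'DELETE': 1}  # illustrative subset
    converter = tagging_converter.TaggingConverter(['and'])
    builder = bert_example.BertExampleBuilder(
        label_map, 'vocab.txt', max_seq_length=128,
        do_lower_case=True, converter=converter)
    example = builder.build_bert_example(
        sources=['Turing was born in 1912 .', 'Turing died in 1954 .'],
        target='Turing was born in 1912 and died in 1954 .')
    if example is None:
        print('Target not reachable via tagging; example skipped.')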