Пример #1
0
    def test_sequence_long(self):
        """
        Over-long sequences must be truncated by ``sequence_pad``.

        With ``max_feature_len=5`` and five input tokens, the expected result keeps
        only the first three tokens, wrapped in ``[CLS]`` and ``[SEP]``.
        """
        # Arrange
        preprocessor = PreprocessorBertTokeniser(max_feature_len=5, tokeniser=None)
        preprocessor.item = ["THE", "dog", "ate", "a", "biscuit"]
        truncated = ["[CLS]", "THE", "dog", "ate", "[SEP]"]

        # Act
        preprocessor.sequence_pad()

        # Assert
        self.assertSequenceEqual(truncated, preprocessor.item)
Пример #2
0
    def test_sequence_short(self):
        """
        Too-short sequences must be padded by ``sequence_pad``.

        With ``max_feature_len=5`` and a single input token, the expected result is
        ``[CLS]``, the token, two ``[PAD]`` entries and a closing ``[SEP]``.
        """
        # Arrange
        preprocessor = PreprocessorBertTokeniser(max_feature_len=5, tokeniser=None)
        preprocessor.item = ["THE"]
        padded = ["[CLS]", "THE", "[PAD]", "[PAD]", "[SEP]"]

        # Act
        preprocessor.sequence_pad()

        # Assert
        self.assertSequenceEqual(padded, preprocessor.item)
    def get_preprocessor(self):
        """
        Build the preprocessor that wraps a pretrained BERT tokeniser.

        Loads a ``BertTokenizer`` via ``from_pretrained`` using ``self._bert_model_name``
        and ``self._token_lower_case``, then wraps it in a ``PreprocessorBertTokeniser``
        whose ``max_feature_len`` is ``self._max_seq_len``. An info message is logged
        before loading and after the preprocessor has been constructed.

        :return: the constructed ``PreprocessorBertTokeniser``
        """
        self._logger.info("Retrieving Tokeniser")

        bert_tokeniser = BertTokenizer.from_pretrained(
            self._bert_model_name,
            do_lower_case=self._token_lower_case,
        )
        result = PreprocessorBertTokeniser(
            max_feature_len=self._max_seq_len,
            tokeniser=bert_tokeniser,
        )

        self._logger.info("Completed retrieving Tokeniser")
        return result
 def get_preprocessor(self):
     """
     Build a ``PreprocessorBertTokeniser`` around a pretrained ``BertTokenizer``.

     The tokeniser is loaded with ``self._bert_model_name`` and
     ``self._token_lower_case``; the preprocessor's ``max_feature_len`` is
     ``self._max_seq_len``.

     :return: the constructed ``PreprocessorBertTokeniser``
     """
     bert_tokeniser = BertTokenizer.from_pretrained(
         self._bert_model_name,
         do_lower_case=self._token_lower_case,
     )
     return PreprocessorBertTokeniser(
         max_feature_len=self._max_seq_len,
         tokeniser=bert_tokeniser,
     )