Example #1
    def initialize(self, resources: Resources, configs: Config):
        self.resources = resources
        self.config = Config(configs, self.default_configs())

        # TODO: At the time of writing, no way in texar to set encoder in
        # `texar.torch.modules.classifiers.BertClassifier`. Should not ideally
        # be changing a private variable.
        # pylint: disable=protected-access
        BERTClassifier._ENCODER_CLASS = BERTEncoder
        # pylint: enable=protected-access

        cache_dir = os.path.join(os.path.dirname(__file__),
                                 self.config.model_dir)

        self.device = torch.device('cuda:0') \
            if torch.cuda.is_available() else torch.device('cpu')

        self.model = BERTClassifier(
            pretrained_model_name=self.config.pretrained_model_name,
            cache_dir=cache_dir,
            hparams=self.config).to(self.device)

        self.tokenizer = BERTTokenizer(
            pretrained_model_name=self.config.pretrained_model_name,
            cache_dir=cache_dir,
            hparams=None)
Example #2
    def initialize(self, resources: Resources, configs: Config):
        super().initialize(resources, configs)
        if not self.configs.tokenizer_configs.pretrained_model_name:
            raise ValueError("Please specify a pretrained bert model")
        self.tokenizer = BERTTokenizer(
            cache_dir=None,
            hparams=self.configs.tokenizer_configs,
        )
        self.aligner = DiffAligner()
        self.__do_lower_case = self.configs.tokenizer_configs.do_lower_case
Example #3
    def test_encode_text(self):
        tokenizer = BERTTokenizer.load(self.vocab_file)

        text_1 = u"He is very happy"
        text_2 = u"unwanted, running"

        text_1_ids = tokenizer.map_text_to_id(text_1)
        text_2_ids = tokenizer.map_text_to_id(text_2)

        cls_token_id = tokenizer.map_token_to_id(tokenizer.cls_token)
        sep_token_id = tokenizer.map_token_to_id(tokenizer.sep_token)

        input_ids, segment_ids, input_mask = \
            tokenizer.encode_text(text_1, None, 4)

        self.assertListEqual(input_ids,
                             [cls_token_id] + text_1_ids[:2] + [sep_token_id])
        self.assertListEqual(segment_ids, [0, 0, 0, 0])
        self.assertListEqual(input_mask, [1, 1, 1, 1])

        input_ids, segment_ids, input_mask = \
            tokenizer.encode_text(text_1, text_2, 7)

        self.assertListEqual(input_ids, [cls_token_id] + text_1_ids[:2] +
                             [sep_token_id] + text_2_ids[:2] + [sep_token_id])
        self.assertListEqual(segment_ids, [0, 0, 0, 0, 1, 1, 1])
        self.assertListEqual(input_mask, [1, 1, 1, 1, 1, 1, 1])
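
The test above shows how `encode_text` truncates to a fixed maximum length and inserts the `[CLS]` and `[SEP]` ids. A minimal sketch of how these three lists are typically turned into model inputs, mirroring the inference code in Example #13 (the `model` and `device` objects are assumed to be set up as in that example):

    import torch

    input_ids, segment_ids, input_mask = tokenizer.encode_text(text_1, text_2, 7)

    # Add a batch dimension and move each list to the target device.
    input_ids, segment_ids, input_mask = [
        torch.LongTensor(item).unsqueeze(0).to(device)
        for item in (input_ids, segment_ids, input_mask)
    ]

    seq_length = (input_mask == 1).sum(dim=-1)  # number of non-padding positions
    logits, _ = model(input_ids, seq_length, segment_ids)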
Example #4
    def default_configs(cls):
        """Returns the configuration with default values.

        Here:
          - `tokenizer_configs` contains all default
              hyper-parameters in
              :class:`~texar.torch.data.tokenizer.bert_tokenizer.BERTTokenizer`,
              this processor will pass on all the configurations to the
              tokenizer to create the tokenizer instance.

          - `segment_unit` contains an Annotation entry type used to split the
              text into smaller units. For example, setting this to
              `ft.onto.base_ontology.Sentence` will make this tokenizer do
              tokenization on a sentence base, which could be more efficient
              when the alignment is used.

          - `token_source` contains entry name of where the tokens come from.
               For example, setting this to `ft.onto.base_ontology.Token` will
               make this tokenizer split the sub-word based on this token. The
               default value will use `ft.onto.base_ontology.Token`. If this
               value is set to None, then it will use `word_tokenization`
               function of this class to do tokenization.

        Note that if `segment_unit` or `token_source` is provided, the
        :meth:`~forte.processors.base.base_processor.BaseProcessor.check_record`
         will check if certain types are written before this processor.

        Returns: Default configuration value for the tokenizer.
        """
        return {
            "tokenizer_configs": BERTTokenizer.default_hparams(),
            "segment_unit": None,
            "token_source": "ft.onto.base_ontology.Token",
            "@no_typecheck": "token_source",
        }
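
Based on the keys documented above, a hedged sketch of a config override that might be passed to this processor (the values below are purely illustrative; see the pipeline sketch after Example #14 for how such a dictionary is typically supplied):

    subword_tokenizer_config = {
        "tokenizer_configs": {
            "pretrained_model_name": "bert-base-uncased",
        },
        # Tokenize one sentence at a time instead of the whole pack text.
        "segment_unit": "ft.onto.base_ontology.Sentence",
        # Fall back to the processor's own word tokenization instead of
        # reading existing Token annotations.
        "token_source": None,
    }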
Example #5
    def test_tokenize(self):
        tokenizer = BERTTokenizer.load(self.vocab_file)

        tokens = tokenizer.map_text_to_token(u"UNwant\u00E9d,running")
        self.assertListEqual(tokens,
                             ["un", "##want", "##ed", ",", "runn", "##ing"])

        ids = tokenizer.map_token_to_id(tokens)
        self.assertListEqual(ids, [7, 4, 5, 10, 8, 9])
Example #6
    def initialize(self, resources: Resources, configs: Config):
        super().initialize(resources, configs)

        self.resources = resources
        self.config = Config(configs, self.default_configs())
        if not self.config.pretrained_model_name:
            raise ValueError("Please specify a pretrained bert model")
        self.tokenizer = BERTTokenizer(
            pretrained_model_name=self.config.pretrained_model_name,
            cache_dir=None,
            hparams=None,
        )
Example #7
    def test_save_load(self):
        tokenizer = BERTTokenizer.load(self.vocab_file)

        before_tokens = tokenizer.map_text_to_id(
            u"He is very happy, UNwant\u00E9d,running")

        with tempfile.TemporaryDirectory() as tmpdirname:
            tokenizer.save(tmpdirname)
            tokenizer = tokenizer.load(tmpdirname)

        after_tokens = tokenizer.map_text_to_id(
            u"He is very happy, UNwant\u00E9d,running")
        self.assertListEqual(before_tokens, after_tokens)
Example #8
    def test_add_tokens(self):
        tokenizer = BERTTokenizer.load(self.vocab_file)

        vocab_size = tokenizer.vocab_size
        all_size = len(tokenizer)
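        # `vocab_size` reflects only the pretrained vocabulary, while
        # len(tokenizer) also counts tokens added at runtime, as the
        # assertions below check.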

        self.assertNotEqual(vocab_size, 0)
        self.assertEqual(vocab_size, all_size)

        new_toks = ["aaaaabbbbbb", "cccccccccdddddddd"]
        added_toks = tokenizer.add_tokens(new_toks)
        vocab_size_2 = tokenizer.vocab_size
        all_size_2 = len(tokenizer)

        self.assertNotEqual(vocab_size_2, 0)
        self.assertEqual(vocab_size, vocab_size_2)
        self.assertEqual(added_toks, len(new_toks))
        self.assertEqual(all_size_2, all_size + len(new_toks))

        tokens = tokenizer.map_text_to_id(
            "aaaaabbbbbb low cccccccccdddddddd l")
        self.assertGreaterEqual(len(tokens), 4)
        self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
        self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)

        new_toks_2 = {
            'eos_token': ">>>>|||<||<<|<<",
            'pad_token': "<<<<<|||>|>>>>|>"
        }
        added_toks_2 = tokenizer.add_special_tokens(new_toks_2)
        vocab_size_3 = tokenizer.vocab_size
        all_size_3 = len(tokenizer)

        self.assertNotEqual(vocab_size_3, 0)
        self.assertEqual(vocab_size, vocab_size_3)
        self.assertEqual(added_toks_2, len(new_toks_2))
        self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))

        tokens = tokenizer.map_text_to_id(
            ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd "
            "<<<<<|||>|>>>>|> l")

        self.assertGreaterEqual(len(tokens), 6)
        self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
        self.assertGreater(tokens[0], tokens[1])
        self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
        self.assertGreater(tokens[-2], tokens[-3])
        self.assertEqual(tokens[0],
                         tokenizer.map_token_to_id(tokenizer.eos_token))
        self.assertEqual(tokens[-2],
                         tokenizer.map_token_to_id(tokenizer.pad_token))
Example #9
    def initialize(self, config: Union[Dict, Config]):
        # pylint: disable=attribute-defined-outside-init
        super().initialize(config=config)
        self.tokenizer = BERTTokenizer(
            pretrained_model_name=self.config.pretrained_model_name,
            cache_dir=None,
            hparams=None,
        )
        predefined_dict = [key for key, _ in self.tokenizer.vocab.items()]
        self.predefined_vocab(predefined_dict)
        if not self.vocab:
            raise AttributeError("Vocabulary is required in SubwordExtractor.")
        self.vocab.mark_special_element(self.tokenizer.vocab["[PAD]"], "PAD")
        self.vocab.mark_special_element(self.tokenizer.vocab["[UNK]"], "UNK")
Example #10
    def test_pickle(self):
        tokenizer = BERTTokenizer.load(self.vocab_file)
        self.assertIsNotNone(tokenizer)

        text = u"Munich and Berlin are nice cities"
        subwords = tokenizer.map_text_to_token(text)

        with tempfile.TemporaryDirectory() as tmpdirname:
            filename = os.path.join(tmpdirname, u"tokenizer.bin")
            with open(filename, "wb") as f:
                pickle.dump(tokenizer, f)
            with open(filename, "rb") as f:
                tokenizer_new = pickle.load(f)

        subwords_loaded = tokenizer_new.map_text_to_token(text)

        self.assertListEqual(subwords, subwords_loaded)
Example #11
    def test_encode_decode(self):
        tokenizer = BERTTokenizer.load(self.vocab_file)

        input_text = u"UNwant\u00E9d,running"
        output_text = u"unwanted, running"

        tokens = tokenizer.map_text_to_token(input_text)
        ids = tokenizer.map_token_to_id(tokens)
        ids_2 = tokenizer.map_text_to_id(input_text)
        self.assertListEqual(ids, ids_2)

        tokens_2 = tokenizer.map_id_to_token(ids)
        text_2 = tokenizer.map_id_to_text(ids)

        self.assertEqual(text_2, output_text)

        self.assertNotEqual(len(tokens_2), 0)
        self.assertIsInstance(text_2, str)
Example #12
    def test_model_loading(self):
        for pretrained_model_name in BERTTokenizer.available_checkpoints():
            tokenizer = BERTTokenizer(
                pretrained_model_name=pretrained_model_name)
            _ = tokenizer.map_text_to_token(u"UNwant\u00E9d,running")
Example #13
class BertRerankingProcessor(MultiPackProcessor):
    def initialize(self, resources: Resources, configs: Config):
        self.resources = resources
        self.config = Config(configs, self.default_configs())

        # TODO: At the time of writing, no way in texar to set encoder in
        # `texar.torch.modules.classifiers.BertClassifier`. Should not ideally
        # be changing a private variable.
        # pylint: disable=protected-access
        BERTClassifier._ENCODER_CLASS = BERTEncoder
        # pylint: enable=protected-access

        cache_dir = os.path.join(os.path.dirname(__file__),
                                 self.config.model_dir)

        self.device = torch.device('cuda:0') \
            if torch.cuda.is_available() else torch.device('cpu')

        self.model = BERTClassifier(
            pretrained_model_name=self.config.pretrained_model_name,
            cache_dir=cache_dir,
            hparams=self.config).to(self.device)

        self.tokenizer = BERTTokenizer(
            pretrained_model_name=self.config.pretrained_model_name,
            cache_dir=cache_dir,
            hparams=None)

    @classmethod
    def default_configs(cls) -> Dict[str, Any]:
        configs = super().default_configs()
        pretrained_model_name = "bert-large-uncased"
        configs.update({
            "size": 5,
            "query_pack_name": "query",
            "field": "content",
            "pretrained_model_name": pretrained_model_name,
            "model_dir": os.path.join(os.path.dirname(__file__), "models"),
            "max_seq_length": 512,
        })
        return configs

    def _process(self, input_pack: MultiPack):
        max_len = self.config.max_seq_length
        query_pack_name = self.config.query_pack_name

        query_pack = input_pack.get_pack(self.config.query_pack_name)
        query_entry = list(query_pack.get(Query))[0]
        query_text = query_pack.text

        packs = {}
        for doc_id in input_pack.pack_names:
            if doc_id == query_pack_name:
                continue

            pack = input_pack.get_pack(doc_id)
            document_text = pack.text

            # BERT Inference
            input_ids, segment_ids, input_mask = [
                torch.LongTensor(item).unsqueeze(0).to(self.device) for item in
                self.tokenizer.encode_text(query_text, document_text, max_len)
            ]

            seq_length = (input_mask == 1).sum(dim=-1)
            logits, _ = self.model(input_ids, seq_length, segment_ids)
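            # Softmax over the classifier logits; the probability of class
            # index 1 is used as the relevance score for this document.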
            preds = torch.nn.functional.softmax(torch.Tensor(logits), dim=1)

            score = preds.detach().tolist()[0][1]

            query_entry.update_results({doc_id: score})
            packs[doc_id] = pack
Example #14
class SubwordTokenizer(PackProcessor):
    """
    Subword tokenizer using a pretrained BERT model.
    """
    def __init__(self):
        super().__init__()
        self.tokenizer: BERTTokenizer = None
        self.aligner: DiffAligner = None
        self.__do_lower_case = True

    # pylint: disable=attribute-defined-outside-init,unused-argument
    def initialize(self, resources: Resources, configs: Config):
        super().initialize(resources, configs)
        if not self.configs.tokenizer_configs.pretrained_model_name:
            raise ValueError("Please specify a pretrained bert model")
        self.tokenizer = BERTTokenizer(
            cache_dir=None,
            hparams=self.configs.tokenizer_configs,
        )
        self.aligner = DiffAligner()
        self.__do_lower_case = self.configs.tokenizer_configs.do_lower_case

    def _process(self, input_pack: DataPack):
        assert self.tokenizer is not None
        assert self.aligner is not None

        if self.configs.token_source is not None:
            # Use provided token source.
            token: Annotation
            for token in input_pack.get(self.configs.token_source):
                assert isinstance(token, Annotation)
                # Lower-casing, when enabled, is handled inside
                # `__add_subwords`, so the raw token text is passed here.
                self.__add_subwords(
                    input_pack,
                    token.text,  # type: ignore
                    token.begin,  # type: ignore
                )
        elif self.configs.segment_unit is not None:
            # If the token source is not provided, try to use the provided
            # segments.
            segment: Annotation
            for segment in input_pack.get(self.configs.segment_unit):
                self._segment(
                    input_pack,
                    segment.text,
                    segment.begin  # type: ignore
                )
        else:
            # Use the whole data pack text, which may be less efficient in
            # some cases.
            self._segment(input_pack, input_pack.text, 0)

    def _word_tokenization(self,
                           text: str) -> Iterator[Tuple[str, Tuple[int, int]]]:
        """
        This function should tokenize the text and return the tokenization
        results in the form of a word and the span of each word. A span is the
        begin and end of this word, indexed from 0, and end = begin + length
        of the word.

        By default, this calls the Texar's BasicTokenizer and then align the
        result back. You can implement this function if you prefer a
        different tokenizer.

        Args:
            text: Input text to be tokenized.

        Returns: A iterator of tokenization result in the form of triplets of
        (word, (begin, end)).
        """
        basic_tokens: List[str] = self.tokenizer.basic_tokenizer.tokenize(
            text, never_split=self.tokenizer.all_special_tokens)
        token_spans = self.aligner.align_with_segments(text, basic_tokens)

        for t, span in zip(basic_tokens, token_spans):
            if span is not None:
                yield t, span
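        # A subclass could override `_word_tokenization` with a different word
        # tokenizer, as long as it keeps the (word, (begin, end)) contract
        # described above. A minimal, hypothetical regex-based sketch
        # (assuming `re` is imported):
        #
        #     def _word_tokenization(self, text):
        #         for match in re.finditer(r"\S+", text):
        #             yield match.group(), (match.start(), match.end())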

    def _segment(self, pack: DataPack, text: str, segment_offset: int):
        if self.tokenizer.do_basic_tokenize:
            for token, (token_begin, _) in self._word_tokenization(text):
                assert token is not None
                self.__add_subwords(pack, text, token_begin + segment_offset)
        else:
            self.__add_subwords(pack, text, segment_offset)

    def __add_subwords(self, pack: DataPack, text: str, text_offset: int):
        if self.__do_lower_case:
            lower_text = text.lower()
            # See https://bugs.python.org/issue17252 for why the length is
            # checked here. tl;dr: lower-casing certain special unicode
            # strings can change their length due to unicode NFD
            # normalization.
            if len(lower_text) == len(text):
                text = lower_text

        for (
                subword,
                begin,
                end,
        ) in self.tokenizer.wordpiece_tokenizer.tokenize_with_span(text):
            subword_token = Subword(pack, begin + text_offset,
                                    end + text_offset)
            if subword == self.tokenizer.wordpiece_tokenizer.unk_token:
                subword_token.is_unk = True
            subword_token.is_first_segment = not subword.startswith("##")
            # pylint: disable=protected-access
            subword_token.vocab_id = self.tokenizer._map_token_to_id(subword)

    def record(self, record_meta: Dict[str, Set[str]]):
        r"""Method to add output type record of current processor
        to :attr:`forte.data.data_pack.Meta.record`.

        Args:
            record_meta: the field in the data pack storing type records needed
                in for consistency checking.
        Returns:

        """
        record_meta["ft.onto.base_ontology.Subword"] = {
            "is_unk",
            "is_first_segment",
            "vocab_id",
        }

    def expected_types_and_attributes(self) -> Dict[str, Set[str]]:
        r"""Method to add expected type for current processor input which
        would be checked before running the processor if
        the pipeline is initialized with
        `enforce_consistency=True` or
        :meth:`~forte.pipeline.Pipeline.enforce_consistency` was enabled for
        the pipeline.
        """
        expected_types: Dict[str, Set[str]] = {}
        if self.configs.token_source is not None:
            expected_types[self.configs.token_source] = set()
        elif self.configs.segment_unit is not None:
            expected_types[self.configs.segment_unit] = set()
        return expected_types

    @classmethod
    def default_configs(cls):
        """Returns the configuration with default values.

        Here:
          - `tokenizer_configs` contains all default
              hyper-parameters in
              :class:`~texar.torch.data.tokenizer.bert_tokenizer.BERTTokenizer`,
              this processor will pass on all the configurations to the
              tokenizer to create the tokenizer instance.

          - `segment_unit` contains an Annotation entry type used to split the
              text into smaller units. For example, setting this to
              `ft.onto.base_ontology.Sentence` will make this tokenizer do
              tokenization on a sentence base, which could be more efficient
              when the alignment is used.

          - `token_source` contains entry name of where the tokens come from.
               For example, setting this to `ft.onto.base_ontology.Token` will
               make this tokenizer split the sub-word based on this token. The
               default value will use `ft.onto.base_ontology.Token`. If this
               value is set to None, then it will use `word_tokenization`
               function of this class to do tokenization.

        Note that if `segment_unit` or `token_source` is provided, the
        :meth:`~forte.processors.base.base_processor.BaseProcessor.check_record`
         will check if certain types are written before this processor.

        Returns: Default configuration value for the tokenizer.
        """
        return {
            "tokenizer_configs": BERTTokenizer.default_hparams(),
            "segment_unit": None,
            "token_source": "ft.onto.base_ontology.Token",
            "@no_typecheck": "token_source",
        }
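
A hedged sketch of wiring this processor into a Forte pipeline; `StringReader`, the example config values, and the input sentence are illustrative assumptions rather than part of the code above:

    from forte.data.data_pack import DataPack
    from forte.data.readers import StringReader
    from forte.pipeline import Pipeline
    from ft.onto.base_ontology import Subword

    pipeline = Pipeline[DataPack]()
    pipeline.set_reader(StringReader())
    pipeline.add(
        SubwordTokenizer(),
        config={
            "tokenizer_configs": {"pretrained_model_name": "bert-base-uncased"},
            # StringReader produces no Token annotations, so let the processor
            # tokenize the whole pack text itself.
            "token_source": None,
        },
    )
    pipeline.initialize()

    pack = pipeline.process("Forte and Texar make BERT tokenization easy.")
    for subword in pack.get(Subword):
        print(subword.text, subword.vocab_id, subword.is_first_segment)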