Example #1
from typing import List

import numpy as np
import tensorflow as tf  # TF1.x-style API (tf.Session, placeholders)
import tensorflow_hub as hub
from bert.tokenization import FullTokenizer  # tokenizer from google-research/bert

# `Preprocessor` (the base class) and `InputExample` are assumed to come from
# the surrounding project / the BERT codebase and are not shown in this listing.


class BertPreprocessor(Preprocessor):
    """Preprocessor for BERT embedding.

    This class does the work of creating the inputs (and outputs) of a neural network that uses BERT
    as its embedding. It supports single sequences as well as sequence pairs.

    Source: https://github.com/google-research/bert_keras
    """
    def __init__(self, pretrained_model_path: str, **kwargs):

        super().__init__(**kwargs)

        info = hub.Module(spec=pretrained_model_path)(
            signature="tokenization_info", as_dict=True)

        with tf.Session() as sess:
            vocab_file, do_lower_case = sess.run(
                [info["vocab_file"], info["do_lower_case"]])

        # Create the tokenizer with the vocabulary of the pretrained model
        self._tokenizer = FullTokenizer(vocab_file=vocab_file,
                                        do_lower_case=do_lower_case)

        basic_tokens = self._tokenizer.convert_tokens_to_ids(
            ["[CLS]", "[SEP]"])
        self._CLS_token = basic_tokens[0]
        self._SEP_token = basic_tokens[1]

    def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
        """Truncates a sequence pair in place to the maximum length."""

        # This is a simple heuristic which will always truncate the longer sequence
        # one token at a time. This makes more sense than truncating an equal percent
        # of tokens from each, since if one sequence is very short then each token
        # that's truncated likely contains more information than a longer sequence.
        while True:
            total_length = len(tokens_a) + len(tokens_b)
            if total_length <= max_length:
                break
            if len(tokens_a) > len(tokens_b):
                tokens_a.pop()
            else:
                tokens_b.pop()
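        # Worked example (illustrative, not from the source): with
        # max_length=8, tokens_a of length 6 and tokens_b of length 5 shrink
        # to lengths 4 and 4: the longer sequence (tokens_b on ties) loses
        # one token per iteration until the combined length fits.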

    def _padding_sentence(self):
        """Return a zero length sentence to pad last batch.

        :return: Three sequences of zeros (tokens, masks, segment ids).
        """

        return ([0] * self._max_seq_len, [0] * self._max_seq_len,
                [0] * self._max_seq_len)

    def tokenize(self, text_a: str, text_b: str = None):
        """Convert sequence(s) of words into sequence(s) of tokens and also compute the masking- and segment ids.

        For further details please read BERT paper.

        :param text_a: First sequence
        :param text_b: Second sequence
        :return: The sequence of tokens, masks and segment ids.
        """

        input_ids = [0] * self._max_seq_len
        # The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to.
        input_mask = [0] * self._max_seq_len
        # The segment ids are 0 for text_a and 1 for text_b
        input_segment_ids = [0] * self._max_seq_len

        tokens_a = self._tokenizer.tokenize(text_a)
        tokens_b = None
        if text_b:
            tokens_b = self._tokenizer.tokenize(text_b)

        if tokens_b:
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            self._truncate_seq_pair(tokens_a, tokens_b, self._max_seq_len - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > self._max_seq_len - 2:
                tokens_a = tokens_a[0:(self._max_seq_len - 2)]

        # Layout: [CLS] tokens_a [SEP] for single sequences and
        # [CLS] tokens_a [SEP] tokens_b [SEP] for pairs.
        idx = 0
        input_ids[idx] = self._CLS_token
        idx += 1

        for element in self._tokenizer.convert_tokens_to_ids(tokens_a):
            input_ids[idx] = element
            idx += 1

        # The first segment is closed by a [SEP] token (segment id 0).
        input_ids[idx] = self._SEP_token
        idx += 1

        if tokens_b:
            for element in self._tokenizer.convert_tokens_to_ids(tokens_b):
                input_ids[idx] = element
                input_segment_ids[idx] = 1
                idx += 1
            # The second segment gets its own closing [SEP] (segment id 1).
            input_ids[idx] = self._SEP_token
            input_segment_ids[idx] = 1
            idx += 1

        # The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to.
        for i in range(idx):
            input_mask[i] = 1

        # safety check
        assert len(input_ids) == self._max_seq_len
        assert len(input_mask) == self._max_seq_len
        assert len(input_segment_ids) == self._max_seq_len

        return input_ids, input_mask, input_segment_ids

    def fit(self, texts: List[str]) -> 'BertPreprocessor':
        """This function does nothing in case of BERT but must be implemented.

        :param texts: -
        :return: self
        """

        return self

    def transform(self, examples: List[InputExample]) -> list:
        """Transform sequences of words into sequences of tokens, masks and segment ids.

        Masks are used to separate valid and padding tokens. Here the segment ids are always one since the whole
        sequence belongs together.

        For further details please read BERT paper.

        :param texts: The sequences of texts.
        :return: The sequences of tokens, masks and segment ids.
        """

        input_ids, input_masks, segment_ids = [], [], []

        for example in examples:
            input_id, input_mask, segment_id = self.tokenize(
                text_a=example.text_a, text_b=example.text_b)
            input_ids.append(input_id)
            input_masks.append(input_mask)
            segment_ids.append(segment_id)

        return [
            np.array(input_ids),
            np.array(input_masks),
            np.array(segment_ids)
        ]

    def inverse_transform(self, sequences: np.ndarray):
        """Transform sequences of tokens back to sequences of words (sentences).

        :param sequences: The sequences of tokens.
        :return: The sequences of words
        """

        # convert_ids_to_tokens expects a flat sequence of ids, so map it over each row.
        return [
            self._tokenizer.convert_ids_to_tokens(sequence)
            for sequence in sequences
        ]
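
A minimal usage sketch for the class above (not part of the original listing). The TF-Hub URL, the max_seq_len keyword consumed by the Preprocessor base class, and the InputExample constructor are assumptions.

# Hypothetical usage; see the caveats above.
preprocessor = BertPreprocessor(
    pretrained_model_path="https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1",
    max_seq_len=128)  # assumed kwarg handled by the Preprocessor base class

examples = [InputExample(guid="0",
                         text_a="The cat sat on the mat.",
                         text_b="A feline was resting.")]
input_ids, input_masks, segment_ids = preprocessor.transform(examples)
print(input_ids.shape)  # (1, 128)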
Example #2

import os
from datetime import datetime

import numpy as np
import tensorflow as tf  # TF1.x-style API
from bert.modeling import BertConfig, create_attention_mask_from_input_mask
from bert.tokenization import FullTokenizer

# `load_transformer_weights`, `process_data`, `EmbeddingLayer`,
# `TransformerLayer`, `EffectiveTransformerLayer` and
# `LanguageModelOutputLayer` are project-local helpers not shown here.


def main(args):
    checkpoint_path = os.path.join(args.model_dir, "bert_model.ckpt")

    bert_config = BertConfig.from_json_file(
        os.path.join(args.model_dir, "bert_config.json"))
    # Dropout is disabled because the model is only used for inference.
    bert_config.hidden_dropout_prob = 0.0
    bert_config.attention_probs_dropout_prob = 0.0

    batch_size = args.batch_size
    max_seq_len = args.max_seq_length
    tf_dtype = tf.float16 if args.precision == 'fp16' else tf.float32

    if args.effective_mode:
        # load transformer weights *before* building the computation graph
        weights_value = load_transformer_weights(checkpoint_path, bert_config,
                                                 batch_size, max_seq_len,
                                                 tf_dtype)

    # build model
    input_ids_placeholder = tf.placeholder(shape=[batch_size, max_seq_len],
                                           dtype=tf.int32,
                                           name="input_ids")
    input_mask_placeholder = tf.placeholder(shape=[batch_size, max_seq_len],
                                            dtype=tf.int32,
                                            name="input_mask")
    attention_mask_placeholder = tf.placeholder(
        shape=[batch_size, max_seq_len, max_seq_len],
        dtype=tf_dtype,
        name="attention_mask")
    input_embedding_placeholder = tf.placeholder(
        shape=[batch_size, max_seq_len, bert_config.hidden_size],
        dtype=tf_dtype,
        name="input_embedding")
    embedding_table_placeholder = tf.placeholder(
        shape=[bert_config.vocab_size, bert_config.hidden_size],
        dtype=tf_dtype,
        name="embedding_table")
    transformer_output_placeholder = tf.placeholder(
        shape=[batch_size, max_seq_len, bert_config.hidden_size],
        dtype=tf_dtype,
        name="transformer_output")

    embedding_layer = EmbeddingLayer(bert_config, input_ids_placeholder)
    if args.effective_mode:
        effective_transformer_layer = EffectiveTransformerLayer(
            batch_size, max_seq_len, bert_config, attention_mask_placeholder,
            input_mask_placeholder, input_embedding_placeholder, weights_value)
    else:
        standard_transformer_layer = TransformerLayer(
            bert_config, input_embedding_placeholder, input_mask_placeholder)
    output_layer = LanguageModelOutputLayer(bert_config,
                                            transformer_output_placeholder,
                                            embedding_table_placeholder)

    # model saver
    variables_to_restore = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
    saver = tf.train.Saver(variables_to_restore)

    config = tf.ConfigProto()
    config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
    with tf.Session(config=config) as sess:
        # restore embedding layer and output layer
        saver.restore(sess, checkpoint_path)

        # process input data
        tokenizer = FullTokenizer(
            vocab_file=os.path.join(args.model_dir, 'vocab.txt'))
        input_ids, input_mask, input_text, to_predict = process_data(
            batch_size, max_seq_len, tokenizer)
        input_ids_tensor = tf.convert_to_tensor(input_ids, dtype=tf.int32)
        input_mask_tensor = tf.convert_to_tensor(input_mask, dtype=tf.int32)

        # predict
        begin = datetime.now()
        input_embedding, embedding_table = sess.run(
            [
                embedding_layer.get_embedding_output(),
                embedding_layer.get_embedding_table()
            ],
            feed_dict={input_ids_placeholder: input_ids})
        attention_mask = sess.run(
            create_attention_mask_from_input_mask(input_ids_tensor,
                                                  input_mask_tensor))
        if args.effective_mode:
            transformer_output = sess.run(
                effective_transformer_layer.get_transformer_output(),
                feed_dict={
                    input_embedding_placeholder: input_embedding,
                    attention_mask_placeholder: attention_mask,
                    input_mask_placeholder: input_mask
                })
        else:
            transformer_output = sess.run(
                standard_transformer_layer.get_transformer_output(),
                feed_dict={
                    input_embedding_placeholder: input_embedding,
                    attention_mask_placeholder: attention_mask,
                    input_mask_placeholder: input_mask
                })
        probs = sess.run(output_layer.get_predict_probs(),
                         feed_dict={
                             transformer_output_placeholder:
                             transformer_output,
                             embedding_table_placeholder: embedding_table
                         })
        end = datetime.now()
        print("time cost: ", (end - begin).total_seconds(), "s")

        # choose top k answers
        k = 5
        top_ids = np.argsort(-probs, axis=2)[:, :, :k]

        batch_results = []
        for sid, blank_ids in enumerate(to_predict):
            sentence_results = []
            for cid in blank_ids:
                result = []
                for idx in top_ids[sid][cid]:
                    token = tokenizer.convert_ids_to_tokens([idx])[0]
                    result.append((token, probs[sid][cid][idx]))
                sentence_results.append(result)
            batch_results.append(sentence_results)

    for text, blank_ids, sentence_results in zip(input_text, to_predict,
                                                 batch_results):
        print("Q:", text)
        for cid, result in zip(blank_ids, sentence_results):
            print("A:", result)
Example #3
from multiprocessing import Pool
from typing import List

import numpy as np
import tensorflow as tf  # TF1.x-style API (tf.Session)
import tensorflow_hub as hub
from bert.tokenization import FullTokenizer  # tokenizer from google-research/bert

# `Preprocessor` (the base class) is assumed to come from the surrounding
# project and is not shown in this listing.


class BertPreprocessor(Preprocessor):
    """Preprocessor for BERT embedding.

    This class does the work of creating the inputs (and outputs) of a neural network that uses BERT
    as its embedding. Currently only single-sequence classification is supported.
    """

    def __init__(self,
                 pretrained_model_path: str,
                 **kwargs):
        super().__init__(**kwargs)

        info = hub.Module(spec=pretrained_model_path)(signature="tokenization_info", as_dict=True)

        with tf.Session() as sess:
            vocab_file, do_lower_case = sess.run(
                [
                    info["vocab_file"],
                    info["do_lower_case"]
                ]
            )

        # Create the tokenizer with the vocabulary of the pretrained model
        self._tokenizer = FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

        basic_tokens = self._tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]"])
        self._CLS_token = basic_tokens[0]
        self._SEP_token = basic_tokens[1]

    def _padding_sentence(self):
        """Return a zero length sentence to pad last batch.

        :return: Three sequences of zeros (tokens, masks, segment ids).
        """

        return [0] * self._max_seq_len, [0] * self._max_seq_len, [0] * self._max_seq_len

    def tokenize(self, text: str):
        """Convert a sequence of words into a sequence of tokens and also compute the masking- and segment ids.

        For further details please read BERT paper.

        :param text: The sequence of words.
        :return: The sequence of tokens, masks and segment ids.
        """

        input_ids = [0] * self._max_seq_len
        input_mask = [0] * self._max_seq_len
        input_segment_ids = [0] * self._max_seq_len

        tokens_input = self._tokenizer.tokenize(text)

        # if too long cut to size (the first token will be [CLS], the last [SEP])
        if len(tokens_input) > self._max_seq_len - 2:
            tokens_input = tokens_input[0: (self._max_seq_len - 2)]

        idx = 0
        input_ids[idx] = self._CLS_token
        idx += 1

        for element in self._tokenizer.convert_tokens_to_ids(tokens_input):
            input_ids[idx] = element
            idx += 1

        input_ids[idx] = self._SEP_token

        # The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to.
        for i in range(idx + 1):
            input_mask[i] = 1

        # safety check
        assert len(input_ids) == self._max_seq_len
        assert len(input_mask) == self._max_seq_len
        assert len(input_segment_ids) == self._max_seq_len

        return input_ids, input_mask, input_segment_ids

    def fit(self, texts: List[str]) -> 'BertPreprocessor':
        """This function does nothing in case of BERT but must be implemented.

        :param texts: -
        :return: self
        """

        return self

    def transform(self, texts: List[str]) -> list:
        """Transform sequences of words into sequences of tokens, masks and segment ids.

        Masks are used to separate valid and padding tokens. Here the segment ids are always zero since the whole
        sequence belongs together.

        For further details please read BERT paper.

        :param texts: The sequences of texts.
        :return: The sequences of tokens, masks and segment ids.
        """
        
        # Tokenize in parallel; assumes `self.tokenize` (and the underlying
        # FullTokenizer) can be pickled and shipped to the worker processes.
        with Pool(processes=8) as pool:
            input_ids, input_masks, segment_ids = zip(
                *pool.map(self.tokenize, texts))

        return [
            np.array(input_ids),
            np.array(input_masks),
            np.array(segment_ids)
        ]

    def inverse_transform(self, sequences: np.ndarray):
        """Transform sequences of tokens back to sequences of words (sentences).

        :param sequences: The sequences of tokens.
        :return: The sequences of words
        """

        # convert_ids_to_tokens expects a flat sequence of ids, so map it over each row.
        return [
            self._tokenizer.convert_ids_to_tokens(sequence)
            for sequence in sequences
        ]
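
A small usage sketch for this parallel variant, under the same assumptions as before (illustrative hub URL and max_seq_len keyword); multiprocessing additionally requires the preprocessor to be picklable.

# Hypothetical usage of the Pool-based transform.
preprocessor = BertPreprocessor(
    pretrained_model_path="https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1",
    max_seq_len=64)

ids, masks, segments = preprocessor.transform(
    ["first sentence", "second sentence"])
print(ids.shape)  # (2, 64)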