class BertPreprocessor(Preprocessor):
    """Preprocessor for BERT embedding.

    This class can be used to do all the work to create the inputs (and outputs)
    of a Neural Network using BERT as embedding. Both single sequences and
    sequence pairs are supported.

    Source: https://github.com/google-research/bert_keras
    """

    def __init__(self, pretrained_model_path: str, **kwargs):
        """Load the vocabulary of the pretrained model and build the tokenizer.

        :param pretrained_model_path: Passed as ``spec`` to ``hub.Module``; the
            module's ``tokenization_info`` signature supplies the vocabulary
            file and the lower-casing flag.
        :param kwargs: Forwarded unchanged to the base class constructor.
        """
        # NOTE(review): ``self._max_seq_len`` is read by the other methods but
        # is not assigned here; presumably the base class sets it - confirm.
        super().__init__(**kwargs)

        info = hub.Module(spec=pretrained_model_path)(
            signature="tokenization_info", as_dict=True)
        with tf.Session() as sess:
            vocab_file, do_lower_case = sess.run(
                [info["vocab_file"], info["do_lower_case"]])

        # Create the tokenizer with the vocabulary of the pretrained model
        self._tokenizer = FullTokenizer(vocab_file=vocab_file,
                                        do_lower_case=do_lower_case)

        # Cache the ids of the two special tokens used to frame every input.
        basic_tokens = self._tokenizer.convert_tokens_to_ids(
            ["[CLS]", "[SEP]"])
        self._CLS_token = basic_tokens[0]
        self._SEP_token = basic_tokens[1]

    def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
        """Truncate a sequence pair in place to the maximum total length.

        :param tokens_a: Tokens of the first sequence (modified in place).
        :param tokens_b: Tokens of the second sequence (modified in place).
        :param max_length: Maximum allowed value of
            ``len(tokens_a) + len(tokens_b)``.
        """
        # This is a simple heuristic which will always truncate the longer
        # sequence one token at a time. This makes more sense than truncating
        # an equal percent of tokens from each, since if one sequence is very
        # short then each token that's truncated likely contains more
        # information than a longer sequence.
        while len(tokens_a) + len(tokens_b) > max_length:
            if len(tokens_a) > len(tokens_b):
                tokens_a.pop()
            else:
                tokens_b.pop()

    def _padding_sentence(self):
        """Return an all-zero sentence that can be used to pad the last batch.

        :return: Three sequences of zeros (tokens, masks, segment ids), each of
            length ``self._max_seq_len``.
        """
        return (
            [0] * self._max_seq_len,
            [0] * self._max_seq_len,
            [0] * self._max_seq_len,
        )

    def tokenize(self, text_a: str, text_b: str = None):
        """Convert sequence(s) of words into token ids, mask and segment ids.

        The layout is ``[CLS] text_a [SEP]`` for a single sequence and
        ``[CLS] text_a [SEP] text_b [SEP]`` for a pair, followed by zero
        padding up to ``self._max_seq_len``. For further details please read
        the BERT paper.

        :param text_a: First sequence
        :param text_b: Second sequence (optional)
        :return: The sequence of tokens, masks and segment ids, each a list of
            length ``self._max_seq_len``.
        :raises ValueError: If ``self._max_seq_len`` cannot even hold the
            special tokens.
        """
        max_len = self._max_seq_len
        tokens_a = self._tokenizer.tokenize(text_a)
        tokens_b = self._tokenizer.tokenize(text_b) if text_b else None

        # [CLS] and [SEP] are always present; a pair needs a second [SEP].
        num_special = 3 if tokens_b else 2
        if max_len < num_special:
            raise ValueError(
                "max_seq_len ({}) is too small to hold the {} special tokens"
                .format(max_len, num_special))

        if tokens_b:
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length fits once [CLS], [SEP], [SEP] are added ("- 3").
            self._truncate_seq_pair(tokens_a, tokens_b, max_len - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            tokens_a = tokens_a[:max_len - 2]

        # First segment: [CLS] text_a [SEP], all with segment id 0.
        input_ids = [self._CLS_token]
        input_ids.extend(self._tokenizer.convert_tokens_to_ids(tokens_a))
        input_ids.append(self._SEP_token)
        input_segment_ids = [0] * len(input_ids)

        # Second segment: text_b [SEP], all with segment id 1. Truncation may
        # have emptied `tokens_b`, in which case the single layout is kept.
        if tokens_b:
            ids_b = self._tokenizer.convert_tokens_to_ids(tokens_b)
            input_ids.extend(ids_b)
            input_ids.append(self._SEP_token)
            input_segment_ids.extend([1] * (len(ids_b) + 1))

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        num_real = len(input_ids)
        num_padding = max_len - num_real
        input_mask = [1] * num_real + [0] * num_padding
        input_ids.extend([0] * num_padding)
        input_segment_ids.extend([0] * num_padding)

        return input_ids, input_mask, input_segment_ids

    def fit(self, texts: List[str]) -> 'BertPreprocessor':
        """This function does nothing in case of BERT but must be implemented.

        :param texts: -
        :return: self
        """
        return self

    def transform(self, examples: List[InputExample]) -> list:
        """Transform examples into sequences of tokens, masks and segment ids.

        Masks are used to separate valid and padding tokens. Segment ids are 0
        for the first text (and for padding) and 1 for the second text of a
        pair. For further details please read the BERT paper.

        :param examples: The examples to convert; ``text_a`` and ``text_b`` of
            each example are passed to :meth:`tokenize`.
        :return: A list with three arrays: tokens, masks and segment ids.
        """
        input_ids, input_masks, segment_ids = [], [], []
        for example in examples:
            input_id, input_mask, segment_id = self.tokenize(
                text_a=example.text_a, text_b=example.text_b)
            input_ids.append(input_id)
            input_masks.append(input_mask)
            segment_ids.append(segment_id)

        return [
            np.array(input_ids),
            np.array(input_masks),
            np.array(segment_ids),
        ]

    def inverse_transform(self, sequences: np.ndarray):
        """Transform token ids back to tokens.

        The ids are handed to the tokenizer's ``convert_ids_to_tokens``
        unchanged.

        :param sequences: The token ids.
            NOTE(review): presumably a flat sequence of ids is expected by the
            tokenizer - confirm before passing a 2-D batch.
        :return: Whatever ``convert_ids_to_tokens`` returns for the given ids.
        """
        return self._tokenizer.convert_ids_to_tokens(sequences)
def main(args):
    """Run a masked-token prediction demo with a BERT checkpoint.

    The graph is built as three separately executed stages (embedding,
    transformer, language-model output) connected through placeholders. After
    restoring the checkpoint, one batch obtained from ``process_data`` is
    pushed through the stages, the elapsed time is printed and the top 5
    candidate tokens for every position to predict are printed per sentence.

    :param args: Parsed options; the attributes ``model_dir``, ``batch_size``,
        ``max_seq_length``, ``precision`` and ``effective_mode`` are read.
    """
    model_dir = args.model_dir
    ckpt_file = os.path.join(model_dir, "bert_model.ckpt")
    bert_config = BertConfig.from_json_file(
        os.path.join(model_dir, "bert_config.json"))
    # Both dropout probabilities are zeroed before any layer is built.
    bert_config.hidden_dropout_prob = 0.0
    bert_config.attention_probs_dropout_prob = 0.0

    batch_size, max_seq_len = args.batch_size, args.max_seq_length
    use_effective = args.effective_mode
    if args.precision == 'fp16':
        tf_dtype = tf.float16
    else:
        tf_dtype = tf.float32

    if use_effective:
        # load transformer weights *before* building the computation graph
        weights_value = load_transformer_weights(
            ckpt_file, bert_config, batch_size, max_seq_len, tf_dtype)

    # ---- placeholders connecting the three stages -------------------------
    hidden_size = bert_config.hidden_size
    input_ids_ph = tf.placeholder(
        shape=[batch_size, max_seq_len], dtype=tf.int32, name="input_ids")
    input_mask_ph = tf.placeholder(
        shape=[batch_size, max_seq_len], dtype=tf.int32, name="input_mask")
    attention_mask_ph = tf.placeholder(
        shape=[batch_size, max_seq_len, max_seq_len],
        dtype=tf_dtype,
        name="attention_mask")
    input_embedding_ph = tf.placeholder(
        shape=[batch_size, max_seq_len, hidden_size],
        dtype=tf_dtype,
        name="input_embedding")
    embedding_table_ph = tf.placeholder(
        shape=[bert_config.vocab_size, hidden_size],
        dtype=tf_dtype,
        name="embedding_table")
    transformer_output_ph = tf.placeholder(
        shape=[batch_size, max_seq_len, hidden_size],
        dtype=tf_dtype,
        name="transformer_output")

    # ---- layers ------------------------------------------------------------
    embedding_layer = EmbeddingLayer(bert_config, input_ids_ph)
    if use_effective:
        transformer_layer = EffectiveTransformerLayer(
            batch_size, max_seq_len, bert_config, attention_mask_ph,
            input_mask_ph, input_embedding_ph, weights_value)
    else:
        transformer_layer = TransformerLayer(
            bert_config, input_embedding_ph, input_mask_ph)
    output_layer = LanguageModelOutputLayer(
        bert_config, transformer_output_ph, embedding_table_ph)

    # The saver covers the trainable variables that exist at this point.
    saver = tf.train.Saver(
        tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES))

    session_config = tf.ConfigProto()
    jit_options = session_config.graph_options.optimizer_options
    jit_options.global_jit_level = tf.OptimizerOptions.ON_1

    with tf.Session(config=session_config) as sess:
        # restore embedding layer and output layer
        saver.restore(sess, ckpt_file)

        # process input data
        tokenizer = FullTokenizer(
            vocab_file=os.path.join(model_dir, 'vocab.txt'))
        input_ids, input_mask, input_text, to_predict = process_data(
            batch_size, max_seq_len, tokenizer)
        input_ids_tensor = tf.convert_to_tensor(input_ids, dtype=tf.int32)
        input_mask_tensor = tf.convert_to_tensor(input_mask, dtype=tf.int32)

        # predict (everything between `begin` and `end` is timed)
        begin = datetime.now()

        embedding_fetches = [
            embedding_layer.get_embedding_output(),
            embedding_layer.get_embedding_table(),
        ]
        input_embedding, embedding_table = sess.run(
            embedding_fetches, feed_dict={input_ids_ph: input_ids})

        attention_mask = sess.run(
            create_attention_mask_from_input_mask(input_ids_tensor,
                                                  input_mask_tensor))

        # Both transformer variants are fed the same three inputs.
        transformer_feed = {
            input_embedding_ph: input_embedding,
            attention_mask_ph: attention_mask,
            input_mask_ph: input_mask,
        }
        transformer_output = sess.run(
            transformer_layer.get_transformer_output(),
            feed_dict=transformer_feed)

        output_feed = {
            transformer_output_ph: transformer_output,
            embedding_table_ph: embedding_table,
        }
        probs = sess.run(output_layer.get_predict_probs(),
                         feed_dict=output_feed)

        end = datetime.now()
        print("time cost: ", (end - begin).total_seconds(), "s")

        # choose top k answers
        k = 5
        top_ids = np.argsort(-probs, axis=2)[:, :, :k]

        batch_results = []
        for sid, blank_ids in enumerate(to_predict):
            sentence_results = []
            for cid in blank_ids:
                # (token, probability) pairs for this position, best first
                candidates = [
                    (tokenizer.convert_ids_to_tokens([idx])[0],
                     probs[sid][cid][idx])
                    for idx in top_ids[sid][cid]
                ]
                sentence_results.append(candidates)
            batch_results.append(sentence_results)

        for text, blank_ids, sentence_results in zip(input_text, to_predict,
                                                     batch_results):
            print("Q:", text)
            for _, answers in zip(blank_ids, sentence_results):
                print("A:", answers)
class BertPreprocessor(Preprocessor):
    """Preprocessor for BERT embedding.

    This class can be used to do all the work to create the inputs (and outputs)
    of a Neural Network using BERT as embedding. Currently only single sequence
    classification is supported.
    """

    def __init__(self, pretrained_model_path: str, **kwargs):
        """Load the vocabulary of the pretrained model and build the tokenizer.

        :param pretrained_model_path: Passed as ``spec`` to ``hub.Module``; the
            module's ``tokenization_info`` signature supplies the vocabulary
            file and the lower-casing flag.
        :param kwargs: Forwarded unchanged to the base class constructor.
        """
        # NOTE(review): ``self._max_seq_len`` is read by the other methods but
        # is not assigned here; presumably the base class sets it - confirm.
        super().__init__(**kwargs)

        info = hub.Module(spec=pretrained_model_path)(
            signature="tokenization_info", as_dict=True)
        with tf.Session() as sess:
            vocab_file, do_lower_case = sess.run(
                [info["vocab_file"], info["do_lower_case"]])

        # Create the tokenizer with the vocabulary of the pretrained model
        self._tokenizer = FullTokenizer(vocab_file=vocab_file,
                                        do_lower_case=do_lower_case)

        # Cache the ids of the two special tokens used to frame every input.
        basic_tokens = self._tokenizer.convert_tokens_to_ids(
            ["[CLS]", "[SEP]"])
        self._CLS_token = basic_tokens[0]
        self._SEP_token = basic_tokens[1]

    def _padding_sentence(self):
        """Return an all-zero sentence that can be used to pad the last batch.

        :return: Three sequences of zeros (tokens, masks, segment ids), each of
            length ``self._max_seq_len``.
        """
        return (
            [0] * self._max_seq_len,
            [0] * self._max_seq_len,
            [0] * self._max_seq_len,
        )

    def tokenize(self, text: str):
        """Convert a sequence of words into token ids, mask and segment ids.

        The layout is ``[CLS] text [SEP]`` followed by zero padding up to
        ``self._max_seq_len``. For further details please read the BERT paper.

        :param text: The sequence of words.
        :return: The sequence of tokens, masks and segment ids, each a list of
            length ``self._max_seq_len``.
        :raises ValueError: If ``self._max_seq_len`` cannot even hold the two
            special tokens.
        """
        max_len = self._max_seq_len
        if max_len < 2:
            raise ValueError(
                "max_seq_len ({}) is too small to hold [CLS] and [SEP]"
                .format(max_len))

        # if too long cut to size (the first token will be [CLS], the last [SEP])
        tokens_input = self._tokenizer.tokenize(text)[:max_len - 2]

        input_ids = [self._CLS_token]
        input_ids.extend(self._tokenizer.convert_tokens_to_ids(tokens_input))
        input_ids.append(self._SEP_token)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        num_real = len(input_ids)
        num_padding = max_len - num_real
        input_mask = [1] * num_real + [0] * num_padding
        input_ids.extend([0] * num_padding)

        # A single sequence lives entirely in segment 0.
        input_segment_ids = [0] * max_len

        return input_ids, input_mask, input_segment_ids

    def fit(self, texts: List[str]) -> 'BertPreprocessor':
        """This function does nothing in case of BERT but must be implemented.

        :param texts: -
        :return: self
        """
        return self

    def transform(self, texts: List[str]) -> list:
        """Transform sequences of words into tokens, masks and segment ids.

        Masks are used to separate valid and padding tokens. Here the segment
        ids are always zero since the whole sequence belongs together. For
        further details please read the BERT paper.

        The texts are tokenized in a pool of 8 worker processes, which is shut
        down again before returning.

        :param texts: The sequences of texts.
        :return: A list of three tuples (tokens, masks, segment ids), each
            holding one list per text; three empty tuples for empty input.
        """
        # zip(*[]) yields nothing to unpack, so handle empty input explicitly.
        if len(texts) == 0:
            return [(), (), ()]

        # The context manager releases the worker processes; `map` blocks
        # until every result is available, so nothing is lost on exit.
        with Pool(processes=8) as pool:
            tokenized = pool.map(self.tokenize, texts)

        input_ids, input_masks, segment_ids = zip(*tokenized)
        return [input_ids, input_masks, segment_ids]

    def inverse_transform(self, sequences: np.ndarray):
        """Transform token ids back to tokens.

        The ids are handed to the tokenizer's ``convert_ids_to_tokens``
        unchanged.

        :param sequences: The token ids.
            NOTE(review): presumably a flat sequence of ids is expected by the
            tokenizer - confirm before passing a 2-D batch.
        :return: Whatever ``convert_ids_to_tokens`` returns for the given ids.
        """
        return self._tokenizer.convert_ids_to_tokens(sequences)