def tokenize_text_pairs_for_bert(text_pairs: List[Tuple[str, str]], bert_tokenizer: FullTokenizer) -> \
        List[Tuple[Sequence[int], int]]:
    """Convert (left, right) text pairs into BERT sub-token ID sequences.

    Each pair is tokenized so the combined sequence looks like
    ``[CLS] <left tokens> [SEP] <right tokens>``: the left text always gets
    the surrounding special tokens, while special tokens the tokenizer may
    have produced for the right text are stripped to avoid duplicates.

    Args:
        text_pairs: list of ``(left_text, right_text)`` string pairs.
        bert_tokenizer: WordPiece tokenizer providing ``tokenize`` and
            ``convert_tokens_to_ids``.

    Returns:
        One ``(token_ids, left_length)`` tuple per input pair, where
        ``token_ids`` is an ``array('l')`` of sub-token IDs and
        ``left_length`` counts the left-text tokens (including
        ``[CLS]``/``[SEP]``). Pairs longer than ``MAX_SEQ_LENGTH`` yield an
        empty array and length 0, with a warning.
    """
    res = []
    for left_text, right_text in text_pairs:
        tokenized_left_text = bert_tokenizer.tokenize(left_text)
        # FIX: guard empty tokenizations — the original indexed [0]/[-1]
        # unconditionally and raised IndexError on an empty token list.
        if not tokenized_left_text or tokenized_left_text[0] != '[CLS]':
            tokenized_left_text = ['[CLS]'] + tokenized_left_text
        if tokenized_left_text[-1] != '[SEP]':
            tokenized_left_text = tokenized_left_text + ['[SEP]']
        tokenized_right_text = bert_tokenizer.tokenize(right_text)
        if tokenized_right_text and tokenized_right_text[0] == '[CLS]':
            tokenized_right_text = tokenized_right_text[1:]
        if tokenized_right_text and tokenized_right_text[-1] == '[SEP]':
            tokenized_right_text = tokenized_right_text[:-1]
        tokenized_text = tokenized_left_text + tokenized_right_text
        if len(tokenized_text) > MAX_SEQ_LENGTH:
            warnings.warn(
                "The text pair `{0}` - `{1}` contains too many sub-tokens!".
                format(left_text, right_text))
            # Over-long pairs are flagged, not silently truncated.
            res.append((array.array("l"), 0))
        else:
            token_IDs = bert_tokenizer.convert_tokens_to_ids(tokenized_text)
            res.append((array.array("l", token_IDs), len(tokenized_left_text)))
    return res
class IntentDetection:
    """Intent classifier: a BERT WordPiece tokenizer feeding a saved Keras
    model that scores 7 fixed intent classes."""

    def __init__(self):
        # The saved model expects a fixed-length input of 38 token IDs.
        self.MAX_SEQ_LEN = 38
        self.modelDir = 'saved_model/1'
        self.vocabDir = 'config/vocab.txt'
        # Index order must match the model's output layer ordering.
        self.classes = ['PlayMusic', 'AddToPlaylist', 'RateBook',
                        'SearchScreeningEvent', 'BookRestaurant',
                        'GetWeather', 'SearchCreativeWork']
        self.tokenizer = FullTokenizer(vocab_file=self.vocabDir)
        print("============load model start=============")
        self.model = self.loadModel()
        print("============load model success=============")

    def loadModel(self):
        """Load and return the Keras SavedModel from ``self.modelDir``."""
        return tf.keras.models.load_model(self.modelDir)

    def predict(self, sentence):
        """Return the predicted intent class name for ``sentence``.

        The sentence is wrapped in ``[CLS]``/``[SEP]``, converted to IDs,
        and truncated or zero-padded to exactly ``MAX_SEQ_LEN`` tokens.
        """
        pred_tokens = self.tokenizer.tokenize(sentence)
        pred_tokens = ["[CLS]"] + pred_tokens + ["[SEP]"]
        pred_token_ids = list(self.tokenizer.convert_tokens_to_ids(pred_tokens))
        # FIX: the original only padded. For a sentence longer than
        # MAX_SEQ_LEN, [0] * (negative) == [] so the sequence kept its full
        # length and broke the model's fixed input shape. Truncate first.
        if len(pred_token_ids) > self.MAX_SEQ_LEN:
            pred_token_ids = pred_token_ids[:self.MAX_SEQ_LEN]
        else:
            pred_token_ids = pred_token_ids + [0]*(self.MAX_SEQ_LEN-len(pred_token_ids))
        pred_token_ids = np.array(pred_token_ids)
        # Add the batch dimension: shape (1, MAX_SEQ_LEN).
        pred_token_ids = np.expand_dims(pred_token_ids, axis=0)
        predictions = self.model.predict(pred_token_ids).argmax(axis=-1)
        return self.classes[predictions[0]]
def test_direct_keras_to_stock_compare(self):
    """Run one short example through the stock BERT and the Keras port and
    assert their outputs agree element-wise to 1e-6."""
    from tests.ext.modeling import BertModel, BertConfig, get_assignment_map_from_checkpoint
    bert_config = BertConfig.from_json_file(self.bert_config_file)
    tokenizer = FullTokenizer(
        vocab_file=os.path.join(self.bert_ckpt_dir, "vocab.txt"))

    # Build a single [CLS] ... [SEP] example, zero-padded to max_seq_len.
    max_seq_len = 6
    input_str = "Hello, Bert!"
    input_tokens = ["[CLS]"] + tokenizer.tokenize(input_str) + ["[SEP]"]
    pad = max_seq_len - len(input_tokens)
    input_ids = np.array(
        [tokenizer.convert_tokens_to_ids(input_tokens) + [0] * pad],
        dtype=np.int32)
    input_mask = np.array([[1] * len(input_tokens) + [0] * pad], dtype=np.int32)
    # Single-segment input: every position gets segment 0.
    token_type_ids = np.array([[0] * len(input_tokens) + [0] * pad], dtype=np.int32)

    print(" tokens:", input_tokens)
    print(
        "input_ids:{}/{}:{}".format(len(input_tokens), max_seq_len,
                                    input_ids), input_ids.shape, token_type_ids)

    s_res = self.predict_on_stock_model(input_ids, input_mask, token_type_ids)
    k_res = self.predict_on_keras_model(input_ids, input_mask, token_type_ids)

    np.set_printoptions(precision=9, threshold=20, linewidth=200,
                        sign="+", floatmode="fixed")
    print("s_res", s_res.shape)
    print("k_res", k_res.shape)
    print("s_res:\n {}".format(s_res[0, :2, :10]), s_res.dtype)
    print("k_res:\n {}".format(k_res[0, :2, :10]), k_res.dtype)

    # Report the worst absolute deviation before asserting closeness.
    adiff = np.abs(s_res - k_res).flatten()
    print("diff:", np.max(adiff), np.argmax(adiff))
    self.assertTrue(np.allclose(s_res, k_res, atol=1e-6))
# NOTE(review): this chunk is whitespace-mangled and truncated at both ends —
# the leading `return np.array(x), np.array(y)` belongs to a method whose
# header is outside this view, and it ends mid-statement
# (`with open(...) as json_data:`), so it is kept byte-identical rather than
# restyled. Visible contents: a `_pad` method that truncates each token-id
# list to at most `max_seq_len - 2` entries and zero-pads it to
# `max_seq_len`; script code building a FullTokenizer from the uncased BERT
# vocab; label extraction from `train`; construction of an
# IntentDetectionData with max_seq_len=128; and loading of response/intent
# data files (presumably for a chatbot — confirm against the full source).
return np.array(x), np.array(y) def _pad(self, ids): x = [] for input_ids in ids: input_ids = input_ids[:min(len(input_ids), self.max_seq_len - 2)] input_ids = input_ids + [0] * (self.max_seq_len - len(input_ids)) x.append(np.array(input_ids)) return np.array(x) tokenizer = FullTokenizer(vocab_file=os.path.join(main_path+"uncased_L-12_H-768_A-12/vocab.txt")) tokenizer.tokenize("I can't wait to visit Bulgaria again!") classes = train.labels.unique().tolist() data = IntentDetectionData(train, test, tokenizer, classes, max_seq_len=128) responses=pd.read_csv(main_path+'response.csv') # import our chat-bot intents file import json with open(main_path+'newintents2.json') as json_data:
class TfHubBert(Model):
    """Keras model wrapping the TF-Hub uncased BERT-base encoder.

    Exposes helpers to turn raw text (chunked to the BERT sequence length)
    into (ids, masks, segments) model inputs, and returns the encoder's
    pooled output from ``call``.
    """

    def __init__(self, max_seq_length, trainable, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.bert_layer = hub.KerasLayer(
            "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
            trainable=trainable)
        # Pull the vocabulary and casing flag out of the hub module so the
        # tokenizer is guaranteed to match the encoder.
        vocab_file = self.bert_layer.resolved_object.vocab_file.asset_path.numpy()
        do_lower_case = self.bert_layer.resolved_object.do_lower_case.numpy()
        self.tokenizer = FullTokenizer(vocab_file, do_lower_case)
        self.max_seq_length = max_seq_length

    def prep(self, s, get='id'):
        """Tokenize one string and return its ids, mask, or segments.

        ``get`` selects the view: 'id' -> token IDs, 'mask' -> attention
        mask, anything else -> segment IDs.
        """
        stokens = self.tokenizer.tokenize(s)
        stokens = ["[CLS]"] + stokens + ["[SEP]"]
        if get == 'id':
            return get_ids(stokens, self.tokenizer, self.max_seq_length)
        elif get == 'mask':
            return get_masks(stokens, self.max_seq_length)
        else:
            return get_segments(stokens, self.max_seq_length)

    def call(self, inputs, training=None, mask=None):
        """Forward pass: return BERT's pooled [CLS] output for
        (ids, masks, segments) inputs."""
        pooled_output, _ = self.bert_layer([inputs[0], inputs[1], inputs[2]],
                                           training=training)
        return pooled_output

    def text_to_bert_input(self, text):
        """Split ``text`` into BERT_SEQ_LENGTH-token chunks and encode each.

        Returns three parallel lists (ids, masks, segments), one entry per
        chunk.
        """
        stokens1 = self.tokenizer.tokenize(text)
        tokens = list(chunks(stokens1, utils.BERT_SEQ_LENGTH))
        input_ids1 = []
        input_masks1 = []
        input_segments1 = []
        for stok in tokens:
            stokens1 = ["[CLS]"] + stok + ["[SEP]"]
            input_ids1.append(
                get_ids(stokens1, self.tokenizer, self.max_seq_length))
            input_masks1.append(get_masks(stokens1, self.max_seq_length))
            input_segments1.append(get_segments(stokens1, self.max_seq_length))
        return input_ids1, input_masks1, input_segments1

    def dataframe_to_bert_input(self, df):
        """Encode every row's "opinion" text; replicate its "outcome" label
        once per generated chunk. Returns ([ids, masks, segments], ys)."""
        input_word_ids = []
        input_mask = []
        segment_ids = []
        ys = []
        for i, row in df.iterrows():
            a, b, c = self.text_to_bert_input(row["opinion"])
            # One label per chunk, since a long text expands to many chunks.
            ys = ys + [row["outcome"]] * len(a)
            input_word_ids = input_word_ids + a
            input_mask = input_mask + b
            segment_ids = segment_ids + c
        input_word_ids = np.array(input_word_ids)
        input_mask = np.array(input_mask)
        segment_ids = np.array(segment_ids)
        return [input_word_ids, input_mask, segment_ids], np.array(ys)

    def get_predictor(self):
        """Return a closure mapping raw text (or a list of texts) to model
        predictions."""
        def bert_predict(text):
            if isinstance(text, list):
                res = []
                for t in text:
                    a, b, c = self.text_to_bert_input(t)
                    x = [np.array([a]), np.array([b]), np.array([c])]
                    res.append(self.predict(x)[0])
                return np.array(res)
            else:
                a, b, c = self.text_to_bert_input(text)
                x = [np.array([a]), np.array([b]), np.array([c])]
                return self.predict(x)
        return bert_predict

    def compile(self, optimizer=None, loss=None, metrics=None,
                loss_weights=None, sample_weight_mode=None,
                weighted_metrics=None, **kwargs):
        """Compile with Adam (lr=3e-5) and sparse categorical cross-entropy
        by default; accuracy is always tracked.

        FIX: the original constructed the Adam optimizer and loss objects in
        the default-argument list, so one stateful optimizer instance was
        created at class-definition time and shared by every call and every
        model instance. Build them per call instead.
        """
        if optimizer is None:
            optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5,
                                                 epsilon=1e-08, clipnorm=1.0)
        if loss is None:
            loss = tf.keras.losses.SparseCategoricalCrossentropy()
        super().compile(
            optimizer=optimizer,
            loss=loss,
            metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])
# Prediction script: load a fine-tuned BERT classifier and label every
# article in the validation CSV as 'true'/'false' (hyperpartisan or not).
bert_model_name = "uncased_L-12_H-768_A-12"
bert_ckpt_dir = os.path.join("model/", bert_model_name)
tokenizer = FullTokenizer(vocab_file=os.path.join(bert_ckpt_dir, "vocab.txt"))

print('Loading the model')
model = load_model('saved_models/tensorflow_10000.h5',
                   custom_objects={'BertModelLayer': BertModelLayer})
print('Model is loaded')
print(model.summary())

test = pd.read_csv('articles-validation-bypublisher.csv')
conlist = test['content'].tolist()
idlist = test['id'].tolist()
print('conlist', len(conlist), 'idlist', len(idlist))

resList = []
for article_id, content in tqdm(zip(idlist, conlist)):
    pred_tokens = ["[CLS]"] + tokenizer.tokenize(content) + ["[SEP]"]
    pred_token_ids = list(tokenizer.convert_tokens_to_ids(pred_tokens))
    # The model takes a fixed 512-token input: clamp long articles, pad
    # short ones with zeros.
    if len(pred_token_ids) >= 512:
        pred_token_ids = pred_token_ids[:512]
    else:
        pred_token_ids = pred_token_ids + [0] * (512 - len(pred_token_ids))
    batch = np.array([pred_token_ids])
    predictions = model.predict(batch).argmax(axis=-1)
    label = 'true' if predictions[0] else 'false'
    resList.append(str(article_id) + ' ' + label)
# NOTE(review): this chunk is whitespace-mangled and truncated — the trailing
# `create_model` is cut off after its first debug print, so the chunk is kept
# byte-identical rather than restyled. Visible contents: a `_pad` method
# (enclosing class header outside this view) that truncates each token-id
# list to at most `max_seq_len - 2` entries and zero-pads it to
# `max_seq_len`; a smoke test tokenizing a Kannada phrase and printing its
# sub-tokens and IDs; and the opening of `create_model`, which reads the BERT
# config, maps it to bert-for-tf2 params (adapters disabled), builds a
# BertModelLayer, and declares the `input_ids` Keras input.
def _pad(self, ids): x = [] for input_ids in ids: input_ids = input_ids[:min(len(input_ids), self.max_seq_len -2)] input_ids = input_ids + [0] * (self.max_seq_len - len(input_ids)) x.append(np.array(input_ids)) return np.array(x) tokenizer = FullTokenizer(vocab_file=os.path.join(bert_ckpt_dir, 'vocab.txt')) t = tokenizer.tokenize('ಶುಭ ದಿನ') print(t) ds = tokenizer.convert_tokens_to_ids(t) print(ds) def create_model(max_seq_len, bert_ckpt_file): with tf.io.gfile.GFile(bert_config_file, 'r') as reader: bc = StockBertConfig.from_json_string(reader.read()) bert_params = map_stock_config_to_params(bc) bert_params.adapter_size = None bert = BertModelLayer.from_params(bert_params, name='bert') input_ids = keras.layers.Input(shape=(max_seq_len, ), dtype='int32', name='input_ids') print("--------------->", input_ids)
# NOTE(review): this chunk is whitespace-mangled and truncated at both ends —
# the leading `x.append(...)`/`return` lines belong to a method whose header
# is outside this view, and `create_model` is cut off right after
# `bert_output = bert(input_ids)` — so it is kept byte-identical rather than
# restyled. Visible contents: the tail of a feature-building method (appends
# token IDs and class-index labels, returns them as arrays); a `_pad` method
# truncating token-id lists to at most `max_seq_len - 2` and zero-padding to
# `max_seq_len`; a tokenizer smoke test on an English sentence; and the
# opening of `create_model`, which reads the BERT config, maps it to
# bert-for-tf2 params (adapters disabled), builds a BertModelLayer, and
# applies it to the `input_ids` Keras input.
x.append(token_ids) y.append(self.classes.index(label)) return np.array(x), np.array(y) def _pad(self, ids): x = [] for input_ids in ids: input_ids = input_ids[:min(len(input_ids), self.max_seq_len - 2)] input_ids = input_ids + [0] * (self.max_seq_len - len(input_ids)) x.append(np.array(input_ids)) return np.array(x) tokenizer = FullTokenizer(vocab_file=os.path.join(bert_ckpt_dir, "vocab.txt")) print(tokenizer.tokenize("I can't wait to visit Bulgaria again!")) tokens = tokenizer.tokenize("I can't wait to visit Bulgaria again!") print(tokenizer.convert_tokens_to_ids(tokens)) def create_model(max_seq_len, bert_ckpt_file): with tf.io.gfile.GFile(bert_config_file, "r") as reader: bc = StockBertConfig.from_json_string(reader.read()) bert_params = map_stock_config_to_params(bc) bert_params.adapter_size = None bert = BertModelLayer.from_params(bert_params, name="bert") input_ids = keras.layers.Input(shape=(max_seq_len, ), dtype='int32', name="input_ids") bert_output = bert(input_ids)