import numpy as np
import tensorflow as tf
from bert.tokenization.bert_tokenization import FullTokenizer


class IntentDetection:
    def __init__(self):
        self.MAX_SEQ_LEN = 38
        self.modelDir = 'saved_model/1'
        self.vocabDir = 'config/vocab.txt'
        self.classes = ['PlayMusic', 'AddToPlaylist', 'RateBook',
                        'SearchScreeningEvent', 'BookRestaurant',
                        'GetWeather', 'SearchCreativeWork']
        self.tokenizer = FullTokenizer(vocab_file=self.vocabDir)
        print("============load model start=============")
        self.model = self.loadModel()
        print("============load model success=============")

    def loadModel(self):
        return tf.keras.models.load_model(self.modelDir)

    def predict(self, sentence):
        # Tokenize, add the BERT special tokens, and map to vocabulary ids.
        pred_tokens = self.tokenizer.tokenize(sentence)
        pred_tokens = ["[CLS]"] + pred_tokens + ["[SEP]"]
        pred_token_ids = list(self.tokenizer.convert_tokens_to_ids(pred_tokens))
        # Truncate over-long inputs, then zero-pad to the fixed training length.
        pred_token_ids = pred_token_ids[:self.MAX_SEQ_LEN]
        pred_token_ids = pred_token_ids + [0] * (self.MAX_SEQ_LEN - len(pred_token_ids))
        pred_token_ids = np.expand_dims(np.array(pred_token_ids), axis=0)  # batch of one
        predictions = self.model.predict(pred_token_ids).argmax(axis=-1)
        return self.classes[predictions[0]]
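# A minimal usage sketch (hypothetical: it assumes the 'saved_model/1' export
# and 'config/vocab.txt' referenced by the constructor actually exist on disk).
detector = IntentDetection()
print(detector.predict("play the latest album by Queen"))  # e.g. 'PlayMusic'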
import array
import warnings
from typing import List, Sequence, Tuple

from bert.tokenization.bert_tokenization import FullTokenizer


def tokenize_text_pairs_for_bert(text_pairs: List[Tuple[str, str]],
                                 bert_tokenizer: FullTokenizer) \
        -> List[Tuple[Sequence[int], int]]:
    # MAX_SEQ_LENGTH is assumed to be defined at module level.
    res = []
    for left_text, right_text in text_pairs:
        # The left text carries [CLS] and the separating [SEP].
        tokenized_left_text = bert_tokenizer.tokenize(left_text)
        if tokenized_left_text[0] != '[CLS]':
            tokenized_left_text = ['[CLS]'] + tokenized_left_text
        if tokenized_left_text[-1] != '[SEP]':
            tokenized_left_text = tokenized_left_text + ['[SEP]']
        # Strip any special tokens from the right text before concatenating.
        tokenized_right_text = bert_tokenizer.tokenize(right_text)
        if tokenized_right_text[0] == '[CLS]':
            tokenized_right_text = tokenized_right_text[1:]
        if tokenized_right_text[-1] == '[SEP]':
            tokenized_right_text = tokenized_right_text[0:-1]
        tokenized_text = tokenized_left_text + tokenized_right_text
        if len(tokenized_text) > MAX_SEQ_LENGTH:
            warnings.warn(
                "The text pair `{0}` - `{1}` contains too many sub-tokens!".format(
                    left_text, right_text))
            res.append((array.array("l"), 0))
        else:
            token_IDs = bert_tokenizer.convert_tokens_to_ids(tokenized_text)
            res.append((array.array("l", token_IDs), len(tokenized_left_text)))
        del tokenized_left_text, tokenized_right_text, tokenized_text
    return res
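# How the (token_ids, left_length) pairs are consumed is not shown above; this
# is one plausible way to turn them into padded input_ids and token_type_ids
# arrays. The helper name `build_bert_inputs` is hypothetical.
import numpy as np

def build_bert_inputs(pairs, max_seq_length):
    input_ids = np.zeros((len(pairs), max_seq_length), dtype=np.int32)
    token_type_ids = np.zeros((len(pairs), max_seq_length), dtype=np.int32)
    for row, (ids, left_len) in enumerate(pairs):
        input_ids[row, :len(ids)] = ids
        # Segment 0 for the left text (incl. [CLS]/[SEP]), segment 1 for the right.
        token_type_ids[row, left_len:len(ids)] = 1
    return input_ids, token_type_ids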
def test_direct_keras_to_stock_compare(self):
    from tests.ext.modeling import BertModel, BertConfig, get_assignment_map_from_checkpoint

    bert_config = BertConfig.from_json_file(self.bert_config_file)
    tokenizer = FullTokenizer(
        vocab_file=os.path.join(self.bert_ckpt_dir, "vocab.txt"))

    # prepare input
    max_seq_len = 6
    input_str = "Hello, Bert!"
    input_tokens = tokenizer.tokenize(input_str)
    input_tokens = ["[CLS]"] + input_tokens + ["[SEP]"]
    input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
    input_ids = input_ids + [0] * (max_seq_len - len(input_tokens))
    input_mask = [1] * len(input_tokens) + [0] * (max_seq_len - len(input_tokens))
    token_type_ids = [0] * len(input_tokens) + [0] * (max_seq_len - len(input_tokens))

    input_ids = np.array([input_ids], dtype=np.int32)
    input_mask = np.array([input_mask], dtype=np.int32)
    token_type_ids = np.array([token_type_ids], dtype=np.int32)

    print("   tokens:", input_tokens)
    print("input_ids:{}/{}:{}".format(len(input_tokens), max_seq_len, input_ids),
          input_ids.shape, token_type_ids)

    s_res = self.predict_on_stock_model(input_ids, input_mask, token_type_ids)
    k_res = self.predict_on_keras_model(input_ids, input_mask, token_type_ids)

    np.set_printoptions(precision=9, threshold=20, linewidth=200,
                        sign="+", floatmode="fixed")

    print("s_res", s_res.shape)
    print("k_res", k_res.shape)
    print("s_res:\n {}".format(s_res[0, :2, :10]), s_res.dtype)
    print("k_res:\n {}".format(k_res[0, :2, :10]), k_res.dtype)

    adiff = np.abs(s_res - k_res).flatten()
    print("diff:", np.max(adiff), np.argmax(adiff))
    self.assertTrue(np.allclose(s_res, k_res, atol=1e-6))
dataset = dataset.shuffle(buffer_size=100)
dataset = dataset.apply(
    tf.data.experimental.map_and_batch(
        lambda record: _decode_record(record, name_to_features),
        batch_size=BATCH_SIZE,
        drop_remainder=drop_remainder))
return dataset


tokenizer = FullTokenizer(vocab_file=os.path.join(bert_ckpt_dir, "vocab.txt"))

tokens = tokenizer.tokenize("I can't wait to visit Bulgaria again!")
tokenizer.convert_tokens_to_ids(tokens)


def flatten_layers(root_layer):
    # Recursively yield a layer together with all of its sub-layers.
    if isinstance(root_layer, keras.layers.Layer):
        yield root_layer
    for layer in root_layer._layers:
        for sub_layer in flatten_layers(layer):
            yield sub_layer


def freeze_bert_layers(l_bert):
    """
    Freezes all but LayerNorm and adapter layers - see arXiv:1902.00751.
    """
    for layer in flatten_layers(l_bert):
        # As in bert-for-tf2's freeze_bert_layers: only LayerNorm and the
        # adapter sub-layers stay trainable.
        if layer.name in ["LayerNorm", "adapter-down", "adapter-up"]:
            layer.trainable = True
        elif len(layer._layers) == 0:
            layer.trainable = False
        l_bert.embeddings_layer.trainable = False
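# `_decode_record` is referenced above but not defined in this excerpt. A
# minimal sketch in the style of the original BERT run_classifier.py, assuming
# `name_to_features` maps feature names to fixed-length tf.io.FixedLenFeature
# specs:
def _decode_record(record, name_to_features):
    # Parse one serialized tf.Example and cast int64 features to int32,
    # since tf.Example only stores int64 but the model expects int32.
    example = tf.io.parse_single_example(record, name_to_features)
    for name in list(example.keys()):
        t = example[name]
        if t.dtype == tf.int64:
            t = tf.cast(t, tf.int32)
        example[name] = t
    return example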
import os

import numpy as np
import pandas as pd
from tqdm import tqdm
from tensorflow.keras.models import load_model
from bert import BertModelLayer
from bert.tokenization.bert_tokenization import FullTokenizer

tokenizer = FullTokenizer(vocab_file=os.path.join(bert_ckpt_dir, "vocab.txt"))

print('Loading the model')
model = load_model('saved_models/tensorflow_10000.h5',
                   custom_objects={'BertModelLayer': BertModelLayer})
print('Model is loaded')
print(model.summary())

test = pd.read_csv('articles-validation-bypublisher.csv')
conlist = test['content'].tolist()
idlist = test['id'].tolist()
print('conlist', len(conlist), 'idlist', len(idlist))

resList = []
for id, content in tqdm(zip(idlist, conlist)):
    pred_tokens = tokenizer.tokenize(content)
    pred_tokens = ["[CLS]"] + pred_tokens + ["[SEP]"]
    pred_token_ids = list(tokenizer.convert_tokens_to_ids(pred_tokens))
    # Truncate or zero-pad to the model's fixed input length of 512.
    if len(pred_token_ids) >= 512:
        pred_token_ids = pred_token_ids[:512]
    else:
        pred_token_ids = pred_token_ids + [0] * (512 - len(pred_token_ids))
    pred_token_ids = np.array([pred_token_ids])
    predictions = model.predict(pred_token_ids).argmax(axis=-1)
    pred = 'true' if predictions[0] else 'false'
    resList.append(str(id) + ' ' + pred)

print('Writing into the file')
ResultFile = open('result.txt', 'w')
ResultFile.write('\n'.join(resList))  # one "<id> <label>" line per article
ResultFile.close()
def _pad(self, ids):
    x = []
    for input_ids in ids:
        # Truncate to leave room for [CLS]/[SEP], then zero-pad.
        input_ids = input_ids[:min(len(input_ids), self.max_seq_len - 2)]
        input_ids = input_ids + [0] * (self.max_seq_len - len(input_ids))
        x.append(np.array(input_ids))
    return np.array(x)


tokenizer = FullTokenizer(vocab_file=os.path.join(bert_ckpt_dir, 'vocab.txt'))

t = tokenizer.tokenize('ಶುಭ ದಿನ')
print(t)
ds = tokenizer.convert_tokens_to_ids(t)
print(ds)


def create_model(max_seq_len, bert_ckpt_file):
    with tf.io.gfile.GFile(bert_config_file, 'r') as reader:
        bc = StockBertConfig.from_json_string(reader.read())
        bert_params = map_stock_config_to_params(bc)
        bert_params.adapter_size = None
        bert = BertModelLayer.from_params(bert_params, name='bert')

    input_ids = keras.layers.Input(shape=(max_seq_len,), dtype='int32',
                                   name='input_ids')
    print("--------------->", input_ids)
    bert_output = bert(input_ids)
    print('----->bert shape', bert_output.shape)
    return np.array(x), np.array(y)

def _pad(self, ids):
    x = []
    for input_ids in ids:
        input_ids = input_ids[:min(len(input_ids), self.max_seq_len - 2)]
        input_ids = input_ids + [0] * (self.max_seq_len - len(input_ids))
        x.append(np.array(input_ids))
    return np.array(x)


tokenizer = FullTokenizer(vocab_file=os.path.join(bert_ckpt_dir, "vocab.txt"))

print(tokenizer.tokenize("I can't wait to visit Bulgaria again!"))
tokens = tokenizer.tokenize("I can't wait to visit Bulgaria again!")
print(tokenizer.convert_tokens_to_ids(tokens))


def create_model(max_seq_len, bert_ckpt_file):
    with tf.io.gfile.GFile(bert_config_file, "r") as reader:
        bc = StockBertConfig.from_json_string(reader.read())
        bert_params = map_stock_config_to_params(bc)
        bert_params.adapter_size = None
        bert = BertModelLayer.from_params(bert_params, name="bert")

    input_ids = keras.layers.Input(shape=(max_seq_len,), dtype='int32',
                                   name="input_ids")
    bert_output = bert(input_ids)
    print("bert shape", bert_output.shape)

    cls_out = keras.layers.Lambda(lambda seq: seq[:, 0, :])(bert_output)
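    # The original snippet breaks off here. A plausible completion in the
    # common bert-for-tf2 fine-tuning style; `classes` and the dropout/dense
    # sizes are assumptions, not the original code.
    logits = keras.layers.Dropout(0.5)(cls_out)
    logits = keras.layers.Dense(units=768, activation="tanh")(logits)
    logits = keras.layers.Dropout(0.5)(logits)
    logits = keras.layers.Dense(units=len(classes), activation="softmax")(logits)

    model = keras.Model(inputs=input_ids, outputs=logits)
    model.build(input_shape=(None, max_seq_len))

    # Load the pre-trained weights into the BERT layer from the checkpoint.
    load_stock_weights(bert, bert_ckpt_file)
    return model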