def parse_line(self, line, max_seq_len=512):
    """Parse one line into token_ids, sentence_ids, pos_ids and label."""
    line = line.strip().split(",")
    assert len(line) == 3, "One sample must have %d fields!" % 3
    text_left, text_right, masklabel = line
    tokenizer = FullTokenizer(self.vocab_path)
    text_left = tokenizer.tokenize(text_left)
    masklabel = tokenizer.tokenize(masklabel)
    masklabel_ = len(masklabel) * ["[MASK]"]
    text_right = tokenizer.tokenize(text_right)
    all_tokens = text_left + masklabel_ + text_right
    token_ids = tokenizer.convert_tokens_to_ids(all_tokens)
    sent_ids = [0] * len(all_tokens)
    pos_ids = [i for i in range(len(all_tokens))]
    input_mask = [1.0] * len(all_tokens)
    # Record the positions of the [MASK] tokens.
    mask_pos = []
    for idx, token_id in enumerate(token_ids):
        if token_id == self.mask_id:
            mask_pos.append(idx)
    # Ids of the original (unmasked) label tokens.
    mask_label = list(tokenizer.convert_tokens_to_ids(masklabel))
    assert len(token_ids) == len(sent_ids) == len(pos_ids) == len(input_mask), \
        "[Must be true] len(token_ids) == len(sent_ids) == len(pos_ids) == len(input_mask)"
    if len(token_ids) > max_seq_len:
        return None
    return [token_ids, sent_ids, pos_ids, input_mask, mask_pos, mask_label]
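# A minimal driver for parse_line, shown as a sketch: the MaskedLMReader holder class,
# the vocab path, the mask id, and the sample text below are illustrative assumptions,
# not part of the original snippet. parse_line only needs an object exposing vocab_path
# (path to the BERT vocab.txt) and mask_id (the id of "[MASK]" in that vocab).
class MaskedLMReader:
    def __init__(self, vocab_path, mask_id):
        self.vocab_path = vocab_path
        self.mask_id = mask_id  # 103 in the standard English BERT vocab


reader = MaskedLMReader(vocab_path="vocab.txt", mask_id=103)
# One sample has the form "text_left,text_right,masklabel".
sample = "the capital of france is,.,paris"
features = parse_line(reader, sample, max_seq_len=512)
if features is not None:
    token_ids, sent_ids, pos_ids, input_mask, mask_pos, mask_label = features
    print(mask_pos)    # positions of the [MASK] placeholders
    print(mask_label)  # ids of the tokens that were masked out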
def preprocess(data):
    tokenizer = FullTokenizer(vocab_file)
    tok_ip = np.zeros((len(data), 128), dtype="int32")
    sent_ip = np.zeros((len(data), 128), dtype="int8")
    pos_ip = np.zeros((len(data), 128), dtype="int8")
    masks = np.zeros((len(data), 128), dtype="int8")
    for pos, text in tqdm.tqdm_notebook(enumerate(data)):
        tok0 = tokenizer.tokenize(text[0])
        tok1 = tokenizer.tokenize(text[1])
        tok = tok0 + tok1
        if len(tok) > 128:
            tok = tok[:127] + ["[SEP]"]
        pad_len = 128 - len(tok)
        tok_len = len(tok)
        tok0_len = len(tok0)
        tok = tokenizer.convert_tokens_to_ids(tok) + [0] * pad_len
        pos_val = range(128)
        sent = [0] * tok0_len + [1] * (tok_len - tok0_len) + [0] * pad_len
        mask = [1] * tok_len + [0] * pad_len
        tok_ip[pos] = tok
        sent_ip[pos] = sent  # store the segment ids for this example
        pos_ip[pos] = pos_val
        masks[pos] = mask
    # Expand to a broadcastable attention-mask shape: (batch, 1, 1, seq_len).
    masks = masks[:, None, None, :]
    return tok_ip, sent_ip, pos_ip, masks
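# A small usage sketch for preprocess(): the text[0]/text[1] indexing implies each
# element of data is a (text_a, text_b) pair, and vocab_file is assumed to be a
# module-level path to a BERT vocab. The sample pairs below are illustrative.
vocab_file = "vocab.txt"

data = [
    ("the movie was great", "would watch it again"),
    ("the plot was thin", "but the acting was solid"),
]
tok_ip, sent_ip, pos_ip, masks = preprocess(data)
print(tok_ip.shape)  # (2, 128): zero-padded token ids
print(masks.shape)   # (2, 1, 1, 128): attention mask broadcastable over heads and query positions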
def parse_sequence(tokenizer: t10n.FullTokenizer, sequence: str) -> SequenceParseResult:
    tokens = tokenizer.tokenize(sequence)
    tokens.insert(0, '[CLS]')
    tokens.append('[SEP]')

    # Could be 0 or 1, not sure which index is *supposed* to represent a first segment
    token_type_ids = [0] * len(tokens)
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    attention_mask = [1] * len(tokens)
    attention_mask[0] = 0

    # Default for our model
    max_seq_length = 128

    # Pad arrays
    while len(input_ids) < max_seq_length:
        # Not sure if padding belongs to the sequence or not
        token_type_ids.append(0)
        # Zero is the [PAD]-token for the BERT-vocab
        input_ids.append(0)
        # We probably should exclude the sequence padding from the attention-mask
        attention_mask.append(0)

    return SequenceParseResult(
        tokens=tokens,
        token_type_ids=token_type_ids,
        attention_mask=attention_mask,
        input_ids=input_ids,
    )
def convert_single_example(ex_index, example: InputExample, tag_list: list,
                           label_list: list, max_seq_length,
                           tokenizer: tokenization.FullTokenizer):
    query = tokenizer.tokenize(example.text)
    if len(query) > max_seq_length - 2:
        query = query[0:(max_seq_length - 2)]

    tokens = ["[CLS]"]
    tags = ["[CLS]"]
    for idx, token in enumerate(query):
        tokens.append(token)
        tags.append(example.tag[idx])
    tokens.append("[SEP]")
    tags.append("[SEP]")
    segment_ids = [0] * len(tokens)

    tag_map = {}
    for idx, tag in enumerate(tag_list):
        tag_map[tag] = idx
    label_map = {}
    for idx, label in enumerate(label_list):
        label_map[label] = idx
    tag_ids = [tag_map[tag] for tag in tags]
    label_id = label_map[example.label]

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
        tag_ids.append(0)

    if ex_index < 5:
        logger.info("*** Example ***")
        logger.info("guid: %s" % example.guid)
        logger.info("tokens: %s" % " ".join(
            [tokenization.printable_text(x) for x in tokens]))
        logger.info("tag: %s" % " ".join(tags))
        logger.info("label: %s" % example.label)

    feature = InputFeature(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        tag_ids=tag_ids,
        label_id=label_id
    )
    return feature
class BERTTextEncoder(TextEncoder):
    def __init__(self, vocab_file: str, do_lower_case: bool = True) -> None:
        self.tokenizer = FullTokenizer(vocab_file, do_lower_case)
        super().__init__(len(self.tokenizer.vocab))
        self.bert_unk_id = self.tokenizer.vocab['[UNK]']
        self.bert_msk_id = self.tokenizer.vocab['[MASK]']

    def standardize_ids(self, ids: List[int]) -> List[int]:
        for i in range(len(ids)):
            if ids[i] == self.bert_unk_id:  # UNK
                ids[i] = 0
            else:  # VOCAB
                ids[i] -= self.bert_msk_id
        return ids

    def encode(self, sent: str) -> List[int]:
        return self.standardize_ids(
            self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(sent)))
# Build the tokenizer from the TF Hub layer's assets (vocab file and casing flag).
bert_tokenizer = FullTokenizer(
    vocab_file=bert_layer.resolved_object.vocab_file.asset_path.numpy(),
    do_lower_case=bert_layer.resolved_object.do_lower_case.numpy())

# TODO:
# Documents longer than 512 tokens cannot be encoded by BERT,
# since its positional encoding has a hard limit of 512 positions.
# For better results we may need to summarize the document into <= 512 tokens,
# or encode sentence by sentence and then pool the results.
maxlen = 256

# TODO:
# We need to manually handle the [CLS] and [SEP] special tokens for the sentence
# beginning and ending.

# Encode text with padding, masking, and segmentation
# (required by BERT even if we don't use the segment ids).
tok_seq_train = [bert_tokenizer.tokenize(text) for text in imdb_reviews_train]
wid_seq_train = [
    bert_tokenizer.convert_tokens_to_ids(toks)[:maxlen] for toks in tok_seq_train
]
wid_seq_train_padded = pad_sequences(wid_seq_train, padding="post", maxlen=maxlen)
wid_seq_train_mask = (wid_seq_train_padded > 0).astype(int)
segment_ids_train = np.zeros_like(wid_seq_train_mask)

tok_seq_test = [bert_tokenizer.tokenize(text) for text in imdb_reviews_test]
wid_seq_test = [
    bert_tokenizer.convert_tokens_to_ids(toks)[:maxlen] for toks in tok_seq_test
]
wid_seq_test_padded = pad_sequences(wid_seq_test, padding="post", maxlen=maxlen)
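# A sketch of the [CLS]/[SEP] handling mentioned in the TODO above, assuming the same
# bert_tokenizer and maxlen. The helper name encode_with_special_tokens is illustrative.
def encode_with_special_tokens(text, tokenizer, maxlen=256):
    # Reserve two positions for the special tokens, then convert to ids.
    tokens = tokenizer.tokenize(text)[:maxlen - 2]
    tokens = ["[CLS]"] + tokens + ["[SEP]"]
    return tokenizer.convert_tokens_to_ids(tokens)

# Possible drop-in replacement for the comprehensions above:
# wid_seq_train = [encode_with_special_tokens(t, bert_tokenizer, maxlen) for t in imdb_reviews_train]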
class BertInference(object):
    """The bert model."""

    def __init__(self, bert_meta):
        self.graph = self._load_graph(bert_meta.model_file)
        self.tokenizer = FullTokenizer(vocab_file=bert_meta.vocab_file,
                                       do_lower_case=True)
        self.max_seq_length = 128

        # Input.
        self.input_ids = self.graph.get_tensor_by_name('infer/input_ids:0')
        self.word_ids = self.graph.get_tensor_by_name('infer/input_mask:0')
        self.segment_ids = self.graph.get_tensor_by_name('infer/segment_ids:0')
        # Output.
        self.predictions = self.graph.get_tensor_by_name('infer/loss/Softmax:0')

        self.sess = tf.Session(graph=self.graph)
        # Warm up the session with a dummy query.
        self.inference(BertInputPackage(u'预热一下'))

    def inference(self, bert_input):
        """Call model."""
        input_ids, input_mask, segment_ids = self._convert_single_example(
            bert_input.query)
        preds_evaluated = self.sess.run(
            self.predictions,
            feed_dict={
                self.input_ids: [input_ids],
                self.word_ids: [input_mask],
                self.segment_ids: [segment_ids]
            })
        return preds_evaluated

    def _load_graph(self, frozen_graph_filename):
        with tf.gfile.GFile(frozen_graph_filename, "rb") as f:
            graph_def = tf.GraphDef()
            graph_def.ParseFromString(f.read())
        with tf.Graph().as_default() as graph:
            tf.import_graph_def(graph_def,
                                input_map=None,
                                return_elements=None,
                                name="infer",
                                op_dict=None,
                                producer_op_list=None)
        return graph

    def _convert_single_example(self, text_a):
        tokens_a = self.tokenizer.tokenize(text_a)
        if len(tokens_a) > self.max_seq_length - 2:
            tokens_a = tokens_a[0:(self.max_seq_length - 2)]

        tokens = []
        segment_ids = []
        tokens.append("[CLS]")
        segment_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            segment_ids.append(0)
        tokens.append("[SEP]")
        segment_ids.append(0)

        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)
        while len(input_ids) < self.max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)
        return input_ids, input_mask, segment_ids
def main():
    # pd_all = pd.read_csv(os.path.join(path, "weibo_senti_100k.csv"))
    # pd_all = shuffle(pd_all)
    # x_data, y_data = pd_all.review.values, pd_all.label.values
    # x_data = [tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text)) for text in x_data]
    # x_train, x_test, y_train, y_test = train_test_split(np.array(x_data), y_data, test_size=0.2)
    # (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
    tokenizer = FullTokenizer("vocab.txt")

    print('Loading data...')
    # Load the training data
    train_data = pd.read_csv(os.path.join(path, "train.csv"))
    # Load the validation data
    dev_data = pd.read_csv(os.path.join(path, "dev.csv"))
    # Load the test data
    test_data = pd.read_csv(os.path.join(path, "test.csv"))
    x_train, y_train = train_data.review.values, train_data.label.values
    x_dev, y_dev = dev_data.review.values, dev_data.label.values
    x_test, y_test = test_data.review.values, test_data.label.values

    # Tokenize to ids
    x_train = [
        tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
        for text in x_train
    ]
    x_dev = [
        tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
        for text in x_dev
    ]
    x_test = [
        tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
        for text in x_test
    ]

    max_features = 21128
    # cut texts after this number of words (among top max_features most common words)
    maxlen = 128
    batch_size = 32

    print(len(x_train), 'train sequences')
    print(len(x_dev), 'dev sequences')
    print(len(x_test), 'test sequences')

    print('Pad sequences (samples x time)')
    x_train = keras.preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
    x_dev = keras.preprocessing.sequence.pad_sequences(x_dev, maxlen=maxlen)
    x_test = keras.preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)
    print('x_train shape:', x_train.shape)
    print('x_dev shape:', x_dev.shape)
    print('x_test shape:', x_test.shape)

    print('Build model...')
    model = keras.models.Sequential()
    model.add(keras.layers.Embedding(max_features, 200))
    model.add(keras.layers.LSTM(300, dropout=0.2, recurrent_dropout=0.2))
    model.add(keras.layers.Dense(1, activation='sigmoid'))

    # Try using different optimizers and different optimizer configs.
    # Metrics option 1: built-in Keras metric names or custom metric functions, e.g.
    # model.compile(loss='binary_crossentropy', optimizer='adam',
    #               metrics=['accuracy', metric_precision, metric_recall, metric_F1score])
    # Metrics option 2: metric class instances, available in tensorflow.keras.metrics.
    metrics = keras.metrics
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy', metrics.Precision(), metrics.Recall()])

    print('Train...')
    model.fit(x_train,
              y_train,
              batch_size=batch_size,
              epochs=15,
              verbose=1,
              validation_data=(x_dev, y_dev))

    # Way 1 to measure test accuracy: predict, then count correct labels by hand.
    y_predicts = model.predict(x_test, batch_size=batch_size, verbose=1)
    # print('y_predicts.shape:', y_predicts.shape)
    print('y_predicts:', y_predicts)
    # If the predictions have more than one column, take the argmax column as the
    # predicted class; otherwise threshold the single sigmoid output at 0.5.
    if y_predicts.shape[-1] > 1:
        print('if true')
        y_predicts = y_predicts.argmax(axis=-1).tolist()
    else:
        print('if false')
        y_predicts = (y_predicts > 0.5).astype('int32').tolist()
    right_num = 0
    total = len(y_test)
    for i in range(total):
        if y_predicts[i][0] == y_test[i]:
            right_num += 1
    result = 'Test accuracy:%.2f' % (right_num * 100 / total)

    # Way 2: evaluate the test set directly with the model's evaluate() method.
    evaluate = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)
    result += '\n=========================================\n' + \
        'loss,accuracy,precision,recall,f1-score:' + str(evaluate)

    # Way 3: use scikit-learn's classification_report to compute precision/recall/F1.
    y_predict = model.predict_classes(x_test, batch_size=batch_size, verbose=1)
    report = classification_report(y_test, y_predict, digits=4)
    result += '\n=========================================\n' + report
    print(result)
    with open(output_path + 'train_lstm_result.txt', 'w', encoding='utf-8') as f:
        f.write(result)

    # Save the trained model.
    model.save(output_path + 'weibo_lstm_model.h5')
    print('Model saved successfully')
class ApiModel:
    def __init__(self):
        self.THRESHOLD = 0.1
        self.PROB_THRESHOLD = 0.8
        self.LABELS_32 = [
            "sentimental", "afraid", "proud", "faithful", "terrified", "joyful",
            "angry", "sad", "jealous", "grateful", "prepared", "embarrassed",
            "excited", "annoyed", "lonely", "ashamed", "guilty", "surprised",
            "nostalgic", "confident", "furious", "disappointed", "caring",
            "trusting", "disgusted", "anticipating", "anxious", "hopeful",
            "content", "impressed", "apprehensive", "devastated"
        ]
        self.MAX_SEQ_LENGTH = 50
        self.tokenizer = FullTokenizer(vocab_file='vocab.txt', do_lower_case=True)
        self.model = load_model('model_data/model32')
        self.matrix = np.genfromtxt('emotion_multiplier.csv')
        self.map_probabilities = np.vectorize(
            lambda x: 1 if x >= self.PROB_THRESHOLD else 0)

    def predict(self, text: str):
        input_ids, input_mask, segment_ids, label_ids = self._convert_single_example(text)
        features: str = self._serialize_features(input_ids, input_mask,
                                                 segment_ids, label_ids)
        probabilities = self.model({'examples': [features]})["probabilities"][0]
        # excluded_emotions = ['nostalgic', 'sentimental', 'prepared', 'anticipating']
        # emotions = [k for k, v in zip(self.LABELS_32, probabilities)
        #             if (v > self.PROB_THRESHOLD) and (k not in excluded_emotions)]
        # recheck
        # if len(emotions) == 0:
        #     emotions = ['neutral']
        animations = list(np.matmul(self.matrix, self.map_probabilities(probabilities)))
        top_probabilities = [(k, v) for k, v in zip(self.LABELS_32, probabilities)
                             if v >= self.THRESHOLD]
        top_emotions = dict(sorted(top_probabilities, key=lambda x: -x[1]))
        return {'emotions': top_emotions, 'animations': animations}

    def _convert_single_example(self, text):
        """Modified from goemotions/bert_classifier.py"""
        tokens = self.tokenizer.tokenize(text)
        if len(tokens) > self.MAX_SEQ_LENGTH - 2:
            tokens = tokens[0:(self.MAX_SEQ_LENGTH - 2)]
        tokens = ["[CLS]"] + tokens + ["[SEP]"]
        segment_ids = [0] * len(tokens)
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)
        while len(input_ids) < self.MAX_SEQ_LENGTH:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)
        return input_ids, input_mask, segment_ids, [0] * len(self.LABELS_32)

    def _serialize_features(self, input_ids, input_mask, segment_ids, label_ids):
        features = {
            "input_ids": self._create_int_feature(input_ids),
            "input_mask": self._create_int_feature(input_mask),
            "segment_ids": self._create_int_feature(segment_ids),
            "label_ids": self._create_int_feature(label_ids)
        }
        tf_example = Example(features=Features(feature=features))
        return tf_example.SerializeToString()

    def _create_int_feature(self, values):
        return Feature(int64_list=Int64List(value=list(values)))
class BERTFunction(object):
    def __init__(self, bert_config_file, init_checkpoint, max_seq_length,
                 vocab_file, num_labels, use_gpu=False):
        # Needed to load the pre-trained parameters.
        self.bert_config = modeling.BertConfig.from_json_file(bert_config_file)
        self.init_checkpoint = init_checkpoint
        # Needed for the dataset and the computation. The position embedding has a
        # maximum length of 512, which is fixed in the graph; the actual sequence
        # length used for computation can be chosen freely below that.
        self.max_seq_length = max_seq_length
        self.num_labels = num_labels
        # Needed for data preprocessing.
        self.vocab_file = vocab_file
        self.tokenizer = FullTokenizer(self.vocab_file, do_lower_case=False)  # cased model by default
        # GPU
        self.use_gpu = use_gpu

        self.graph = tf.Graph()  # declare the computation graph
        with self.graph.as_default():
            # Define placeholders.
            self.input_ids = tf.placeholder(dtype=tf.int64,
                                            shape=(None, self.max_seq_length))
            self.input_mask = tf.placeholder(dtype=tf.int64,
                                             shape=(None, self.max_seq_length))
            self.segment_ids = tf.placeholder(dtype=tf.int64,
                                              shape=(None, self.max_seq_length))
            # Define the computation.
            (self.logits, self.probabilities) = create_predict_model(
                self.bert_config, self.input_ids, self.input_mask,
                self.segment_ids, self.num_labels)
            # Load the pre-trained parameters.
            self.tvars = tf.trainable_variables()  # trainable variables exist once the graph is built
            self.initialized_variable_names = {}
            if self.init_checkpoint:
                # init_checkpoint is the pre-trained BERT (or previously fine-tuned) .ckpt
                # file passed on the command line. get_assignment_map_from_checkpoint picks
                # up the variables shared between the checkpoint and the task graph.
                (self.assignment_map, self.initialized_variable_names
                 ) = modeling.get_assignment_map_from_checkpoint(
                     self.tvars, self.init_checkpoint)
                tf.train.init_from_checkpoint(
                    self.init_checkpoint,
                    self.assignment_map)  # must be inside the graph, but when exactly does it run?
            init = tf.global_variables_initializer()  # in theory unnecessary if the model is fully restored

        # Create a session that lives as long as this object, keeps the restored
        # parameters, and is reused for every new input.
        config = None
        if self.use_gpu:
            config = tf.ConfigProto(log_device_placement=True)
            config.gpu_options.allow_growth = True
            print("trying to use gpu")
        else:
            print("using cpu")
        self.sess = tf.Session(graph=self.graph, config=config)
        self.sess.run(init)

    def print_tvars_names(self):
        print(self.tvars)

    def print_tvar_value(self, i):
        # Print the value of the i-th variable in the trainable_variables list.
        try:
            print(self.sess.run(self.tvars[i]))
        except:
            print("can't get it, maybe the index is out of range.")

    def cal(self, features):
        feed_dict = {
            self.input_ids: features["input_ids"],
            self.input_mask: features["input_mask"],
            self.segment_ids: features["segment_ids"]
        }
        prob = self.sess.run(self.probabilities, feed_dict=feed_dict)
        print("prob: \n", prob)
        return prob

    def process_batch_input(self, text_as, text_bs):
        # A trimmed-down version of convert_single_example.
        input_idss = []
        input_masks = []
        segment_idss = []
        for text_a, text_b in zip(text_as, text_bs):
            print(text_a)
            print(text_b)
            tokens_a = self.tokenizer.tokenize(text_a)
            tokens_b = self.tokenizer.tokenize(text_b)
            # Equivalent of _truncate_seq_pair: leave room for [CLS] and two [SEP].
            while True:
                total_length = len(tokens_a) + len(tokens_b)
                if total_length <= self.max_seq_length - 3:
                    break
                if len(tokens_a) > len(tokens_b):
                    tokens_a.pop()
                else:
                    tokens_b.pop()
            # Simplified convert_single_example.
            tokens = []
            segment_ids = []
            tokens.append("[CLS]")
            segment_ids.append(0)
            for token in tokens_a:
                tokens.append(token)
                segment_ids.append(0)
            tokens.append("[SEP]")
            segment_ids.append(0)
            for token in tokens_b:
                tokens.append(token)
                segment_ids.append(1)
            tokens.append("[SEP]")
            segment_ids.append(1)
            input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
            input_mask = [1] * len(input_ids)
            while len(input_ids) < self.max_seq_length:
                input_ids.append(0)
                input_mask.append(0)
                segment_ids.append(0)
            assert len(input_ids) == self.max_seq_length
            assert len(input_mask) == self.max_seq_length
            assert len(segment_ids) == self.max_seq_length
            print("tokens: %s" % " ".join([printable_text(x) for x in tokens]))
            input_idss.append(input_ids)
            input_masks.append(input_mask)
            segment_idss.append(segment_ids)
            # print("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            # print("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            # print("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        return {
            "input_ids": nparray(input_idss, dtype=npint64),
            "input_mask": nparray(input_masks, dtype=npint64),
            "segment_ids": nparray(segment_idss, dtype=npint64)
        }

    def batch_cal(self, text_as, text_bs):
        features = self.process_batch_input(text_as, text_bs)
        return self.cal(features)

    def process_context_list_and_candidates(self, context_list, candidates):
        # Split max_seq_length in half between text_a and text_b to keep them balanced;
        # if text_b (the candidate reply) is short, give the remaining space to
        # text_a (the context).
        input_idss = []
        input_masks = []
        segment_idss = []
        for cdd in candidates:
            t_c = self.tokenizer.tokenize(cdd)
            length = len(t_c) + 2  # '[CLS]', '[SEP]'
            t_us = []
            tokens = []
            for utterance in context_list[-1::-1]:
                t_u = self.tokenizer.tokenize(utterance)
                length += len(t_u) + 1
                while length > self.max_seq_length:
                    if len(t_c) + 1 > self.max_seq_length / 2:
                        t_c.pop()
                        length -= 1
                    else:
                        t_u.pop()
                        length -= 1
                t_u.append('[SEP]')
                t_us = t_u + t_us
                if length == self.max_seq_length and len(t_c) + 1 <= self.max_seq_length / 2:
                    break
            tokens.append('[CLS]')
            tokens.extend(t_us)
            tokens.extend(t_c)
            tokens.append('[SEP]')
            input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
            input_mask = [1] * len(input_ids)
            segment_ids = [1] * (len(t_us) + 1) + ([0] * (len(t_c) + 1))
            while len(input_ids) < self.max_seq_length:
                input_ids.append(0)
                input_mask.append(0)
                segment_ids.append(0)
            assert len(input_ids) == self.max_seq_length
            assert len(input_mask) == self.max_seq_length
            assert len(segment_ids) == self.max_seq_length
            print("tokens: %s" % " ".join([printable_text(x) for x in tokens]))
            print("length:" + str(len(tokens)))
            input_idss.append(input_ids)
            input_masks.append(input_mask)
            segment_idss.append(segment_ids)
        return {
            "input_ids": nparray(input_idss, dtype=npint64),
            "input_mask": nparray(input_masks, dtype=npint64),
            "segment_ids": nparray(segment_idss, dtype=npint64)
        }

    def context_list_and_candidates_cal(self, context_list, candidates):
        features = self.process_context_list_and_candidates(context_list, candidates)
        return self.cal(features)


# Usage example
# NOTE: max_seq_length fixes the tensor shapes flowing through this graph; by the nature
# of the transformer itself, the sequence length could be handled more flexibly.
# import os
# pretrained_dir = "./pretrained/multi_cased_L-12_H-768_A-12/"
# init_checkpoint = os.path.join(pretrained_dir, "./bert_model.ckpt")
# bert_config_file = os.path.join(pretrained_dir, "./bert_config.json")
# vocab_file = os.path.join(pretrained_dir, "./vocab.txt")
# max_seq_length = 160
# num_labels = 2
#
# func = BERTFunction(bert_config_file, init_checkpoint, max_seq_length, vocab_file, num_labels)
# res = func.batch_cal(["I'm gogo, who are you?"], ["I'm Trump, I'm fine."])
def create_pretraining_data_from_docs(docs,
                                      save_path,
                                      vocab_path,
                                      token_method='wordpiece',
                                      language='en',
                                      max_seq_length=128,
                                      dupe_factor=10,
                                      short_seq_prob=0.1,
                                      masked_lm_prob=0.15,
                                      max_predictions_per_seq=20):
    """Create pretraining data from docs (a sequence of sequences of sentences).

    Args:
        docs: Sequence of sequences. `docs` is a sequence of documents; a document
            is a sequence of sentences.
        save_path: Path to save the pretraining data.
        vocab_path: The vocabulary file that the BERT model was trained on.
            Only used when token_method='wordpiece'.
        token_method: string. 'wordpiece' or 'spacy'.
        language: string. 'en' or 'chn'.
        max_seq_length: integer. Maximum sequence length.
        dupe_factor: integer. Number of times to duplicate the input data
            (with different masks).
        short_seq_prob: float. Probability of creating sequences which are shorter
            than the maximum length.
        masked_lm_prob: float. Masked LM probability.
        max_predictions_per_seq: integer. Maximum number of masked LM predictions
            per sequence.
    """
    if not hasattr(docs, '__len__'):
        raise ValueError("`docs` should be a sequence of sequences.")
    else:
        if not hasattr(docs[0], '__len__'):
            raise ValueError("`docs` should be a sequence of sequences.")
    if token_method not in ['wordpiece', 'spacy']:
        raise ValueError("`token_method` must be one of `wordpiece` and `spacy`.")
    if language not in ['en', 'chn']:
        raise ValueError("`language` should be one of `en` and `chn`.")
    if token_method == "spacy" and language == "chn":
        raise ValueError("The spacy tokenizer is only available when `language` is `en`.")

    if token_method == "wordpiece":
        tokenizer = FullTokenizer(vocab_path, do_lower_case=True)
    else:
        tokenizer = SpacyTokenizer(vocab_path, do_lower_case=True)

    instances = create_training_instances(
        docs,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        dupe_factor=dupe_factor,
        short_seq_prob=short_seq_prob,
        masked_lm_prob=masked_lm_prob,
        max_predictions_per_seq=max_predictions_per_seq)

    pretraining_data = dict(tokens=[],
                            segment_ids=[],
                            is_random_next=[],
                            masked_lm_positions=[],
                            masked_lm_labels=[])
    for i, instance in enumerate(instances):
        if i < 10:
            print("num-{}: {}".format(i, instance))
        pretraining_data['tokens'].append(instance.tokens)
        pretraining_data['segment_ids'].append(instance.segment_ids)
        pretraining_data['is_random_next'].append(int(instance.is_random_next))
        pretraining_data['masked_lm_positions'].append(instance.masked_lm_positions)
        pretraining_data['masked_lm_labels'].append(instance.masked_lm_labels)

    tokens_ids = []
    tokens_mask = []
    for tokens in pretraining_data['tokens']:
        sub_ids = tokenizer.convert_tokens_to_ids(tokens)
        sub_mask = [1] * len(sub_ids)
        tokens_ids.append(sub_ids)
        tokens_mask.append(sub_mask)

    masked_lm_ids = []
    for mask_labels in pretraining_data['masked_lm_labels']:
        sub_masked_lm_ids = tokenizer.convert_tokens_to_ids(mask_labels)
        masked_lm_ids.append(sub_masked_lm_ids)

    # Inputs.
    tokens_ids = pad_sequences(tokens_ids, maxlen=128, padding='post', truncating='post')
    tokens_mask = pad_sequences(tokens_mask, maxlen=128, padding='post', truncating='post')
    segment_ids = pad_sequences(pretraining_data['segment_ids'], maxlen=128,
                                padding='post', truncating='post')
    masked_lm_positions = pad_sequences(pretraining_data['masked_lm_positions'],
                                        maxlen=20, padding='post', truncating='post')
    # Labels.
    is_random_next = to_categorical(pretraining_data['is_random_next'], num_classes=2)
    masked_lm_labels = pad_sequences(masked_lm_ids, maxlen=20,
                                     padding='post', truncating='post')

    # Save.
    np.savez(file=save_path,
             tokens_ids=tokens_ids,
             tokens_mask=tokens_mask,
             segment_ids=segment_ids,
             is_random_next=is_random_next,
             masked_lm_positions=masked_lm_positions,
             masked_lm_labels=masked_lm_labels)
    print("[INFO] number of train data:", len(tokens_ids))
    print("[INFO] is_random_next ratio:",
          np.sum(pretraining_data['is_random_next']) / len(is_random_next))
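# A hedged usage sketch for create_pretraining_data_from_docs: the toy docs nesting
# (documents -> sentences) follows the docstring's contract, and the file paths below
# are placeholders.
docs = [
    ["The cat sat on the mat.", "It purred quietly."],
    ["BERT is a language model.", "It is pretrained with masked language modeling."],
]
create_pretraining_data_from_docs(
    docs,
    save_path="pretraining_data.npz",  # written with np.savez
    vocab_path="vocab.txt",            # BERT vocabulary file
    token_method="wordpiece",
    language="en",
    max_seq_length=128,
    dupe_factor=2,
    masked_lm_prob=0.15,
    max_predictions_per_seq=20,
)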
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

from tokenization import FullTokenizer

path = "./"
pd_all = pd.read_csv(os.path.join(path, "weibo_senti_100k.csv"))
tokenizer = FullTokenizer("vocab.txt")

pd_all = shuffle(pd_all)
x_data, y_data = pd_all.review.values, pd_all.label.values
x_data = [
    tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text)) for text in x_data
]
x_train, x_test, y_train, y_test = train_test_split(np.array(x_data),
                                                    y_data,
                                                    test_size=0.2)

max_features = 21128
# cut texts after this number of words (among top max_features most common words)
maxlen = 128
batch_size = 32

print('Loading data...')
# (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
class ModelServer:
    def __init__(self, param):
        self.model_path = os.path.abspath(param["model_path"])
        self.bert_config_file = os.path.abspath(param["bert_config_file"])
        bert_config = modeling.BertConfig.from_json_file(self.bert_config_file)
        self.fulltoken = FullTokenizer(os.path.abspath(param["vocab_file"]))
        self.vocab_dict = self.fulltoken.vocab

        target_start_ids = self.vocab_dict["[CLS]"]
        target_end_ids = self.vocab_dict["[SEP]"]

        num_gpus = len(os.environ["CUDA_VISIBLE_DEVICES"].split(','))
        tf.logging.info("num_gpus is {}".format(num_gpus))
        if param["use_mul_gpu"]:
            distribute = tf.contrib.distribute.MirroredStrategy(num_gpus=num_gpus)
        else:
            distribute = None
        run_config = tf.estimator.RunConfig(
            model_dir=os.path.abspath(self.model_path),
            save_summary_steps=200,
            keep_checkpoint_max=2,
            save_checkpoints_steps=3000,
            train_distribute=distribute,
            eval_distribute=distribute)
        self.input_max_seq_length = param["max_seq_length"]
        model_fn = model_fn_builder(
            bert_config,
            init_checkpoint=None,
            learning_rate=0.0001,
            num_train_steps=10000,
            num_warmup_steps=100,
            use_one_hot_embeddings=False,  # True when using a TPU
            input_seq_length=param["max_seq_length"],
            target_seq_length=param["max_target_seq_length"],
            target_start_ids=target_start_ids,
            target_end_ids=target_end_ids,
            batch_size=param["batch_size"],
            mode_type=param["mode_type"])
        self.estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config)

    # input: [(str_mask_tokens, str_labels), list_str_mask_words]
    # label 0: Not mentioned,
    #       1: Negative,
    #       2: Neutral,
    #       3: Positive
    def predict(self, inputs, limitNum=3):
        predicts = []
        if not isinstance(inputs, list):
            inputs = [inputs]

        def token_input():
            for input in inputs:
                tokens = input[0]
                labels = [int(label) for label in input[1]][:20]
                mask_words = input[2]
                assert max(labels) < 4 and min(labels) >= 0
                tokens = self.fulltoken.tokenize(tokens)[:self.input_max_seq_length - 2]

                def replace_Mask(tokens, mask_words):
                    mask_index = []
                    first_maskwords = [x[0] for x in mask_words]
                    for index, token in enumerate(tokens):
                        if token in first_maskwords:
                            for mask_words_x in mask_words:
                                if token == mask_words_x[0]:
                                    _token = "".join([
                                        _t.replace("#", '')
                                        for _t in tokens[index:index + len(mask_words_x)]
                                    ])
                                    if _token == mask_words_x:
                                        for i in range(len(mask_words_x)):
                                            mask_index.append(index + i)
                                        mask_words = [
                                            x_ for x_ in mask_words if x_ != mask_words_x
                                        ]
                                        first_maskwords = [x[0] for x in mask_words]
                        if len(mask_words) < 1:
                            break
                    for mask_index_ in mask_index:
                        tokens[mask_index_] = '[MASK]'
                    return tokens

                tokens = replace_Mask(tokens, mask_words)
                ids = self.fulltoken.convert_tokens_to_ids(['[CLS]'] + tokens + ['[SEP]'])
                input_mask = [1] * len(ids)
                segment_ids = [0] * self.input_max_seq_length
                while len(ids) < self.input_max_seq_length:
                    ids.append(0)
                    input_mask.append(0)
                while len(labels) < 20:
                    labels.append(0)
                yield ([ids], [input_mask], [labels], [segment_ids])

        def input_fn():
            dataset = tf.data.Dataset.from_generator(
                token_input, (tf.int64, tf.int64, tf.int64, tf.int64),
                output_shapes=(tf.TensorShape([None, self.input_max_seq_length]),
                               tf.TensorShape([None, self.input_max_seq_length]),
                               tf.TensorShape([None, 20]),
                               tf.TensorShape([None, self.input_max_seq_length])))
            dataset = dataset.map(
                lambda ids, input_mask, labels, segment_ids: {
                    "sentiment_labels": labels,
                    "input_token_ids": ids,
                    "input_mask": input_mask,
                    "target_token_ids": tf.zeros_like([1, 1]),
                    "target_mask": tf.zeros_like([1, 1]),
                    "segment_ids": segment_ids
                })
            # (ids, input_mask, labels, segment_ids) = dataset
            # features = {
            #     "sentiment_labels": labels,
            #     "input_token_ids": ids,
            #     "input_mask": input_mask,
            #     "target_token_ids": tf.zeros_like([1, 1]),
            #     "target_mask": tf.zeros_like([1, 1]),
            #     "segment_ids": segment_ids}
            # return features
            return dataset

        result = self.estimator.predict(input_fn=input_fn)
        for prediction in result:
            sample_id = prediction['sample_id'][:, :limitNum].T.tolist()
            ans = []
            for sample_id_ in sample_id:
                token = self.fulltoken.convert_ids_to_tokens(sample_id_)
                ans.append("".join(token[:-1]))
            predicts.append(ans)
            input = prediction['inputs'].tolist()
            print(self.fulltoken.convert_ids_to_tokens(input))
        return predicts
class BERTFunction(object):
    def __init__(self, bert_config_file, init_checkpoint, max_seq_length,
                 vocab_file, num_labels, use_gpu=False):
        # Needed to load the pre-trained parameters.
        self.bert_config = modeling.BertConfig.from_json_file(bert_config_file)
        self.init_checkpoint = init_checkpoint
        # Needed for the dataset and the computation.
        self.max_seq_length = max_seq_length
        self.num_labels = num_labels
        # Needed for data preprocessing.
        self.vocab_file = vocab_file
        self.tokenizer = FullTokenizer(self.vocab_file, do_lower_case=False)  # cased model by default
        # GPU
        self.use_gpu = use_gpu

        self.graph = tf.Graph()  # declare the computation graph
        with self.graph.as_default():
            # Define placeholders.
            self.input_ids = tf.placeholder(dtype=tf.int64,
                                            shape=(None, self.max_seq_length))
            self.input_mask = tf.placeholder(dtype=tf.int64,
                                             shape=(None, self.max_seq_length))
            self.segment_ids = tf.placeholder(dtype=tf.int64,
                                              shape=(None, self.max_seq_length))
            # Define the computation.
            (self.logits, self.probabilities) = create_predict_model(
                self.bert_config, self.input_ids, self.input_mask,
                self.segment_ids, self.num_labels)
            # Load the pre-trained parameters.
            self.tvars = tf.trainable_variables()  # trainable variables exist once the graph is built
            self.initialized_variable_names = {}
            if self.init_checkpoint:
                # init_checkpoint is the pre-trained BERT (or previously fine-tuned) .ckpt
                # file passed on the command line. get_assignment_map_from_checkpoint picks
                # up the variables shared between the checkpoint and the task graph.
                (self.assignment_map, self.initialized_variable_names
                 ) = modeling.get_assignment_map_from_checkpoint(
                     self.tvars, self.init_checkpoint)
                tf.train.init_from_checkpoint(self.init_checkpoint, self.assignment_map)

    def cal(self, text_a, text_b):
        features = self.process_input(text_a, text_b)
        config = None
        if self.use_gpu:
            config = tf.ConfigProto(log_device_placement=True)
            config.gpu_options.allow_growth = True
            print("trying to use gpu")
        else:
            print("using cpu")
        with tf.Session(graph=self.graph, config=config) as session:
            session.run(tf.global_variables_initializer())
            feed_dict = {
                self.input_ids: features["input_ids"],
                self.input_mask: features["input_mask"],
                self.segment_ids: features["segment_ids"]
            }
            prob = session.run(self.probabilities, feed_dict=feed_dict)
            print("prob: \n", prob)
            return prob

    def process_input(self, text_a, text_b):
        # A trimmed-down version of convert_single_example.
        tokens_a = self.tokenizer.tokenize(text_a)
        tokens_b = self.tokenizer.tokenize(text_b)
        # Equivalent of _truncate_seq_pair: leave room for [CLS] and two [SEP].
        while True:
            total_length = len(tokens_a) + len(tokens_b)
            if total_length <= self.max_seq_length - 3:
                break
            if len(tokens_a) > len(tokens_b):
                tokens_a.pop()
            else:
                tokens_b.pop()
        # Simplified convert_single_example.
        tokens = []
        segment_ids = []
        tokens.append("[CLS]")
        segment_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            segment_ids.append(0)
        tokens.append("[SEP]")
        segment_ids.append(0)
        for token in tokens_b:
            tokens.append(token)
            segment_ids.append(1)
        tokens.append("[SEP]")
        segment_ids.append(1)
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)
        while len(input_ids) < self.max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)
        assert len(input_ids) == self.max_seq_length
        assert len(input_mask) == self.max_seq_length
        assert len(segment_ids) == self.max_seq_length
        print("tokens: %s" % " ".join([printable_text(x) for x in tokens]))
        # print("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        # print("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        # print("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        return {
            "input_ids": nparray([input_ids], dtype=npint64),
            "input_mask": nparray([input_mask], dtype=npint64),
            "segment_ids": nparray([segment_ids], dtype=npint64)
        }


# Usage example
# NOTE: max_seq_length fixes the tensor shapes flowing through this graph; by the nature
# of the transformer itself, the sequence length could be handled more flexibly.
# import os
# pretrained_dir = "../pretrained/multi_cased_L-12_H-768_A-12/"
# init_checkpoint = os.path.join(pretrained_dir, "./bert_model.ckpt")
# bert_config_file = os.path.join(pretrained_dir, "./bert_config.json")
# vocab_file = os.path.join(pretrained_dir, "./vocab.txt")
# max_seq_length = 160
# num_labels = 2
#
# func = BERTFunction(bert_config_file, init_checkpoint, max_seq_length, vocab_file, num_labels)
# res = func.cal("I'm gogo, who are you?", "I'm Trump, I'm fine.")