def run(self):
    # set environment so this worker only sees its assigned GPU
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = str(self._gpuid)

    # load models
    # every worker only needs to load the model once
    paths = get_checkpoint_paths(self._bert_checkpoint)
    model = load_trained_model_from_checkpoint(
        config_file=paths.config,
        checkpoint_file=paths.checkpoint,
        output_layer_num=1,
    )
    vocabs = load_vocabulary(paths.vocab)
    print('model init done', self._gpuid)

    while True:
        xfile = self._queue.get()
        if xfile is None:
            # re-enqueue the sentinel so the other workers also stop
            self._queue.put(None)
            break
        embeddings = extract_embeddings(
            model=model,
            vocabs=vocabs,
            texts=xfile[1],
            output_layer_num=1,
            poolings=[POOL_NSP, POOL_MAX])
        print('worker running', self._gpuid, len(self.return_list))
        self.return_list.append({
            'worker': self._gpuid,
            'id': xfile[0],
            'content': xfile[1],
            'embeddings': embeddings
        })
    print('worker predict done at gpu:', self._gpuid)
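# A minimal driver sketch for the worker above, assuming it subclasses
# multiprocessing.Process and takes (queue, gpuid, bert_checkpoint, return_list)
# in its constructor; the class name BertWorker and that signature are
# assumptions, not part of the original code.
import multiprocessing as mp

if __name__ == '__main__':
    manager = mp.Manager()
    queue = manager.Queue()
    return_list = manager.list()

    workers = [BertWorker(queue, gpuid, './chinese_wwm_L-12_H-768_A-12', return_list)
               for gpuid in (0, 1)]
    for w in workers:
        w.start()

    # each queue item is an (id, texts) tuple, matching xfile[0] / xfile[1] above
    queue.put(('doc-0', ['语言模型', '深度学习']))
    queue.put(None)  # sentinel; workers re-enqueue it so every worker exits

    for w in workers:
        w.join()
    print(len(return_list), 'results collected')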
def download_pretrained_bert(language_backbone='chinese_wwm_base'):
    base_model_path = {
        'multi_cased_base': PretrainedList.multi_cased_base,
        'chinese_base': PretrainedList.chinese_base,
        'wwm_uncased_large': PretrainedList.wwm_uncased_large,
        'wwm_cased_large': PretrainedList.wwm_cased_large,
        'chinese_wwm_base': 'https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip',
        'bert_base_cased': 'https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip',
        'bert_large_cased': 'https://storage.googleapis.com/bert_models/2018_10_18/cased_L-24_H-1024_A-16.zip',
        'bert_base_uncased': 'https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip',
        'bert_large_uncased': 'https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-24_H-1024_A-16.zip'
    }
    model_path = get_pretrained(base_model_path[language_backbone.lower()])
    paths = get_checkpoint_paths(model_path)
    print(paths)
    return paths
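# Minimal usage sketch for download_pretrained_bert (the feature-extraction
# setup below is an assumption, not part of the original function): fetch the
# Chinese whole-word-masking checkpoint and load it with keras_bert.
if __name__ == '__main__':
    from keras_bert import load_trained_model_from_checkpoint

    paths = download_pretrained_bert('chinese_wwm_base')
    bert = load_trained_model_from_checkpoint(
        config_file=paths.config,
        checkpoint_file=paths.checkpoint,
        output_layer_num=1)
    bert.summary(line_length=120)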
def load_model(self):
    tf.keras.backend.clear_session()
    logging.info("Loading RuBERT model...")
    paths = get_checkpoint_paths("model_bert")
    inputs = load_trained_model_from_checkpoint(
        config_file=paths.config,
        checkpoint_file=paths.checkpoint,
        seq_len=50)
    outputs = MaskedGlobalMaxPool1D(name="Pooling")(inputs.output)
    vocab = load_vocabulary(paths.vocab)
    return tf.keras.Model(inputs=inputs.inputs, outputs=outputs), vocab, Tokenizer(vocab)
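# Hedged usage sketch for load_model (the variable name `embedder` stands in
# for an instance of the owning class and the sentence is arbitrary): encode
# one sentence and pool it to a single vector. max_len must match the
# seq_len=50 used when loading the checkpoint.
import numpy as np

model, vocab, tokenizer = embedder.load_model()
indices, segments = tokenizer.encode('пример предложения', max_len=50)
vector = model.predict([np.array([indices]), np.array([segments])])[0]
print(vector.shape)  # (hidden_size,) after the masked global max pooling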
def __init__(self, docs, vec):
    self.texts = np.array(docs)
    self.vec = vec
    paths = get_checkpoint_paths(".")
    inputs = load_trained_model_from_checkpoint(
        config_file=paths.config,
        checkpoint_file=paths.checkpoint,
        seq_len=50)
    outputs = MaskedGlobalMaxPool1D(name='Pooling')(inputs.output)
    self.model = Model(inputs=inputs.inputs, outputs=outputs)
    self.vocab = load_vocabulary(paths.vocab)
    self.tokenizer = Tokenizer(self.vocab)
def __init__(self, config):
    model_path = config["model_path"]
    if not os.path.exists(model_path):
        model_dir = os.path.dirname(model_path)
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)
        subprocess.run(
            f"wget -P {model_dir} {MODEL_URL} && cd {model_dir} && unzip chinese_wwm_L-12_H-768_A-12.zip",
            shell=True)
    paths = get_checkpoint_paths(model_path)
    self.model = load_trained_model_from_checkpoint(
        config_file=paths.config,
        checkpoint_file=paths.checkpoint,
        output_layer_num=1)
    self.vocabs = load_vocabulary(paths.vocab)
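# Hedged companion sketch (the method name `extract` is an assumption, not in
# the original class): reuse the loaded checkpoint and vocabulary with
# keras_bert's extract_embeddings helper, mirroring the worker further above.
def extract(self, texts):
    from keras_bert import extract_embeddings, POOL_NSP, POOL_MAX
    return extract_embeddings(
        model=self.model,
        vocabs=self.vocabs,
        texts=texts,
        output_layer_num=1,
        poolings=[POOL_NSP, POOL_MAX])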
def load():
    global test_tables
    test_table_file = '../data/val.tables.json'
    bert_model_path = '../model'
    test_tables = read_tables(test_table_file)
    paths = get_checkpoint_paths(bert_model_path)

    global label_encoder
    label_encoder = SqlLabelEncoder()

    global query_tokenizer
    model, query_tokenizer = construct_model(paths)
    model_path = '../task1_best_model.h5'
    model.load_weights(model_path)

    global tokenizer
    model2, tokenizer = construct_model2(paths)
    model2.load_weights('../model_best_weights.h5')

    global models
    models = {}
    models['stage1'] = model
    models['stage2'] = model2

    global graph
    graph = tf.get_default_graph()
# In[ ]:

train_table_file = '../data/train/train.tables.json'
train_data_file = '../data/train/train.json'

val_table_file = '../data/val/val.tables.json'
val_data_file = '../data/val/val.json'

test_table_file = '../data/test/test.tables.json'
test_data_file = '../data/test/test.json'

# Download pretrained BERT model from https://github.com/ymcui/Chinese-BERT-wwm
bert_model_path = '../model/chinese_wwm_L-12_H-768_A-12'
paths = get_checkpoint_paths(bert_model_path)

task1_file = '../submit/task1_output.json'

# ## Read Data

# In[ ]:

train_tables = read_tables(train_table_file)
train_data = read_data(train_data_file, train_tables)

val_tables = read_tables(val_table_file)
val_data = read_data(val_data_file, val_tables)

test_tables = read_tables(test_table_file)
test_data = read_data(test_data_file, test_tables)
import sys

import numpy as np

from keras_bert import load_vocabulary, load_trained_model_from_checkpoint, Tokenizer, get_checkpoint_paths

print('This demo demonstrates how to load the pre-trained model and extract word embeddings')

if len(sys.argv) == 2:
    model_path = sys.argv[1]
else:
    from keras_bert.datasets import get_pretrained, PretrainedList
    model_path = get_pretrained(PretrainedList.chinese_base)

paths = get_checkpoint_paths(model_path)

model = load_trained_model_from_checkpoint(paths.config, paths.checkpoint, seq_len=10)
model.summary(line_length=120)

token_dict = load_vocabulary(paths.vocab)

tokenizer = Tokenizer(token_dict)
text = '语言模型'
tokens = tokenizer.tokenize(text)
print('Tokens:', tokens)
indices, segments = tokenizer.encode(first=text, max_len=10)

predicts = model.predict([np.array([indices]), np.array([segments])])[0]
for i, token in enumerate(tokens):
    # print the first few embedding dimensions for each token
    print(token, predicts[i].tolist()[:5])
#!/usr/bin/python3
# import os
# os.environ['TF_KERAS'] = '1'
# import tensorflow as tf
import keras
from keras import backend as K
from keras_bert import load_vocabulary, Tokenizer, get_checkpoint_paths, load_model_weights_from_checkpoint
from keras_bert.layers import TokenEmbedding, PositionEmbedding
import json
from data_generator import load_data, convert_to_sample, DataGenerator
import numpy as np
from tqdm import tqdm

pretrained_path = "/Users/weisu.yxd/Code/bert/chinese_L-12_H-768_A-12"
# pretrained_path = "chinese_L-12_H-768_A-12"
paths = get_checkpoint_paths(pretrained_path)
token_dict = load_vocabulary(paths.vocab)
mask_id = token_dict.get("[MASK]")
tokenizer = Tokenizer(token_dict)
id2token = {j: i for i, j in token_dict.items()}
char_start_index = 670
char_end_index = 7991


def get_model_from_embedding(inputs, embed_layer, transformer_num=12, head_num=12,
                             feed_forward_dim=3072, dropout_rate=0.1,
from nl2sql_model.utils import read_data, read_tables, SQL, MultiSentenceTokenizer, Query, Question, Table
from nl2sql_model.utils.optimizer import RAdam

# --------------------------------- Load data ---------------------------------
train_table_file = 'G:\\datas\\nl2sql\\TableQA-master\\train\\train.tables.json'
train_data_file = 'G:\\datas\\nl2sql\\TableQA-master\\train\\train.json'

val_table_file = 'G:\\datas\\nl2sql\\TableQA-master\\val\\val.tables.json'
val_data_file = 'G:\\datas\\nl2sql\\TableQA-master\\val\\val.json'

test_table_file = 'G:\\datas\\nl2sql\\TableQA-master\\test\\test.tables.json'
test_data_file = 'G:\\datas\\nl2sql\\TableQA-master\\test\\test.json'

# Download pretrained BERT model from https://github.com/ymcui/Chinese-BERT-wwm
bert_model_path = 'G:\\datas\\nl2sql\\chinese_wwm_L-12_H-768_A-12'
paths = get_checkpoint_paths(bert_model_path)  # resolves the config, checkpoint and vocab files under this directory

# --------------------------------- Preprocess data ---------------------------------
train_tables = read_tables(train_table_file)  # load and join the table definitions
train_data = read_data(train_data_file, train_tables)  # associate each question/SQL pair with its table

val_tables = read_tables(val_table_file)
val_data = read_data(val_data_file, val_tables)

test_tables = read_tables(test_table_file)
test_data = read_data(test_data_file, test_tables)

sample_query = train_data[2]
    output = keras.layers.Conv1D(32, 2, activation='tanh')(output)
    output = keras.layers.AveragePooling1D(2, strides=1)(output)
    output = keras.layers.Conv1D(64, 3, activation='tanh')(output)
    output = keras.layers.AveragePooling1D(2, strides=1)(output)
    output = keras.layers.Conv1D(64, 4, activation='tanh')(output)
    output = keras.layers.AveragePooling1D(4, strides=1)(output)
    output = keras.layers.Flatten()(output)
    output_y = keras.layers.Dense(count, activation='softmax')(output)  # new softmax layer
    model = keras.Model(base_model.input, output_y)
    # summarize the model
    model.summary()
    return model


checkpoint_paths = keras_bert.get_checkpoint_paths('./chinese_L-12_H-768_A-12')
token_dict = keras_bert.loader.load_vocabulary(checkpoint_paths.vocab)
tokenizer = keras_bert.tokenizer.Tokenizer(token_dict)

# define documents
max_labels = 0
x_tokens = []
x_segments = []
y = []
labels = []

with open('./datas/questions.json') as fp:
    loaded_json = json.load(fp)
    for doc in loaded_json:
        labels.append(doc['label'])
        for q in doc['questions']:
def load_bert_model(self, model_path):
    paths = get_checkpoint_paths(model_path)
    self.bert_model = load_trained_model_from_checkpoint(
        paths.config,
        paths.checkpoint,
        training=False,
        seq_len=self.max_seq_len)
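# Hedged companion sketch (the method name `encode_text` and the externally
# built tokenizer are assumptions, not part of the original class): run one
# sentence through the checkpoint loaded above and return its token features.
def encode_text(self, text, tokenizer):
    import numpy as np
    indices, segments = tokenizer.encode(text, max_len=self.max_seq_len)
    return self.bert_model.predict([np.array([indices]), np.array([segments])])[0]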
inputs = np.load('../data/coco_korean/coco_korean_tokens.npy')
# inputs = tf.convert_to_tensor(tokens, dtype=tf.int32)
print(inputs.shape)
print(inputs[:2])
segments = np.ones_like(inputs)

# model = BertModel(config, False, inputs)
ckpt = '../bert_eojeol/'
# model = tf.keras.Model()
# checkpoint = tf.train.Checkpoint(model=model)
# checkpoint.restore(tf.train.latest_checkpoint(ckpt))
# print(model.layers)

paths = get_checkpoint_paths(ckpt)
model = load_trained_model_from_checkpoint(paths.config, paths.checkpoint, training=False, seq_len=142)
# model.summary()
# model.save('koreanbert.h5')

dataset = tf.data.Dataset.from_tensor_slices((inputs, segments))
dataset = dataset.batch(5000)

outputs = np.empty((inputs.shape[0], 768))
i = 0
for data in dataset:
    inp = data[0]
    seg = data[1]