def run(self):
    # set environment
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = str(self._gpuid)

    # load models
    # every worker only needs to load the model once
    paths = get_checkpoint_paths(self._bert_checkpoint)
    model = load_trained_model_from_checkpoint(
        config_file=paths.config,
        checkpoint_file=paths.checkpoint,
        output_layer_num=1,
    )
    vocabs = load_vocabulary(paths.vocab)
    print('model init done', self._gpuid)

    while True:
        xfile = self._queue.get()
        if xfile is None:
            self._queue.put(None)
            break
        embeddings = extract_embeddings(model=model,
                                        vocabs=vocabs,
                                        texts=xfile[1],
                                        output_layer_num=1,
                                        poolings=[POOL_NSP, POOL_MAX])
        print('worker running', self._gpuid, len(self.return_list))
        self.return_list.append({
            'worker': self._gpuid,
            'id': xfile[0],
            'content': xfile[1],
            'embeddings': embeddings
        })

    print('worker predict done at gpu:', self._gpuid)
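The worker above pulls (id, texts) batches from a shared queue and exits when it sees a None sentinel, re-enqueueing the sentinel so sibling workers also stop. A minimal driver sketch for feeding such workers; the Worker class name and constructor signature are assumptions, not from the source:

from multiprocessing import Manager

def run_workers(batches, bert_checkpoint, gpu_ids):
    manager = Manager()
    queue = manager.Queue()
    return_list = manager.list()
    for batch in batches:   # each batch is an (id, list_of_texts) tuple
        queue.put(batch)
    queue.put(None)         # sentinel: each worker re-enqueues it before exiting
    # Worker is a hypothetical Process subclass whose run() is the method above
    workers = [Worker(gpu_id, queue, return_list, bert_checkpoint)
               for gpu_id in gpu_ids]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
    return list(return_list)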
def construct_model2(paths, use_multi_gpus=True):
    token_dict = load_vocabulary(paths.vocab)
    tokenizer = SimpleTokenizer(token_dict)
    bert_model = load_trained_model_from_checkpoint(paths.config,
                                                    paths.checkpoint,
                                                    seq_len=None)
    for l in bert_model.layers:
        l.trainable = True

    x1_in = Input(shape=(None, ), name='input_x1', dtype='int32')
    x2_in = Input(shape=(None, ), name='input_x2')

    x = bert_model([x1_in, x2_in])
    x_cls = Lambda(lambda x: x[:, 0])(x)
    y_pred = Dense(1, activation='sigmoid', name='output_similarity')(x_cls)

    model = Model([x1_in, x2_in], y_pred)
    if use_multi_gpus:
        print('using multi-gpus')
        model = multi_gpu_model(model, gpus=2)

    model.compile(loss={'output_similarity': 'binary_crossentropy'},
                  optimizer=Adam(1e-5),
                  metrics={'output_similarity': 'accuracy'})
    return model, tokenizer
def init_bert(self, config):
    bert_config = config['bert']['config_json']
    bert_checkpoint = config['bert']['bert_ckpt']
    bert_vocab = config['bert']['bert_vocab']
    bert_vocabs = load_vocabulary(bert_vocab)
    self.bert_token = Tokenizer(bert_vocabs)
    self.bert = self.load_bert(bert_config, bert_checkpoint)
def construct_model(paths, use_multi_gpus=False):
    token_dict = load_vocabulary(paths.vocab)
    tokenizer = SimpleTokenizer(token_dict)
    bert_model = load_trained_model_from_checkpoint(paths.config,
                                                    paths.checkpoint,
                                                    seq_len=None)
    for l in bert_model.layers:
        l.trainable = True

    # x1 is the concatenation of a QuestionCondPair's question and cond_text
    # fields; x2 holds the segment_ids of that concatenation. x2 has the same
    # length as x1: positions covering the question are 0, positions covering
    # cond_text are 1.
    # (question is the query text; cond_text is the query condition rendered as
    # text, e.g. "影片名称是密室逃生" -- "the film title is Escape Room")
    # y is the probability that cond_text is a query condition contained in
    # question.
    # x1, x2 and y are all built in QuestionCondPairsDataseq.__getitem__.
    x1_in = Input(shape=(None, ), name='input_x1', dtype='int32')
    x2_in = Input(shape=(None, ), name='input_x2')

    x = bert_model([x1_in, x2_in])
    x_cls = Lambda(lambda x: x[:, 0])(x)  # take the first element ([CLS]) of the BERT output sequence
    y_pred = Dense(1, activation='sigmoid', name='output_similarity')(x_cls)

    model = Model([x1_in, x2_in], y_pred)
    if use_multi_gpus:
        print('using multi-gpus')
        model = multi_gpu_model(model, gpus=2)

    model.compile(loss={'output_similarity': 'binary_crossentropy'},
                  optimizer=Adam(1e-5),
                  metrics={'output_similarity': 'accuracy'})
    return model, tokenizer
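Since x2 is just the segment ids of the question/cond_text concatenation, a single pair can be scored roughly as follows, assuming SimpleTokenizer follows keras_bert's encode(first, second) convention of returning (token_ids, segment_ids); the strings are illustrative only:

import numpy as np

model, tokenizer = construct_model(paths)
question = '第四周密室逃生的票房占比是多少'   # illustrative query text
cond_text = '影片名称是密室逃生'              # candidate condition rendered as text
x1, x2 = tokenizer.encode(first=question, second=cond_text)
prob = model.predict([np.array([x1]), np.array([x2])])[0][0]
print('P(cond_text is a condition of the question):', prob)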
def load_task2_testX(dict_path, data_dir):
    if not os.path.exists(os.path.join(data_dir, 'task2_testX.npy')) \
            or not os.path.exists(os.path.join(data_dir, 'task2_test_seg.npy')):
        df = pd.read_csv(os.path.join(data_dir, 'task2_public_testset.csv'),
                         dtype=str)
        abstract = df.values[:, 2]

        # collect words
        token_dict = load_vocabulary(dict_path)
        tokenizer = Tokenizer(token_dict)

        input_data = []
        input_seg = []
        seq_len = 512  # the maximum would be 638, but BERT-base only supports up to 512
        for i in tqdm(abstract):
            j = i.replace('$$$', ' ')
            idx, seg = tokenizer.encode(j, max_len=seq_len)
            input_data.append(idx)
            input_seg.append(seg)

        X = np.asarray(input_data)
        seg = np.asarray(input_seg)
        np.save(os.path.join(data_dir, 'task2_testX.npy'), X)
        np.save(os.path.join(data_dir, 'task2_test_seg.npy'), seg)
    else:
        X = np.load(os.path.join(data_dir, 'task2_testX.npy'))
        seg = np.load(os.path.join(data_dir, 'task2_test_seg.npy'))
    return X, seg
def init_all(self, config):
    if self.train:
        bert_config = config['bert']['config_json']
        bert_checkpoint = config['bert']['bert_ckpt']
        bert_vocab = config['bert']['bert_vocab']
        bert_vocabs = load_vocabulary(bert_vocab)
        self.bert_token = Tokenizer(bert_vocabs)
        self.bert = self.init_bert(bert_config, bert_checkpoint)
    self.get_sentence(config['train_list'] if self.train else config['eval_list'],
                      training=self.train)
def tokenizer_init(self):
    pretrained_path = 'uncased_L-12_H-768_A-12'
    config_path = os.path.join(pretrained_path, 'bert_config.json')
    model_path = os.path.join(pretrained_path, 'bert_model.ckpt')
    vocab_path = os.path.join(pretrained_path, 'vocab.txt')
    token_dict = load_vocabulary(vocab_path)
    print("Total vocabulary loaded: {}".format(len(token_dict)))
    self.tokenizer = Tokenizer(token_dict)
def load_model(self):
    tf.keras.backend.clear_session()
    logging.info("Loading RuBERT model...")
    paths = get_checkpoint_paths("model_bert")
    inputs = load_trained_model_from_checkpoint(config_file=paths.config,
                                                checkpoint_file=paths.checkpoint,
                                                seq_len=50)
    outputs = MaskedGlobalMaxPool1D(name="Pooling")(inputs.output)
    vocab = load_vocabulary(paths.vocab)
    return tf.keras.Model(inputs=inputs.inputs, outputs=outputs), vocab, Tokenizer(vocab)
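A sketch of how the returned triple might be used to embed one sentence; `obj` stands for an instance of the owning class (an assumption), and max_len matches the seq_len=50 the model was built with:

import numpy as np

model, vocab, tokenizer = obj.load_model()
indices, segments = tokenizer.encode('привет мир', max_len=50)
pooled = model.predict([np.array([indices]), np.array([segments])])[0]
print(pooled.shape)  # one max-pooled vector for the whole sentence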
def load_pretrained(options):
    model = load_trained_model_from_checkpoint(
        options.bert_config_file,
        options.init_checkpoint,
        training=False,
        trainable=True,
        seq_len=options.max_seq_length,
    )
    vocab = load_vocabulary(options.vocab_file)
    print('vocab size', len(vocab))
    return model, vocab
def pretrain_model():
    df = pd.read_csv('../data/task2_trainset.csv', dtype=str)
    df_2 = pd.read_csv('../data/task2_public_testset.csv', dtype=str)
    abstract_1 = df.values[:, 2]
    abstract_2 = df_2.values[:, 2]

    token_dict = load_vocabulary(dict_path)
    token_list = list(token_dict.keys())
    tokenizer = Tokenizer(token_dict)

    X_1 = collect_inputs(abstract_1, tokenizer)
    X_2 = collect_inputs(abstract_2, tokenizer)
    X = X_1 + X_2
    print(len(X))

    model = load_trained_model_from_checkpoint(config_path,
                                               checkpoint_path,
                                               training=True,
                                               trainable=True,
                                               seq_len=512)
    compile_model(model)

    def _generator():
        while True:
            yield gen_batch_inputs(generate_input_by_batch(X),
                                   token_dict,
                                   token_list,
                                   seq_len=512,
                                   mask_rate=0.3)

    opt_filepath = sys.argv[1]
    checkpoint = ModelCheckpoint(opt_filepath,
                                 monitor='val_loss',
                                 verbose=1,
                                 save_best_only=True,
                                 mode='min',
                                 save_weights_only=True)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                                  factor=0.5,
                                  patience=10,
                                  verbose=1,
                                  mode='auto',
                                  min_delta=0.1,
                                  cooldown=10,
                                  min_lr=1e-10)
    es = EarlyStopping(monitor='val_loss', patience=50)
    callbacks_list = [checkpoint, es, reduce_lr]

    model.fit_generator(generator=_generator(),
                        steps_per_epoch=500,
                        epochs=5000,
                        validation_data=_generator(),
                        validation_steps=200,
                        callbacks=callbacks_list)
def __init__(self, docs, vec):
    self.texts = np.array(docs)
    self.vec = vec
    paths = get_checkpoint_paths(".")
    inputs = load_trained_model_from_checkpoint(config_file=paths.config,
                                                checkpoint_file=paths.checkpoint,
                                                seq_len=50)
    outputs = MaskedGlobalMaxPool1D(name='Pooling')(inputs.output)
    self.model = Model(inputs=inputs.inputs, outputs=outputs)
    self.vocab = load_vocabulary(paths.vocab)
    self.tokenizer = Tokenizer(self.vocab)
def __init__(self):
    import keras
    import keras_bert
    from driver_amount import addh

    bert_model, bert_model_config = keras_bert.build_model_from_config(
        addh + config.BERT_CONFIG_PATH, trainable=False)
    output = bert_model.get_layer("Embedding-Norm").output
    self.model = keras.models.Model(bert_model.input, output)
    self.model.load_weights(addh + config.MODEL_PATH, by_name=True)
    self.vocab = keras_bert.load_vocabulary(addh + config.BERT_VOCAB_PATH)
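A hedged companion sketch of how the class above might expose its truncated model: an embed() method (the method name and max_len are assumptions) that tokenizes with self.vocab and returns the per-token output of the "Embedding-Norm" layer:

def embed(self, text, max_len=32):
    import numpy as np
    from keras_bert import Tokenizer
    tokenizer = Tokenizer(self.vocab)
    indices, segments = tokenizer.encode(text, max_len=max_len)
    # shape (max_len, hidden_size): one normalized embedding per token position
    return self.model.predict([np.array([indices]), np.array([segments])])[0]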
def __init__(self, config):
    model_path = config["model_path"]
    if not os.path.exists(model_path):
        model_dir = os.path.dirname(model_path)
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)
        subprocess.run(
            f"wget -P {model_dir} {MODEL_URL} && cd {model_dir} && unzip chinese_wwm_L-12_H-768_A-12.zip",
            shell=True)
    paths = get_checkpoint_paths(model_path)
    self.model = load_trained_model_from_checkpoint(config_file=paths.config,
                                                    checkpoint_file=paths.checkpoint,
                                                    output_layer_num=1)
    self.vocabs = load_vocabulary(paths.vocab)
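With model and vocabs loaded this way, embeddings can be extracted the same way the first snippet on this page does; a short sketch, where Embedder is a hypothetical name for the class above and the texts are illustrative:

from keras_bert import extract_embeddings

encoder = Embedder(config)   # hypothetical name for the class above
embeddings = extract_embeddings(model=encoder.model, vocabs=encoder.vocabs,
                                texts=['中国的首都是北京', '今天天气很好'],
                                output_layer_num=1)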
def load_task2_trainXY(dict_path, data_dir):
    if not os.path.exists(os.path.join(data_dir, 'task2_trainX.npy')) \
            or not os.path.exists(os.path.join(data_dir, 'task2_trainY.npy')) \
            or not os.path.exists(os.path.join(data_dir, 'task2_train_seg.npy')):
        df = pd.read_csv(os.path.join(data_dir, 'task2_trainset.csv'), dtype=str)
        cate = df.values[:, -1]

        # generating Y
        Y = np.zeros((cate.shape[0], 4))
        name = {
            'THEORETICAL': 0,
            'ENGINEERING': 1,
            'EMPIRICAL': 2,
            'OTHERS': 3
        }
        for i in range(cate.shape[0]):
            for c in cate[i].split(' '):
                Y[i, name[c]] += 1

        # generating X
        abstract = df.values[:, 2]

        # collect words
        token_dict = load_vocabulary(dict_path)
        tokenizer = Tokenizer(token_dict)

        input_data = []
        input_seg = []
        for i in tqdm(abstract):
            j = i.replace('$$$', ' ')
            idx, seg = tokenizer.encode(j, max_len=512)
            input_data.append(idx)
            input_seg.append(seg)

        X = np.array(input_data)
        seg = np.array(input_seg)
        np.save(os.path.join(data_dir, 'task2_trainX.npy'), X)
        np.save(os.path.join(data_dir, 'task2_trainY.npy'), Y)
        np.save(os.path.join(data_dir, 'task2_train_seg.npy'), seg)
    else:
        X = np.load(os.path.join(data_dir, 'task2_trainX.npy'))
        Y = np.load(os.path.join(data_dir, 'task2_trainY.npy'))
        seg = np.load(os.path.join(data_dir, 'task2_train_seg.npy'))
    return X, Y, seg
def __init__(self, batch_size, gpu_num, gpu_name):
    gpu_option(gpu_name, gpu_num)
    self.batch_size = batch_size
    print("##### load KerasBERT start #####")

    # Path
    model_path = 'models/BERT/pretrained_model/uncased_L-24_H-1024_A-16'
    config_path = os.path.join(model_path, 'bert_config.json')
    checkpoint_path = os.path.join(model_path, 'bert_model.ckpt')
    vocab_path = os.path.join(model_path, 'vocab.txt')

    token_dict = load_vocabulary(vocab_path)
    model = load_trained_model_from_checkpoint(config_path, checkpoint_path)
    if gpu_num >= 2:
        self.par_model = multi_gpu_model(model, gpus=gpu_num)
    else:
        self.par_model = model
    self.tokenizer = Tokenizer(token_dict)
    print("##### load KerasBERT end #####")
def __init__(self, gpu_name, gpu_num, seq_max_len, batch_size):
    print('--' * 10 + ' Load BERT model start ' + '--' * 10)
    gpu_option(gpu_name, gpu_num)
    self.seq_max_len = seq_max_len  # same as in training
    self.batch_size = batch_size

    model_path = 'models/BERT/pretrained_model/uncased_L-24_H-1024_A-16'
    vocab_path = os.path.join(model_path, 'vocab.txt')

    # load Tokenizer
    token_dict = load_vocabulary(vocab_path)
    self.tokenizer = Tokenizer(token_dict)

    MODEL_SAVE_PATH = 'models/BERT/fine_tune_model/bert_fine_tune.hdf5'
    model = load_model(MODEL_SAVE_PATH,
                       custom_objects=get_custom_objects(),
                       compile=False)
    if gpu_num >= 2:
        self.par_model = multi_gpu_model(model, gpus=gpu_num)
    else:
        self.par_model = model
    print('--' * 10 + ' Load BERT model end ' + '--' * 10)
def preprocess(char_seqs, tag_seqs, vocab_file, SEQ_LEN=512, cased=True,
               tag_vocab=None, TAG_PAD=''):
    from keras_bert import load_vocabulary

    # Load vocab
    vocab = load_vocabulary(vocab_file)

    # preprocess char_seqs
    token_id_seqs = preprocess_char(char_seqs, vocab, SEQ_LEN, cased)

    # create segment_seqs
    segment_seqs = create_segment(len(token_id_seqs), len(token_id_seqs[0]))

    # preprocess tag_seqs
    one_hot_tag_id_seqs, tag_vocab = preprocess_tag(tag_seqs, SEQ_LEN,
                                                    tag_vocab, TAG_PAD)

    return token_id_seqs, segment_seqs, one_hot_tag_id_seqs, tag_vocab
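A toy call of preprocess; the vocab path and the BIO-style tags are illustrative assumptions, and preprocess_char/preprocess_tag/create_segment come from the same module as the function above:

char_seqs = [['今', '天', '天', '气', '好'], ['你', '好']]
tag_seqs = [['O', 'O', 'B-W', 'I-W', 'O'], ['O', 'O']]
token_id_seqs, segment_seqs, tag_id_seqs, tag_vocab = preprocess(
    char_seqs, tag_seqs, 'vocab.txt', SEQ_LEN=16)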
        data.append([row[0], row[1], int(row[2])])
    return data


# load data, generate train data and valid data
all_data = read_data(data_path)
valid_num = int(len(all_data) * valid_data_ratio)
train_num = len(all_data) - valid_num
train_data = all_data[:train_num]
valid_data = all_data[train_num:]
print('data number:', len(all_data))
print('train data number:', len(train_data))
print('valid data number:', len(valid_data))

# load Tokenizer
token_dict = load_vocabulary(vocab_path)
tokenizer = Tokenizer(token_dict)


class data_generator:
    def __init__(self, data, batch_size):
        self.data = data
        self.batch_size = batch_size
        self.steps = len(self.data) // self.batch_size
        if len(self.data) % self.batch_size != 0:
            self.steps += 1

    def __len__(self):
        return self.steps

    def __iter__(self):
)

if len(sys.argv) == 2:
    model_path = sys.argv[1]
else:
    from keras_bert.datasets import get_pretrained, PretrainedList
    model_path = get_pretrained(PretrainedList.chinese_base)

paths = get_checkpoint_paths(model_path)

model = load_trained_model_from_checkpoint(paths.config, paths.checkpoint, seq_len=10)
model.summary(line_length=120)

token_dict = load_vocabulary(paths.vocab)

tokenizer = Tokenizer(token_dict)
text = '语言模型'
tokens = tokenizer.tokenize(text)
print('Tokens:', tokens)
indices, segments = tokenizer.encode(first=text, max_len=10)

predicts = model.predict([np.array([indices]), np.array([segments])])[0]
for i, token in enumerate(tokens):
    print(token, predicts[i].tolist()[:5])

"""Official outputs: {
  "linex_index": 0,
  "features": [
    {
if len(sys.argv) != 2:
    print('python load_model.py UNZIPPED_MODEL_PATH')
    sys.exit(-1)

print('This demo demonstrates how to load the pre-trained model and extract the sentence embedding with pooling.')

model_path = sys.argv[1]
config_path = os.path.join(model_path, 'bert_config.json')
checkpoint_path = os.path.join(model_path, 'bert_model.ckpt')
dict_path = os.path.join(model_path, 'vocab.txt')

model = load_trained_model_from_checkpoint(config_path, checkpoint_path, seq_len=10)
pool_layer = MaskedGlobalMaxPool1D(name='Pooling')(model.output)
model = keras.models.Model(inputs=model.inputs, outputs=pool_layer)
model.summary(line_length=120)

token_dict = load_vocabulary(dict_path)
tokenizer = Tokenizer(token_dict)

text = '语言模型'
tokens = tokenizer.tokenize(text)
print('Tokens:', tokens)
indices, segments = tokenizer.encode(first=text, max_len=10)

predicts = model.predict([np.array([indices]), np.array([segments])])[0]
print('Pooled:', predicts.tolist()[:5])
z_labels = to_categorical(z_labels, num_classes=num_classes)

# convert the data into the input format the model expects
train_data = [
    np.array([x_data[i].replace(' ', ''), x_labels[i]])
    for i in range(len(x_labels))
]
valid_data = [
    np.array([y_data[i].replace(' ', ''), y_labels[i]])
    for i in range(len(y_labels))
]
test_data = [
    np.array([z_data[i].replace(' ', ''), z_labels[i]])
    for i in range(len(z_labels))
]

# load the vocabulary
token_dict = load_vocabulary(BertConfig.dict_path)


# override the tokenizer
class OurTokenizer(Tokenizer):
    def _tokenize(self, text):
        R = []
        for c in text:
            if c in self._token_dict:
                R.append(c)
            elif self._is_space(c):
                R.append('[unused1]')  # use [unused1] for whitespace-like characters
            else:
                R.append('[UNK]')  # characters not in the vocabulary become [UNK]
        return R
def test_load_vocabulary(self):
    current_path = os.path.dirname(os.path.abspath(__file__))
    vocab_path = os.path.join(current_path, 'test_checkpoint', 'vocab.txt')
    token_dict = load_vocabulary(vocab_path)
    self.assertEqual(15, len(token_dict))
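The assertion works because load_vocabulary numbers tokens by their line order in the vocab file, so the dict size equals the line count. A minimal illustration (the file contents here are made up):

from keras_bert import load_vocabulary

with open('tiny_vocab.txt', 'w', encoding='utf-8') as f:
    f.write('[PAD]\n[UNK]\n[CLS]\n[SEP]\nhello\n')
token_dict = load_vocabulary('tiny_vocab.txt')
assert len(token_dict) == 5
assert token_dict['[CLS]'] == 2  # indices follow line order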
def train(dataset='weibo'):
    if dataset == 'weibo':
        pretrained_path = '/XXXX/corpus/chinese_L-12_H-768_A-12'  # for Chinese in weibo
    elif dataset == 'Twitter':
        pretrained_path = '/XXXX/corpus/uncased_L-12_H-768_A-12'  # for English in Twitter
    else:
        raise ValueError('ERROR! dataset must be weibo or Twitter!')

    config_path = '{}/bert_config.json'.format(pretrained_path)
    checkpoint_path = '{}/bert_model.ckpt'.format(pretrained_path)
    vocab_path = '{}/vocab.txt'.format(pretrained_path)

    token_dict = load_vocabulary(vocab_path)
    tokenizer = Tokenizer(token_dict)
    model = load_trained_model_from_checkpoint(config_path, checkpoint_path)
    model.summary(line_length=120)

    ############################################
    # get formatted data from the chosen dataset
    if dataset == 'weibo':
        matrix_save_dir = './weibo_dataset'
        train_t_m, train_i_m, train_l, train_el = get_weibo_matrix('train', tokenizer)
        test_t_m, test_i_m, test_l, test_el = get_weibo_matrix('test', tokenizer)
    else:
        matrix_save_dir = './Twitter_dataset'
        train_t_m, train_i_m, train_l = get_twitter_matrix('train', tokenizer)
        test_t_m, test_i_m, test_l = get_twitter_matrix('test', tokenizer)

    train_text_matrix = []
    for b in tqdm.tqdm(train_t_m):
        results = model.predict([b, np.array([0 for i in range(seq_len)])])[0]
        train_text_matrix.append(results)
    train_text_matrix = np.array(train_text_matrix)

    test_text_matrix = []
    for b in tqdm.tqdm(test_t_m):
        b = np.expand_dims(np.array(b), axis=0)
        results = model.predict([b, np.array([0 for i in range(seq_len)])])[0]
        test_text_matrix.append(results)
    test_text_matrix = np.array(test_text_matrix)

    train_t_m = np.array(train_t_m)
    train_i_m = np.array(train_i_m)
    train_l = np.array(train_l)
    test_t_m = np.array(test_t_m)
    test_i_m = np.array(test_i_m)
    test_l = np.array(test_l)

    print('4. train text:', train_t_m.shape)
    print('train text emb:', train_text_matrix.shape)
    print('train image emb:', train_i_m.shape)
    print('train label emb:', train_l.shape)
    print('5. test text:', test_t_m.shape)
    print('test text emb:', test_text_matrix.shape)
    print('test image emb:', test_i_m.shape)
    print('test labels emb:', test_l.shape)

    np.save('{}/train_text'.format(matrix_save_dir), train_t_m)
    np.save('{}/train_text_embed'.format(matrix_save_dir), train_text_matrix)
    np.save('{}/train_image_embed'.format(matrix_save_dir), train_i_m)
    np.save('{}/train_label'.format(matrix_save_dir), train_l)
    np.save('{}/test_text'.format(matrix_save_dir), test_t_m)
    np.save('{}/test_text_embed'.format(matrix_save_dir), test_text_matrix)
    np.save('{}/test_image_embed'.format(matrix_save_dir), test_i_m)
    np.save('{}/test_label'.format(matrix_save_dir), test_l)
def encode(self, query: Query, col_orders=None):
    tokens, tokens_lens = self.tokenize(query, col_orders)
    # Example tokens: ['[CLS]', '我', '想', '你', '帮', '我', '查', '一', '下', '第',
    # '四', '周', '大', '黄', '蜂', ',', '还', '有', '密', '室', '逃', '生', '这',
    # '两', '部', '电', '影', '票', '房', '的', '占', '比', '加', '起', '来', '会',
    # '是', '多', '少', '来', '着', '[SEP]', '[unused11]', '影', '片', '名', '称',
    # '[SEP]', '[unused12]', '周', '票', '房', '[SEP]', '[unused12]', '票', '房',
    # '占', '比', '[SEP]', '[unused12]', '场', '均', '人', '次', '[SEP]']
    # The matching tokens_lens is [42, 6, 5, 6, 6]: the question span (including
    # [CLS] and its [SEP]) is 42 tokens, followed by four header columns of
    # 6, 5, 6 and 6 tokens.
    token_ids = self._convert_tokens_to_ids(tokens)  # map every token to its vocabulary id
    segment_ids = [0] * len(token_ids)  # a list of zeros: every token is assigned to segment 0
    header_indices = np.cumsum(tokens_lens)  # cumulative sums mark where each span ends, so
    # header_indices[:-1] ([42, 48, 53, 59] here) are the start positions of the header columns
    return token_ids, segment_ids, header_indices[:-1]


token_dict = load_vocabulary(paths.vocab)  # the vocabulary is loaded here
query_tokenizer = QueryTokenizer(token_dict)

print('QueryTokenizer\n')
print('Input Question:\n{}\n'.format(sample_query.question))
print('Input Header:\n{}\n'.format(sample_query.table.header))
print('Output Tokens:\n{}\n'.format(' '.join(
    query_tokenizer.tokenize(sample_query)[0])))
print('Output token_ids:\n{}\nOutput segment_ids:\n{}\nOutput header_ids:\n{}'.
      format(*query_tokenizer.encode(sample_query)))


class SqlLabelEncoder:
    """
    Convert SQL object into training labels.
    """
                   '--BERT_MODEL',
                   default=CONFIG.bert_model,
                   help='BERT model ckpt')
parse.add_argument('-v', '--VOCAB', default=CONFIG.vocab, help='BERT vocabulary')
parse.add_argument('-m', '--MODEL_PATH', default=CONFIG.bert_model, help='model save path')
parse.add_argument('-lg', '--LOG_PATH', default=CONFIG.bert_model, help='training log path')
args = parse.parse_args()

# load the BERT tokenizer
token_dict = load_vocabulary(vocab_path=args.VOCAB)
tokenizer = Tokenizer(token_dict=token_dict)


def data_padding(data, padding=0):
    """
    Pad sequences in a batch to the same length.
    :param data: list of id sequences
    :param padding: padding value
    :return: array with every row padded to the batch maximum length
    """
    data_len = [len(d) for d in data]
    M_L = max(data_len)
    return np.array([
        np.concatenate([d, (M_L - len(d)) * [padding]]) if len(d) < M_L else d
        for d in data
def pretrain_model(opt_filepath, data_dir, gpu_id):
    os.environ['CUDA_VISIBLE_DEVICES'] = gpu_id
    # gpus = tf.config.experimental.list_physical_devices('GPU')
    # tf.config.experimental.set_memory_growth(gpus[0], True)

    token_dict = load_vocabulary(dict_path)
    token_list = list(token_dict.keys())

    # if not os.path.exists(os.path.join(data_dir, 'pretrain_X.npy')):
    df = pd.read_csv(os.path.join(data_dir, 'task2_trainset.csv'), dtype=str)
    df_2 = pd.read_csv(os.path.join(data_dir, 'task2_public_testset.csv'), dtype=str)
    abstract_1 = df.values[:, 2]
    abstract_2 = df_2.values[:, 2]
    tokenizer = Tokenizer(token_dict)
    X_1 = collect_inputs(abstract_1, tokenizer)
    X_2 = collect_inputs(abstract_2, tokenizer)
    X = np.array(X_1 + X_2)
    # np.save(os.path.join(data_dir, 'pretrain_X.npy'), X)
    # else:
    #     X = np.load(os.path.join(data_dir, 'pretrain_X.npy'))
    print(X.shape)

    model = load_trained_model_from_checkpoint(config_path,
                                               checkpoint_path,
                                               training=True,
                                               trainable=get_layers_name(range(12, 25)),
                                               seq_len=512)
    compile_model(model)

    def _generator(batch_size=4):
        while True:
            idx = np.random.permutation(X.shape[0])  # shuffle each epoch
            for i in range(0, idx.shape[0], batch_size):
                yield gen_batch_inputs(X[idx[i:i + batch_size]],
                                       token_dict,
                                       token_list,
                                       seq_len=512,
                                       mask_rate=0.3)

    checkpoint = ModelCheckpoint(opt_filepath,
                                 monitor='val_loss',
                                 verbose=1,
                                 save_best_only=True,
                                 mode='min',
                                 save_weights_only=True)

    trainable_layer = list(range(12 * 8, 19 * 8, 8))
    batch_size = [3] * 3 + [3] * 3
    for i, layer_i in enumerate(trainable_layer):
        for j, layer in enumerate(model.layers):
            if j >= layer_i:
                layer.trainable = True
                print(layer.name, layer.trainable)
            else:
                layer.trainable = False
        compile_model(model)
        if os.path.exists(opt_filepath):
            model.load_weights(opt_filepath)
        es = EarlyStopping(monitor='val_loss', patience=20)
        reduce_lr = ReduceLROnPlateau(factor=0.7, patience=4, verbose=1, min_lr=1e-6)
        callbacks_list = [checkpoint, es, reduce_lr]
        model.fit_generator(generator=_generator(batch_size[i]),
                            steps_per_epoch=500,
                            epochs=5000,
                            validation_data=_generator(),
                            validation_steps=200,
                            callbacks=callbacks_list)
def main():
    args = get_args()

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    np.random.seed(args.seed)

    if args.verbose:
        log.basicConfig(level=log.DEBUG, stream=sys.stdout)
    else:
        log.basicConfig(level=log.INFO, stream=sys.stdout)
    log.info('\n' + tabulate(sorted(vars(args).items())))
    set_logger(os.path.join(args.output_dir, args.log_file))

    pick_device()

    data = load_instances(args.dataset, args.label_col)
    classes = list(sorted(set(data[args.label_col])))
    args.n_classes = len(classes)

    token_dict = load_vocabulary(args.vocab_file)
    tokenizer = Tokenizer(token_dict)

    if args.do_train:
        folds = [i for i in args.train_dataset.split(',')]
        train_df = data[data['fold'].isin(folds)].reset_index(drop=True)
        train_generator = TextDataFrameIterator(
            dataframe=train_df,
            tokenizer=tokenizer,
            classes=classes,
            x_col=args.text_col,
            y_col=args.label_col,
            batch_size=args.batch_size,
            shuffle=True,
            seq_len=args.max_seq_length,
            seed=args.seed,
            do_lower_case=args.do_lower_case)

        folds = [i for i in args.val_dataset.split(',')]
        val_df = data[data['fold'].isin(folds)].reset_index(drop=True)
        val_generator = TextDataFrameIterator(
            dataframe=val_df,
            tokenizer=tokenizer,
            classes=classes,
            x_col=args.text_col,
            y_col=args.label_col,
            batch_size=args.batch_size,
            shuffle=False,
            seq_len=args.max_seq_length,
            do_lower_case=args.do_lower_case)

        total_steps, warmup_steps = calc_train_steps(
            num_example=len(train_df),
            batch_size=args.batch_size,
            epochs=args.epochs,
            warmup_proportion=args.warmup_proportion,
        )

        model = get_model(args)

        earlystop = callbacks.EarlyStopping(monitor='val_loss',
                                            min_delta=K.epsilon(),
                                            patience=args.earlystop,
                                            verbose=1,
                                            mode='auto')
        best_checkpoint = callbacks.ModelCheckpoint(
            os.path.join(args.output_dir, args.best_model),
            save_best_only=True,
            save_weights_only=False,
            monitor='val_loss',
            mode='min',
            verbose=1)
        csv_logger = callbacks.CSVLogger(os.path.join(args.output_dir, args.csv_logger))
        callbacks_list = [earlystop, best_checkpoint, csv_logger]

        optimizer = AdamWarmup(
            decay_steps=total_steps,
            warmup_steps=warmup_steps,
            lr=args.learning_rate,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-6,
            min_lr=1e-5,
            weight_decay=0.01,
            weight_decay_pattern=['embeddings', 'kernel', 'W1', 'W2', 'Wk', 'Wq', 'Wv', 'Wo'])
        model.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

        cw = get_class_weights(data, args.label_col, train_generator.class_indices)
        model.fit_generator(train_generator,
                            class_weight=cw,
                            use_multiprocessing=False,
                            workers=args.workers,
                            callbacks=callbacks_list,
                            epochs=args.epochs,
                            validation_data=val_generator,
                            verbose=1)

    if args.do_test:
        folds = [i for i in args.test_dataset.split(',')]
        test_df = data[data['fold'].isin(folds)].reset_index(drop=True)
        test_generator = TextDataFrameIterator(
            dataframe=test_df,
            tokenizer=tokenizer,
            classes=classes,
            x_col=args.text_col,
            y_col=args.label_col,
            batch_size=args.batch_size,
            shuffle=False,
            seq_len=args.max_seq_length,
            do_lower_case=args.do_lower_case)

        print('Load from %s' % os.path.join(args.output_dir, args.best_model))
        model = load_model(os.path.join(args.output_dir, args.best_model),
                           custom_objects=get_custom_objects())
        # model.summary()
        y_score = model.predict_generator(test_generator,
                                          use_multiprocessing=False,
                                          workers=args.workers,
                                          verbose=1)
        y_pred = np.argmax(y_score, axis=1)
        pred_df = pd.DataFrame(y_score, columns=classes)
        pred_df = pred_df.assign(predictions=[classes[lbl] for lbl in y_pred])

        y_true = test_df.loc[:, args.label_col].values
        y_pred = pred_df['predictions'].values
        report = pmetrics.classification_report(y_true, y_pred, classes=classes)
        print(report.summary())
        # print('auc', pmetrics.auc(y_true, y_score, y_column=1)[0])

        result = pd.concat([test_df, pred_df], axis=1)
        result.to_csv(os.path.join(args.output_dir, args.test_predictions), index=False)

    if args.do_predict:
        test_df = load_instances(args.pred_dataset, args.label_col)
        test_generator = TextDataFrameIterator(
            dataframe=test_df,
            tokenizer=tokenizer,
            classes=None,
            x_col=args.text_col,
            y_col=args.label_col,
            batch_size=args.batch_size,
            shuffle=False,
            seq_len=args.max_seq_length,
            do_lower_case=args.do_lower_case)

        print('Load from %s' % os.path.join(args.output_dir, args.best_model))
        model = load_model(os.path.join(args.output_dir, args.best_model),
                           custom_objects=get_custom_objects())
        # model.summary()
        y_score = model.predict_generator(test_generator,
                                          use_multiprocessing=False,
                                          workers=args.workers,
                                          verbose=1)
        y_pred = np.argmax(y_score, axis=1)
        pred_df = pd.DataFrame(y_score, columns=classes)
        pred_df = pred_df.assign(predictions=[classes[lbl] for lbl in y_pred])

        result = pd.concat([test_df, pred_df], axis=1)
        result.to_csv(os.path.join(args.output_dir, args.pred_predictions), index=False)

    if args.do_debug:
        for dataset in [args.train_dataset, args.val_dataset, args.test_dataset]:
            folds = [i for i in dataset.split(',')]
            print('folds:', folds)
            sub_df = data[data['fold'].isin(folds)]
            generator = TextDataFrameIterator(
                dataframe=sub_df,
                tokenizer=tokenizer,
                x_col=args.text_col,
                y_col=args.label_col,
                batch_size=args.batch_size,
                shuffle=False,
                seq_len=args.max_seq_length,
            )
            for i, ([tokens, _], labels) in enumerate(generator):
                print(tokens.shape, type(tokens), labels.shape, type(labels))
                if i == 2:
                    break
def construct_model(paths, use_multi_gpus=True):
    token_dict = load_vocabulary(paths.vocab)
    query_tokenizer = QueryTokenizer(token_dict)

    num_sel_agg = len(SQL.agg_sql_dict) + 1
    num_cond_op = len(SQL.op_sql_dict) + 1
    num_cond_conn_op = len(SQL.conn_sql_dict)

    bert_model = load_trained_model_from_checkpoint(paths.config,
                                                    paths.checkpoint,
                                                    seq_len=None)
    for l in bert_model.layers:
        l.trainable = True

    inp_token_ids = Input(shape=(None, ), name='input_token_ids', dtype='int32')
    inp_segment_ids = Input(shape=(None, ), name='input_segment_ids', dtype='int32')
    inp_header_ids = Input(shape=(None, ), name='input_header_ids', dtype='int32')
    inp_header_mask = Input(shape=(None, ), name='input_header_mask')

    x = bert_model([inp_token_ids, inp_segment_ids])  # (None, seq_len, 768)

    x_for_cond_conn_op = Lambda(lambda x: x[:, 0])(x)  # (None, 768)
    p_cond_conn_op = Dense(num_cond_conn_op,
                           activation='softmax',
                           name='output_cond_conn_op')(x_for_cond_conn_op)

    x_for_header = Lambda(seq_gather,
                          name='header_seq_gather')([x, inp_header_ids])  # (None, h_len, 768)
    header_mask = Lambda(lambda x: K.expand_dims(x, axis=-1))(inp_header_mask)  # (None, h_len, 1)

    x_for_header = Multiply()([x_for_header, header_mask])
    x_for_header = Masking()(x_for_header)

    p_sel_agg = Dense(num_sel_agg,
                      activation='softmax',
                      name='output_sel_agg')(x_for_header)

    x_for_cond_op = Concatenate(axis=-1)([x_for_header, p_sel_agg])
    p_cond_op = Dense(num_cond_op,
                      activation='softmax',
                      name='output_cond_op')(x_for_cond_op)

    label_encoder = SqlLabelEncoder()

    model = Model(
        [inp_token_ids, inp_segment_ids, inp_header_ids, inp_header_mask],
        [p_cond_conn_op, p_sel_agg, p_cond_op])

    NUM_GPUS = 2
    learning_rate = 1e-5
    if use_multi_gpus:
        print('using {} gpus'.format(NUM_GPUS))
        model = multi_gpu_model(model, gpus=NUM_GPUS)

    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=RAdam(lr=learning_rate))
    return model, query_tokenizer
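An inference sketch for this three-headed model, reusing QueryTokenizer.encode from the earlier snippet on this page; the batch of one and the all-ones header mask are illustrative assumptions:

import numpy as np

model, query_tokenizer = construct_model(paths, use_multi_gpus=False)
token_ids, segment_ids, header_ids = query_tokenizer.encode(sample_query)
header_mask = [1] * len(header_ids)   # no padded header slots in a batch of one
p_cond_conn_op, p_sel_agg, p_cond_op = model.predict([
    np.array([token_ids]),
    np.array([segment_ids]),
    np.array([header_ids]),
    np.array([header_mask]),
])
# p_cond_conn_op: (1, num_cond_conn_op); p_sel_agg: (1, h_len, num_sel_agg);
# p_cond_op: (1, h_len, num_cond_op)
print(p_cond_conn_op.shape, p_sel_agg.shape, p_cond_op.shape)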