def simple_bert():
    set_seed(33)
    opt = Adam(learning_rate=2e-5)

    id_ = Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)
    mask_ = Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)
    atn_ = Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)

    config = BertConfig()
    config.output_hidden_states = False  # Set to True to obtain hidden states
    bert_model = TFBertModel.from_pretrained('bert-base-uncased', config=config)

    embedding = bert_model(id_, attention_mask=mask_, token_type_ids=atn_)[0]
    x = Bidirectional(  # the Bidirectional wrapper turns this into a BiLSTM
        LSTM(  # a unidirectional LSTM on its own
            64,
            # weight initialization
            kernel_initializer='he_normal',
            # return the output for every token; if False, only the last one
            return_sequences=True))(embedding)
    # x = Attention(128)(x)
    x = GlobalAveragePooling1D()(x)
    # x = Dropout(0.2)(x)
    # x = Dense(64, activation='relu')(embedding)
    out = Dense(len(map_label), activation='softmax')(x)

    model = Model(inputs=[id_, mask_, atn_], outputs=out)
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])
    return model
def simple_bert():
    set_seed(33)
    opt = Adam(learning_rate=2e-5)

    id_ = Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)
    mask_ = Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)
    atn_ = Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)

    config = BertConfig()
    config.output_hidden_states = False  # Set to True to obtain hidden states
    bert_model = TFBertModel.from_pretrained('bert-base-uncased', config=config)

    embedding = bert_model(id_, attention_mask=mask_, token_type_ids=atn_)[0]
    x = GlobalAveragePooling1D()(embedding)
    # x = Dropout(0.2)(x)
    # x = Dense(64, activation='relu')(embedding)
    out = Dense(len(map_label), activation='softmax')(x)

    model = Model(inputs=[id_, mask_, atn_], outputs=out)
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])
    return model
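A possible way to feed either simple_bert variant, sketched below. The tokenizer pairing follows the 'bert-base-uncased' checkpoint used above; `texts` (a list of strings), `labels` (an integer array), and the fit hyperparameters are assumptions, not part of the source.

from transformers import BertTokenizer

# Assumed data: `texts` is a list of strings, `labels` an int array.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
enc = tokenizer(texts, max_length=MAX_SEQUENCE_LENGTH, padding='max_length',
                truncation=True, return_tensors='np')

model = simple_bert()
# Input order matches the Model definition: [ids, attention mask, token types].
model.fit([enc['input_ids'], enc['attention_mask'], enc['token_type_ids']],
          labels, batch_size=16, epochs=2)  # hyperparameters are illustrative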
def get_bert_hidden(n_hidden_layers=1, bert_path=BERT_PATH):
    input_word_ids = tf.keras.layers.Input(
        (MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_word_ids')
    input_masks = tf.keras.layers.Input(
        (MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_masks')
    input_segments = tf.keras.layers.Input(
        (MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_segments')

    config = BertConfig()
    config.output_hidden_states = True
    bert_layer = TFBertModel.from_pretrained(bert_path, config=config)

    hidden_layers = bert_layer([input_word_ids, input_masks, input_segments])[-1]
    # Collect the [CLS] vector from each of the last n hidden layers.
    selected_hidden_layers = []
    for i in range(n_hidden_layers):
        layer_idx = -(i + 1)
        selected_hidden_layers.append(
            tf.reshape(hidden_layers[layer_idx][:, 0], (-1, 768)))
    if n_hidden_layers > 1:
        output_layer = tf.keras.layers.concatenate(
            inputs=selected_hidden_layers, axis=1)
    else:
        output_layer = selected_hidden_layers[0]

    bert_model = tf.keras.models.Model(
        inputs=[input_word_ids, input_masks, input_segments],
        outputs=output_layer)
    return bert_model
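Used as a feature extractor, the returned Keras model maps the three token arrays to concatenated [CLS] vectors. A minimal usage sketch; the input arrays are assumed to be int32 arrays of shape (batch, MAX_SEQUENCE_LENGTH) produced by a matching BERT tokenizer.

# Hypothetical usage: extract (batch, 2 * 768) [CLS] features from the
# last two hidden layers. `word_ids`, `masks`, `segments` are assumed inputs.
feature_model = get_bert_hidden(n_hidden_layers=2)
features = feature_model.predict([word_ids, masks, segments])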
def __init__(self, bert_path, dropout, hidden_size, output_size):
    super().__init__()
    config = BertConfig()
    config.output_hidden_states = True
    self.bert_layer = BertModel.from_pretrained(bert_path, config=config)
    self.dropout_layer = torch.nn.Dropout(dropout)
    self.linear_layer = torch.nn.Linear(hidden_size, output_size)
    self.activation = torch.nn.Sigmoid()
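This constructor only declares layers. A forward pass consistent with them might look like the following sketch (an assumption, not part of the source), routing BERT's pooled [CLS] output through dropout, the linear layer, and the sigmoid:

def forward(self, input_ids, attention_mask=None, token_type_ids=None):
    # In older transformers versions this call returns
    # (sequence_output, pooled_output, hidden_states).
    outputs = self.bert_layer(input_ids,
                              attention_mask=attention_mask,
                              token_type_ids=token_type_ids)
    pooled = outputs[1]  # pooled [CLS] representation, shape (batch, hidden_size)
    logits = self.linear_layer(self.dropout_layer(pooled))
    return self.activation(logits)  # sigmoid scores in (0, 1)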
def get_book_feature(b_isbn_embedd, b_author_embedd, b_year_embedd,
                     b_publisher_embedd, book_title_id, book_title_type_id,
                     book_title_mask):
    # First, pass each of the four features through its own Dense layer.
    b_isbn_dense = keras.layers.Dense(b_dense, activation='relu',
                                      kernel_regularizer=tf.nn.l2_loss,
                                      name='b_isbn_dense')(b_isbn_embedd)
    b_author_dense = keras.layers.Dense(b_dense, activation='relu',
                                        kernel_regularizer=tf.nn.l2_loss,
                                        name='b_author_dense')(b_author_embedd)
    b_year_dense = keras.layers.Dense(b_dense, activation='relu',
                                      kernel_regularizer=tf.nn.l2_loss,
                                      name='b_year_dense')(b_year_embedd)
    b_publisher_dense = keras.layers.Dense(
        b_dense, activation='relu', kernel_regularizer=tf.nn.l2_loss,
        name='b_publisher_dense')(b_publisher_embedd)

    # Concatenate these four features; b_combine_four shape = (?, 1, 16)
    b_combine_four = keras.layers.concatenate(
        [b_isbn_dense, b_author_dense, b_year_dense, b_publisher_dense],
        name='b_four_combine')
    print('b_combine_four.shape', b_combine_four.shape)
    b_combine_four_reshape = keras.layers.Reshape(
        [b_combine_four.shape[2]],
        name='b_combine_four_reshape')(b_combine_four)

    config = BertConfig()
    # Expose the hidden states.
    config.output_hidden_states = True
    bert_model = TFBertModel.from_pretrained(
        bert_path + 'bert-base-uncased-tf_model.h5', config=config)
    book_title_cls = bert_model(book_title_id,
                                attention_mask=book_title_mask,
                                token_type_ids=book_title_type_id)
    print(len(book_title_cls))
    print(book_title_cls[0].shape)
    print(book_title_cls[1].shape)
    book_feature_layer = keras.layers.Dense(64, activation='tanh')(
        book_title_cls[1])

    b_combine_book = keras.layers.concatenate(
        [book_feature_layer, b_combine_four_reshape], axis=1,
        name='b_combine_book')
    # Produce the book feature matrix.
    b_feature_layer = keras.layers.Dense(200, name='b_feature_layer',
                                         activation='tanh')(b_combine_book)
    return b_feature_layer
def dual_bert():
    set_seed(33)
    opt = Adam(learning_rate=2e-5)

    id1 = Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)
    id2 = Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)
    mask1 = Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)
    mask2 = Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)
    atn1 = Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)
    atn2 = Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)

    config = BertConfig()
    config.output_hidden_states = False  # Set to True to obtain hidden states
    bert_model1 = TFBertModel.from_pretrained('bert-base-uncased', config=config)
    bert_model2 = TFBertModel.from_pretrained('bert-base-uncased', config=config)

    embedding1 = bert_model1(id1, attention_mask=mask1, token_type_ids=atn1)[0]
    embedding2 = bert_model2(id2, attention_mask=mask2, token_type_ids=atn2)[0]

    x = Concatenate()([embedding1, embedding2])
    x = keras.layers.Bidirectional(  # the Bidirectional wrapper turns this into a BiLSTM
        keras.layers.LSTM(  # a unidirectional LSTM on its own
            64,
            # weight initialization
            kernel_initializer='he_normal',
            # return the output for every token; if False, only the last one
            return_sequences=True))(x)
    # x = Lambda(lambda x: x[:, 0], name='CLS-token')(x)  # reduce to the [CLS] token
    # x1 = GlobalAveragePooling1D()(embedding1)
    # x2 = GlobalAveragePooling1D()(embedding2)
    # x = Concatenate()([x1, x2])
    x = Attention(128)(x)  # add attention
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.2)(x)
    # out = Dense(len(map_label), activation='softmax')(x)
    out = Dense(5, activation='softmax')(x)

    model = Model(inputs=[id1, mask1, atn1, id2, mask2, atn2], outputs=out)
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])  # add an evaluation metric
    return model
def get_book_feature(book_title_id, book_title_type_id, book_title_mask):
    config = BertConfig()
    # Expose the hidden states.
    config.output_hidden_states = True
    bert_model = TFBertModel.from_pretrained(
        bert_path + 'bert-base-uncased-tf_model.h5', config=config)
    book_title_cls = bert_model(book_title_id,
                                attention_mask=book_title_mask,
                                token_type_ids=book_title_type_id)
    print(len(book_title_cls))
    print(book_title_cls[0].shape)
    print(book_title_cls[1].shape)
    book_feature_layer = keras.layers.Dense(100, activation='tanh')(
        book_title_cls[1])
    return book_feature_layer
def __init__(self, config: BertConfig, num_hidden_layers=None):
    super().__init__()
    self.logger = get_logger(__name__)
    config.output_hidden_states = True
    self.embeddings = BertEmbeddings(config)
    num_hidden_layers = (config.num_hidden_layers
                         if num_hidden_layers is None else num_hidden_layers)
    assert num_hidden_layers > 0, 'bert_layers must > 0'
    # Note: the output differs from the original transformers BERT encoder.
    self.output_attentions = config.output_attentions
    self.output_hidden_states = config.output_hidden_states
    layer = BertLayer(config)
    self.layer = nn.ModuleList(
        [copy.deepcopy(layer) for _ in range(num_hidden_layers)])
    self.config = config
    self.num_hidden_layers = num_hidden_layers
    self.apply(self.init_bert_weights)
def create_model1():
    q_id = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)
    a_id = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)
    q_mask = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)
    a_mask = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)
    q_atn = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)
    a_atn = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)

    config = BertConfig()
    config.output_hidden_states = False
    bert_model = TFBertModel.from_pretrained('bert-base-uncased', config=config)

    q_embedding = bert_model(q_id, attention_mask=q_mask, token_type_ids=q_atn)[0]
    a_embedding = bert_model(a_id, attention_mask=a_mask, token_type_ids=a_atn)[0]
    q = tf.keras.layers.GlobalAveragePooling1D()(q_embedding)
    a = tf.keras.layers.GlobalAveragePooling1D()(a_embedding)

    # Concatenate the pooled question and answer representations.
    x = tf.keras.layers.Concatenate()([q, a])
    x = tf.keras.layers.Reshape((1, x.shape[-1]))(x)
    cnn = tf.keras.layers.Conv1D(64, 3, padding='same', activation='relu')(x)
    cnn = tf.keras.layers.MaxPooling1D(pool_size=1, strides=2)(cnn)
    cnn = tf.keras.layers.BatchNormalization()(cnn)
    lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=64))(cnn)
    lstm = tf.keras.layers.Dropout(0.2)(lstm)
    dense = tf.keras.layers.Dense(64, activation='relu')(lstm)
    x = tf.keras.layers.Dense(TARGET_COUNT1, activation='softmax')(dense)
    x = tf.keras.layers.BatchNormalization()(x)

    model = tf.keras.models.Model(
        inputs=[q_id, q_mask, q_atn, a_id, a_mask, a_atn], outputs=x)
    return model
def create_model():
    q_id = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)
    q_mask = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)
    q_atn = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)

    config = BertConfig()
    config.output_hidden_states = False
    bert_model = TFBertModel.from_pretrained('bert-base-uncased', config=config)

    q_embedding = bert_model(q_id, attention_mask=q_mask, token_type_ids=q_atn)[0]
    q = tf.keras.layers.GlobalAveragePooling1D()(q_embedding)
    x = tf.keras.layers.Dropout(0.2)(q)
    x = tf.keras.layers.Dense(TARGET_COUNT, activation='sigmoid')(x)

    model = tf.keras.models.Model(inputs=[q_id, q_mask, q_atn], outputs=x)
    return model
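create_model returns an uncompiled graph; since the head is a TARGET_COUNT-unit sigmoid, a multi-label binary cross-entropy objective is the natural pairing. A compile-and-fit sketch follows; the loss choice, optimizer settings, and the train_* arrays are assumptions.

model = create_model()
model.compile(loss='binary_crossentropy',  # assumed multi-label objective
              optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5))
# `train_ids`, `train_masks`, `train_segments`, `train_targets` are assumed
# to be prepared elsewhere with a matching tokenizer.
model.fit([train_ids, train_masks, train_segments], train_targets,
          batch_size=16, epochs=2)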
def create_model():
    id = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)
    mask = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)
    attn = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)

    config = BertConfig()
    config.output_hidden_states = True
    bert_model = TFBertModel.from_pretrained(
        'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-tf_model.h5',
        config=config)

    _, _, hidden_states = bert_model(id, attention_mask=mask, token_type_ids=attn)
    # [CLS] vectors from the last four hidden layers.
    h12 = tf.reshape(hidden_states[-1][:, 0], (-1, 1, 768))
    h11 = tf.reshape(hidden_states[-2][:, 0], (-1, 1, 768))
    h10 = tf.reshape(hidden_states[-3][:, 0], (-1, 1, 768))
    h09 = tf.reshape(hidden_states[-4][:, 0], (-1, 1, 768))
    concat_hidden = tf.keras.layers.Concatenate(axis=2)([h12, h11, h10, h09])

    x = tf.keras.layers.GlobalAveragePooling1D()(concat_hidden)
    x = tf.keras.layers.Dropout(0.2)(x)
    x = tf.keras.layers.Dense(MAX_TARGET_LEN, activation='sigmoid')(x)

    model = tf.keras.models.Model(inputs=[id, mask, attn], outputs=x)
    return model
def __init__(
        self,
        pretrained_bert_model,
        language,
        name,
        prediction_type,
        output_hidden_states,
        output_attentions,
        attention_length_before=1,
        attention_length_after=1,
        config_path=None,
        max_length=512,
        number_of_sentence=1,
        number_of_sentence_before=0,
        number_of_sentence_after=0,
        seed=1111,
        hidden_dropout_prob=0.,
        attention_probs_dropout_prob=0.,
        stop_attention_at_sent_before=None,
        stop_attention_before_sent=0,
):
    super(BertExtractor, self).__init__()
    # Load pre-trained model tokenizer (vocabulary).
    # Crucially, do not do basic tokenization; PTB is tokenized.
    # Just do wordpiece tokenization.
    if config_path is None:
        configuration = BertConfig()
        configuration.hidden_dropout_prob = hidden_dropout_prob
        configuration.attention_probs_dropout_prob = attention_probs_dropout_prob
        configuration.output_hidden_states = output_hidden_states
        configuration.output_attentions = output_attentions
        self.model = BertModel.from_pretrained(
            pretrained_bert_model, config=configuration)
    else:
        self.model = BertModel.from_pretrained(pretrained_bert_model)

    self.tokenizer = AutoTokenizer.from_pretrained(pretrained_bert_model)
    self.language = language
    self.attention_length_before = attention_length_before
    self.attention_length_after = attention_length_after
    self.pretrained_bert_model = pretrained_bert_model
    self.NUM_HIDDEN_LAYERS = self.model.config.num_hidden_layers
    self.FEATURE_COUNT = self.model.config.hidden_size
    self.NUM_ATTENTION_HEADS = self.model.config.num_attention_heads
    self.name = name
    self.config = {
        'max_length': max_length,
        'seed': seed,
        'number_of_sentence': number_of_sentence,
        'number_of_sentence_before': number_of_sentence_before,
        'number_of_sentence_after': number_of_sentence_after,
        'attention_length_before': attention_length_before,
        'attention_length_after': attention_length_after,
        'stop_attention_at_sent_before': stop_attention_at_sent_before,
        'stop_attention_before_sent': stop_attention_before_sent,
        'output_hidden_states': output_hidden_states,
        'output_attentions': output_attentions,
        'model_type': 'bert',
        'hidden_size': self.model.config.hidden_size,
        'hidden_act': self.model.config.hidden_act,
        'initializer_range': self.model.config.initializer_range,
        'vocab_size': self.model.config.vocab_size,
        'hidden_dropout_prob': self.model.config.hidden_dropout_prob,
        'num_attention_heads': self.model.config.num_attention_heads,
        'type_vocab_size': self.model.config.type_vocab_size,
        'max_position_embeddings': self.model.config.max_position_embeddings,
        'num_hidden_layers': self.model.config.num_hidden_layers,
        'intermediate_size': self.model.config.intermediate_size,
        'attention_probs_dropout_prob':
            self.model.config.attention_probs_dropout_prob,
    }
    if config_path is not None:
        with open(config_path, 'r') as f:
            self.config.update(json.load(f))
    self.prediction_type = prediction_type  # ['sentence', 'token-level']
def run_model(pos_train_file, neg_train_file, pos_dev_file, neg_dev_file,
              nrows_train, nrows_dev, epochs, out_dir):
    batch_size = 16
    # x_train = _read_data('../data/train_bal.csv', nrows_train)
    # x_dev = _read_data('../data/dev_bal.csv', nrows_dev)
    # train_data = list(zip(x_train['comment_text'].values, x_train['target'].values))
    # train_dataloader = DataLoader(train_data, collate_fn=my_collate,
    #                               batch_size=batch_size, shuffle=True)
    # dev_data = list(zip(x_dev['comment_text'].values, x_dev['target'].values))
    # dev_dataloader = DataLoader(dev_data, collate_fn=my_collate,
    #                             batch_size=batch_size, shuffle=False)
    train_dataloader = get_data_loader_bal(pos_train_file,
                                           neg_train_file,
                                           batch_size=batch_size,
                                           nrows_pos=nrows_train,
                                           nrows_neg=nrows_train,
                                           mode='train')
    dev_dataloader = get_data_loader_bal(pos_dev_file,
                                         neg_dev_file,
                                         batch_size=batch_size,
                                         nrows_pos=nrows_dev,
                                         nrows_neg=nrows_dev,
                                         mode='dev')
    device = get_device()

    bert_hidden_states = 4
    config = BertConfig()
    config.output_hidden_states = True

    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased',         # the 12-layer BERT model with an uncased vocab
        num_labels=2,                # 2 output labels for binary classification;
                                     # increase this for multi-class tasks
        output_attentions=False,     # whether the model returns attention weights
        output_hidden_states=False,  # whether the model returns all hidden states
    )
    model = model.to(device)

    optimizer = AdamW(
        model.parameters(),
        lr=2e-5,   # args.learning_rate - default is 5e-5, our notebook had 2e-5
        eps=1e-8,  # args.adam_epsilon - default is 1e-8
    )
    total_steps = len(train_dataloader) * epochs
    # Create the learning rate scheduler.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,  # default value in run_glue.py
        num_training_steps=total_steps)

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    best_score = -np.inf
    stats_vec = []
    for epoch in range(epochs):
        stats = train_epoch(model, train_dataloader, dev_dataloader,
                            optimizer, scheduler)
        print(stats)
        if stats['accuracy'] > best_score:
            best_score = stats['accuracy']
            f = out_dir + '/' + 'best_model_ch.pt'
            torch.save({
                'epoch': epoch,
                'model': model,
                'stats': stats,
            }, f)
        stats_vec.append(stats)

    stats_vec = pd.DataFrame(stats_vec)
    f = out_dir + '/' + 'last_model_ch.pt'
    torch.save({
        'epoch': epoch,
        'model': model,
        'stats': stats,
    }, f)
    print(stats_vec)
    stats_vec.to_csv(out_dir + '/' + 'stats.csv')
def __init__(self, bert_path):
    super().__init__()
    config = BertConfig()
    config.output_hidden_states = True
    self.bert_layer = BertModel.from_pretrained(bert_path, config=config)
import numpy as np
import tensorflow as tf
from tqdm import tqdm
from transformers import BertConfig, BertTokenizer, TFBertModel

# `distil_bert` (a model name or path) is defined elsewhere in the source.
tokenizer_new = BertTokenizer.from_pretrained(distil_bert,
                                              do_lower_case=True,
                                              add_special_tokens=True,
                                              max_length=128,
                                              pad_to_max_length=True)

def tokenize_new(sentences, tokenizer_new):
    input_ids, input_masks, input_segments = [], [], []
    for sentence in tqdm(sentences):
        inputs = tokenizer_new.encode_plus(sentence,
                                           add_special_tokens=True,
                                           max_length=128,
                                           pad_to_max_length=True,
                                           return_attention_mask=True,
                                           return_token_type_ids=True)
        input_ids.append(inputs['input_ids'])
        input_masks.append(inputs['attention_mask'])
        input_segments.append(inputs['token_type_ids'])
    return (np.asarray(input_ids, dtype='int32'),
            np.asarray(input_masks, dtype='int32'),
            np.asarray(input_segments, dtype='int32'))

config1 = BertConfig(dropout=0.2, attention_dropout=0.2)
config1.output_hidden_states = False
transformer_model = TFBertModel.from_pretrained(distil_bert, config=config1)

input_ids_in = tf.keras.layers.Input(shape=(128,), name='input_token', dtype='int32')
input_masks_in = tf.keras.layers.Input(shape=(128,), name='masked_token', dtype='int32')

embedding_layer = transformer_model(input_ids_in, attention_mask=input_masks_in)[0]
X = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(50,
                         return_sequences=True,
                         dropout=0.1,
                         recurrent_dropout=0.1,
                         kernel_initializer='normal'))(embedding_layer)
X = tf.keras.layers.GlobalMaxPool1D()(X)
X = tf.keras.layers.Dense(50, activation='relu', kernel_initializer='normal')(X)
X = tf.keras.layers.Dropout(0.2)(X)
X = tf.keras.layers.Dense(1, activation='linear', kernel_initializer='normal')(X)

model1 = tf.keras.Model(inputs=[input_ids_in, input_masks_in], outputs=X)

# Freeze the first three layers (the two inputs and the BERT backbone).
for layer in model1.layers[:3]:
    layer.trainable = False
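The single linear output unit above implies a regression target; a compile step along these lines would fit. The MSE loss, learning rate, and the `sentences`/`targets` variables are assumptions, not part of the source.

model1.compile(loss='mse',  # assumed regression objective for the linear head
               optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
               metrics=['mae'])
ids, masks, segments = tokenize_new(sentences, tokenizer_new)  # `sentences` assumed
model1.fit([ids, masks], targets, batch_size=32, epochs=2)     # `targets` assumed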
from utils import DataCollatorForMLM, MLMDataset
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
# from train_args import TrainingArguments
from middle_train import Middle_Trainer
from transformers import (AutoModelForMaskedLM, BertConfig, BertTokenizer,
                          TrainingArguments)
import numpy
import torch
import torch.nn as nn
import os
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", type=int)
args = parser.parse_args()

os.environ['CUDA_VISIBLE_DEVICES'] = '1,2,3'

config = BertConfig()
config.output_hidden_states = True
config.vocab_size = 41460
model = AutoModelForMaskedLM.from_config(config)
model.bert.embeddings.word_embeddings = nn.Embedding(1839, 768, padding_idx=0)

con_tokenizer = BertTokenizer.from_pretrained('y2d1')
lab_tokenizer = BertTokenizer.from_pretrained('z2d')

data_collator = DataCollatorForMLM(tokenizer=con_tokenizer,
                                   mlm=True,
                                   mlm_probability=0.2)
train_dataset = MLMDataset(con_tokenizer=con_tokenizer,
                           lab_tokenizer=lab_tokenizer,
                           file_path='./data/trainpath')
eval_dataset = MLMDataset(con_tokenizer=con_tokenizer,
                          lab_tokenizer=lab_tokenizer,
                          file_path='./data/evalpath')

training_args = TrainingArguments(
def run_model(pos_train_file, neg_train_file, pos_dev_file, neg_dev_file,
              nrows_train, nrows_dev, epochs, out_dir, dropout=0.2,
              model='bert', batch_size=16,
              test_file='../data/test_data_clean.csv', lr=2e-5, lmda=10.0,
              stnc_emb='last'):
    device = get_device()
    bert_hidden_states = 4

    if model == 'bert':
        config = BertConfig()
        config.output_hidden_states = True
        model = BertForToxic(
            config,
            bert_hidden_states=bert_hidden_states,
            dropout=dropout,
            update_bert=True,
        )
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                                  do_lower_case=True)
    elif model == 'distilbert':
        # config = DistilBertConfig()
        config = BertConfig()
        config.output_hidden_states = True
        model = DistilBertForToxic(config,
                                   bert_hidden_states=bert_hidden_states,
                                   dropout=dropout,
                                   update_bert=True,
                                   lmda=lmda,
                                   stnc_emb=stnc_emb)
        # tokenizer = DistilBertTokenizer.from_pretrained(
        #     'distilbert-base-uncased', do_lower_case=True)
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                                  do_lower_case=True)

    train_dataloader = get_data_loader_bal(pos_train_file,
                                           neg_train_file,
                                           batch_size=batch_size,
                                           nrows_pos=nrows_train,
                                           nrows_neg=nrows_train * 10,
                                           mode='train',
                                           tokenizer=tokenizer)
    dev_dataloader = get_data_loader_bal(pos_dev_file,
                                         neg_dev_file,
                                         batch_size=batch_size,
                                         nrows_pos=nrows_dev,
                                         nrows_neg=nrows_dev,
                                         mode='dev',
                                         tokenizer=tokenizer)
    model.to(device)

    optimizer = AdamW(
        model.parameters(),
        lr=lr,     # args.learning_rate - default is 5e-5, our notebook had 2e-5
        eps=1e-8,  # args.adam_epsilon - default is 1e-8
    )
    total_steps = len(train_dataloader) * epochs
    # Create the learning rate scheduler.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,  # default value in run_glue.py
        num_training_steps=total_steps)

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    best_score = -np.inf
    stats_vec = []
    dev_pred_vec = []
    for epoch in range(epochs):
        stats, dev_pred = train_epoch(model, train_dataloader, dev_dataloader,
                                      optimizer, scheduler)
        print(epoch, stats)
        if stats['accuracy'] > best_score:
            best_score = stats['accuracy']
            f = out_dir + '/' + 'best_model_ch.pt'
            torch.save({
                'epoch': epoch,
                'model': model,
                'stats': stats,
            }, f)
        stats_vec.append(stats)
        dev_pred_vec.append(dev_pred)

    stats_vec = pd.DataFrame(stats_vec)
    dev_pred_vec = pd.concat(dev_pred_vec, axis=0)
    f = out_dir + '/' + 'last_model_ch.pt'
    torch.save({
        'epoch': epoch,
        'model': model,
        'stats': stats,
    }, f)
    print(stats_vec)
    stats_vec.to_csv(out_dir + '/' + 'stats.csv')

    out_file = out_dir + '/train_pred.csv'
    df = get_data_pred(train_dataloader, model, out_file)
    out_file = out_dir + '/dev_pred.csv'
    df = get_data_pred(dev_dataloader, model, out_file)

    test_dataloader = get_data_loader_pred(test_file, tokenizer, nrows=None)
    out_file = out_dir + '/test_pred.csv'
    df = get_data_pred(test_dataloader, model, out_file)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--task_name", default=None, type=str,
                        help="The name of the task for training.")
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument("--bert_model", default="bert-base-uncased", type=str,
                        help="student bert model configuration folder")
    parser.add_argument("--encoder_checkpoint", default=None, type=str,
                        help="checkpoint for student encoder")
    parser.add_argument("--cls_checkpoint", default=None, type=str,
                        help="checkpoint for student classifier")
    parser.add_argument("--alpha", default=0.95, type=float,
                        help="alpha for distillation")
    parser.add_argument("--T", default=10., type=float,
                        help="temperature for distillation")
    parser.add_argument("--beta", default=0.0, type=float,
                        help="weight for AT loss")
    parser.add_argument("--fc_layer_idx", default=None, type=str,
                        help="layer ids on which we will put FC layers")
    parser.add_argument("--normalize_patience", default=False,
                        help="normalize patience or not")
    parser.add_argument("--do_train", action='store_true',
                        help="do training or not")
    parser.add_argument("--do_eval", action='store_true',
                        help="do evaluation during training or not")
    parser.add_argument("--train_type", default="finetune_teacher",
                        choices=["finetune_teacher", "train_student"],
                        help="choose which to train")
    parser.add_argument("--log_every_step", default=50, type=int,
                        help="output to log every global x training steps, default is 1")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=32, type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=2e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--num_train_epochs", default=3, type=int,
                        help="Total number of training epochs to perform.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--logging_steps', type=int, default=1000,
                        help="Log every X update steps.")
    parser.add_argument('--student_hidden_layers', type=int, default=12,
                        help="number of transformer layers for student, default is None (use all layers)")
    parser.add_argument('--teacher_prediction', type=str, default=None,
                        help="teacher prediction file to guide the student's output")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument('--overwrite_output_dir', action='store_true',
                        help="Overwrite the content of the output directory")
    args = parser.parse_args()

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
    logger.info('actual batch size on all GPU = %d' % args.train_batch_size)

    if args.train_type == 'finetune_teacher':
        args.student_hidden_layers = 12 if 'base' in args.bert_model else 24
        args.alpha = 0.0  # alpha = 0 is equivalent to fine-tuning for KD
    elif args.train_type == "train_student":
        args.student_hidden_layers = 6
        args.kd_model = "kd.cls"
        args.alpha = 0.7
        args.beta = 500
        args.T = 10
        args.fc_layer_idx = "1,3,5,7,9"  # this is for pkd-skip
        args.normalize_patience = True
    else:
        raise ValueError("please pick train_type from finetune_teacher, train_student")

    if args.encoder_checkpoint is None:
        args.encoder_checkpoint = os.path.join(args.bert_model, 'pytorch_model.bin')
        logger.info('encoder checkpoint not provided, use pre-trained at %s instead'
                    % args.encoder_checkpoint)

    if args.do_train:
        # Create output directory if needed
        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)
    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome.".format(args.output_dir))

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = torch.cuda.device_count()
    # args.n_gpu = 1
    logger.info("device: {} n_gpu: {}".format(args.device, args.n_gpu))

    # set seed
    set_seed(args)

    # prepare task
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    args.num_labels = len(label_list)

    # prepare tokenizer and model
    config = BertConfig()
    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=True)
    config.output_hidden_states = True

    encoder = BertForSequenceClassificationEncoder(
        config, num_hidden_layers=args.student_hidden_layers)
    classifier = FCClassifierForSequenceClassification(
        config, args.num_labels, config.hidden_size, 0)

    n_student_layer = len(encoder.bert.encoder.layer)
    encoder = load_model(encoder, args.encoder_checkpoint, args, 'student',
                         verbose=True)
    logger.info('*' * 77)
    classifier = load_model(classifier, args.cls_checkpoint, args, 'classifier',
                            verbose=True)

    n_param_student = count_parameters(encoder) + count_parameters(classifier)
    logger.info('number of layers in student model = %d' % n_student_layer)
    logger.info('num parameters in student model are %d' % n_param_student)

    # Training
    if args.do_train:
        read_set = 'train'
        if args.train_type == "train_student":
            assert args.teacher_prediction is not None
            assert args.alpha > 0
            logger.info("loading teacher's prediction")
            teacher_predictions = pickle.load(
                open(args.teacher_prediction, 'rb'))['train'] \
                if args.teacher_prediction is not None else None
            logger.info('teacher acc = %.2f, teacher loss = %.5f' % (
                teacher_predictions['acc'] * 100, teacher_predictions['loss']))
            train_examples, train_dataloader, _ = get_task_dataloader(
                args, read_set, tokenizer, SequentialSampler,
                batch_size=args.train_batch_size,
                knowledge=teacher_predictions['pred_logit'],
                extra_knowledge=teacher_predictions['feature_maps'])
        else:
            assert args.alpha == 0
            logger.info("running teacher fine-tuning")
            train_examples, train_dataloader, _ = get_task_dataloader(
                args, read_set, tokenizer, SequentialSampler,
                batch_size=args.train_batch_size)

        global_step, tr_loss = train(args, train_dataloader, encoder,
                                     classifier, tokenizer)

        #################
        # information of teacher model (like [CLS])
        #################
        if args.train_type == "finetune_teacher":
            all_res = {'train': None}
            encoder_file = os.path.join(
                args.output_dir,
                f'{args.train_type}_epoch{args.num_train_epochs-1}.encoder.pkl')
            cls_file = os.path.join(
                args.output_dir,
                f'{args.train_type}_epoch{args.num_train_epochs-1}.cls.pkl')
            print("encoder_file")
            encoder = BertForSequenceClassificationEncoder(
                config, num_hidden_layers=args.student_hidden_layers)
            classifier = FCClassifierForSequenceClassification(
                config, args.num_labels, config.hidden_size, 0)
            encoder = load_model(encoder, encoder_file, args, 'exact', verbose=True)
            classifier = load_model(classifier, cls_file, args, 'exact', verbose=True)

            train_res = eval_model_dataloader(encoder, classifier,
                                              train_dataloader, args.device,
                                              detailed=True, verbose=False)
            all_res['train'] = train_res

            logger.info('saving teacher results')
            fname = os.path.join(
                args.output_dir,
                args.task_name + f'_teacher_{args.student_hidden_layers}layer_information.pkl')
            with open(fname, 'wb') as fp:
                pickle.dump(all_res, fp)

        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Evaluation
    if args.do_eval:
        test_examples, test_dataloader, test_label_ids = get_task_dataloader(
            args, 'dev', tokenizer, SequentialSampler,
            batch_size=args.eval_batch_size)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(test_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        result = evaluate(args, test_label_ids, encoder, classifier,
                          test_dataloader)

        output_test_file = os.path.join(args.output_dir, "test_results_" + '.txt')
        with open(output_test_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
    return