import tensorflow as tf
from transformers import DistilBertConfig, TFDistilBertModel


def extract_embeddings_for_other_clf():
    """Build a classifier on top of the frozen DistilBERT [CLS] embedding."""
    distil_bert = "distilbert-base-uncased"

    config = DistilBertConfig(dropout=0.2, attention_dropout=0.2)
    config.output_hidden_states = False
    transformer_model = TFDistilBertModel.from_pretrained(distil_bert, config=config)

    input_ids_in = tf.keras.layers.Input(shape=(25,), name="input_token", dtype="int32")
    input_masks_in = tf.keras.layers.Input(shape=(25,), name="masked_token", dtype="int32")

    embedding_layer = transformer_model(input_ids_in, attention_mask=input_masks_in)[0]
    cls_token = embedding_layer[:, 0, :]  # representation of the [CLS] token

    X = tf.keras.layers.BatchNormalization()(cls_token)
    X = tf.keras.layers.Dense(192, activation="relu")(X)
    X = tf.keras.layers.Dropout(0.2)(X)
    X = tf.keras.layers.Dense(3, activation="softmax")(X)

    model = tf.keras.Model(inputs=[input_ids_in, input_masks_in], outputs=X)

    # Freeze the two input layers and the DistilBERT layer - only the head is trained
    for layer in model.layers[:3]:
        layer.trainable = False

    return model
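# Hedged usage sketch (not from the original source): tokenize a few example strings
# with DistilBertTokenizerFast and fit the frozen-embedding classifier above. The
# texts, labels, and training hyper-parameters are illustrative assumptions only;
# max_length=25 matches the Input shape used in the function.
import numpy as np
import tensorflow as tf
from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

texts = ["great product", "terrible support", "it was okay"]  # hypothetical data
labels = np.array([2, 0, 1])                                  # hypothetical 3-class labels

enc = tokenizer(texts, padding="max_length", truncation=True,
                max_length=25, return_tensors="np")

clf = extract_embeddings_for_other_clf()
clf.compile(optimizer=tf.keras.optimizers.Adam(1e-4),
            loss="sparse_categorical_crossentropy",
            metrics=["accuracy"])
clf.fit([enc["input_ids"], enc["attention_mask"]], labels, epochs=2, batch_size=2)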
def create_model(model_config: CommentClassifierConfig,
                 saved_weights_path: str = None,
                 max_seq_length: int = MAX_SEQ_LENGTH) -> tf.keras.Model:
    """
    :param model_config: CommentClassifierConfig
    :param saved_weights_path: If defined, model weights will be loaded
        from the provided checkpoint path
    :param max_seq_length: Maximum length of the tokenized input to BERT
    :return: Model for text classification using DistilBert transformers
    """
    # Load pre-trained DistilBERT
    bert_config = DistilBertConfig(
        dropout=model_config.bert_dropout,
        attention_dropout=model_config.bert_attention_dropout,
        num_labels=NUM_CLASSES)
    bert_config.output_hidden_states = False
    transformer_model = TFDistilBertModel.from_pretrained(MODEL_NAME, config=bert_config)

    input_ids_in = tf.keras.layers.Input(shape=(max_seq_length,),
                                         name='input_token', dtype='int32')
    input_masks_in = tf.keras.layers.Input(shape=(max_seq_length,),
                                           name='masked_token', dtype='int32')

    embedding_layer = transformer_model(input_ids_in, attention_mask=input_masks_in)[0]
    x = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(
            model_config.lstm_units,
            return_sequences=True,
            dropout=model_config.lstm_dropout,
            recurrent_dropout=model_config.lstm_recurrent_dropout))(embedding_layer)
    x = tf.keras.layers.GlobalMaxPool1D()(x)
    x = tf.keras.layers.Dense(
        model_config.hidden_layer_dim,
        activation=model_config.hidden_layer_activation)(x)
    x = tf.keras.layers.Dropout(model_config.final_layer_dropout)(x)
    x = tf.keras.layers.Dense(
        NUM_CLASSES, activation=model_config.final_layer_activation)(x)

    model = tf.keras.Model(inputs=[input_ids_in, input_masks_in], outputs=x)

    # Use transfer learning only - do not train BERT again
    for layer in model.layers[:3]:
        layer.trainable = False

    # Load weights from a checkpoint, but allow partial matching
    # (e.g. due to a change in the optimizer)
    if saved_weights_path is not None:
        model.load_weights(saved_weights_path).expect_partial()

    return model
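# Hedged sketch (assumption, not the original project's definitions): one plausible
# shape for CommentClassifierConfig and for the module constants that create_model
# relies on, followed by building and compiling the model. In the original module
# these definitions would sit above (or be imported before) create_model. Field
# names mirror the attributes accessed in the function; the values are illustrative.
from dataclasses import dataclass

import tensorflow as tf

MODEL_NAME = "distilbert-base-uncased"   # assumed checkpoint
NUM_CLASSES = 3                          # assumed number of target classes
MAX_SEQ_LENGTH = 128                     # assumed maximum token length


@dataclass
class CommentClassifierConfig:
    bert_dropout: float = 0.2
    bert_attention_dropout: float = 0.2
    lstm_units: int = 50
    lstm_dropout: float = 0.1
    lstm_recurrent_dropout: float = 0.1
    hidden_layer_dim: int = 50
    hidden_layer_activation: str = "relu"
    final_layer_dropout: float = 0.2
    final_layer_activation: str = "softmax"


model = create_model(CommentClassifierConfig())
model.compile(optimizer=tf.keras.optimizers.Adam(2e-5),
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])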
import tensorflow as tf
from transformers import DistilBertConfig, TFDistilBertForSequenceClassification


def classify_with_pre_trained():
    """Wrap the pre-trained DistilBERT sequence-classification head in a Keras model."""
    distil_bert = "distilbert-base-uncased"
    # model = "neuralmind/bert-base-portuguese-cased"

    config = DistilBertConfig(num_labels=3)
    config.output_hidden_states = False
    transformer_model = TFDistilBertForSequenceClassification.from_pretrained(
        distil_bert, config=config)

    input_ids = tf.keras.layers.Input(shape=(128,), name="input_token", dtype="int32")
    input_masks_ids = tf.keras.layers.Input(shape=(128,), name="masked_token", dtype="int32")

    # The sequence-classification head returns the class logits as its first output
    X = transformer_model(input_ids, attention_mask=input_masks_ids)[0]

    model = tf.keras.Model(inputs=[input_ids, input_masks_ids], outputs=X)
    return model
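# Hedged usage note (assumption): because the sequence-classification head above
# returns raw logits rather than softmax probabilities, the loss should be built
# with from_logits=True. The optimizer settings are illustrative only.
import tensorflow as tf

model = classify_with_pre_trained()
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"])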
from tensorflow import keras
from transformers import DistilBertConfig, TFDistilBertModel


def create_model(max_seq_len, classes):
    config = DistilBertConfig(dropout=0.2, attention_dropout=0.2)
    config.output_hidden_states = False
    tfm = TFDistilBertModel.from_pretrained('./MODEL/uncased/', config=config)

    input_ids = keras.layers.Input(shape=(max_seq_len,), dtype='int32', name="input_ids")
    bert_output = tfm(input_ids)[0]

    # Take the [CLS] token representation as the sentence embedding
    cls_out = keras.layers.Lambda(lambda seq: seq[:, 0, :])(bert_output)
    cls_out = keras.layers.Dropout(0.5)(cls_out)

    # Stack of dense layers; each one feeds the next
    logits = keras.layers.Dense(units=768, activation="tanh")(cls_out)
    logits = keras.layers.Dropout(0.5)(logits)
    logits = keras.layers.Dense(units=512, activation="tanh")(logits)
    logits = keras.layers.Dropout(0.5)(logits)
    logits = keras.layers.Dense(units=256, activation="tanh")(logits)
    logits = keras.layers.Dropout(0.5)(logits)
    logits = keras.layers.Dense(units=len(classes), activation="softmax")(logits)

    model = keras.Model(inputs=input_ids, outputs=logits)
    model.build(input_shape=(None, max_seq_len))
    return model
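# Hedged usage sketch (assumption): this model takes only input_ids, so padded token
# ids are passed directly to predict. The local path './MODEL/uncased/' above is kept
# as-is and must exist; the tokenizer checkpoint, label set, and example sentence
# below are assumptions for illustration.
import numpy as np
from transformers import DistilBertTokenizerFast

classes = ["negative", "neutral", "positive"]          # hypothetical label set
model = create_model(max_seq_len=128, classes=classes)

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
enc = tokenizer(["an example sentence"], padding="max_length",
                truncation=True, max_length=128, return_tensors="np")

probs = model.predict(enc["input_ids"])                # shape (1, len(classes))
print(classes[int(np.argmax(probs, axis=-1)[0])])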
import os

import numpy as np
import pandas as pd
import torch
from transformers import (AdamW, DistilBertForSequenceClassification,
                          get_linear_schedule_with_warmup)


def run_model(pos_train_file, neg_train_file, pos_dev_file, neg_dev_file,
              nrows_train, nrows_dev, epochs, out_dir):
    # get_data_loader_bal, get_device and train_epoch are project-level helpers
    # defined elsewhere in the codebase.
    batch_size = 16

    # Earlier versions read single balanced CSVs with _read_data and wrapped them
    # in DataLoaders with a custom collate function; balanced loaders built from
    # separate positive/negative files are used instead.
    train_dataloader = get_data_loader_bal(pos_train_file, neg_train_file,
                                           batch_size=batch_size,
                                           nrows_pos=nrows_train,
                                           nrows_neg=nrows_train,
                                           mode='train')
    dev_dataloader = get_data_loader_bal(pos_dev_file, neg_dev_file,
                                         batch_size=batch_size,
                                         nrows_pos=nrows_dev,
                                         nrows_neg=nrows_dev,
                                         mode='dev')

    device = get_device()

    model = DistilBertForSequenceClassification.from_pretrained(
        "distilbert-base-uncased",   # 6-layer DistilBERT model with an uncased vocab
        num_labels=2,                # 2 output labels for binary classification;
                                     # increase this for multi-class tasks
        output_attentions=False,     # do not return attention weights
        output_hidden_states=False,  # do not return all hidden states
    )
    model = model.to(device)

    optimizer = AdamW(
        model.parameters(),
        lr=2e-5,   # default learning rate in run_glue.py is 5e-5; 2e-5 worked better here
        eps=1e-8,  # default adam_epsilon
    )

    total_steps = len(train_dataloader) * epochs

    # Create the learning rate scheduler.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,  # default value in run_glue.py
        num_training_steps=total_steps)

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    best_score = -np.inf
    stats_vec = []
    for epoch in range(epochs):
        stats = train_epoch(model, train_dataloader, dev_dataloader,
                            optimizer, scheduler)
        print(stats)

        # Keep a checkpoint of the best model seen so far (by dev accuracy)
        if stats['accuracy'] > best_score:
            best_score = stats['accuracy']
            f = out_dir + '/' + 'best_model_ch.pt'
            torch.save({
                'epoch': epoch,
                'model': model,
                'stats': stats,
            }, f)
        stats_vec.append(stats)

    stats_vec = pd.DataFrame(stats_vec)

    # Also save the model from the last epoch
    f = out_dir + '/' + 'last_model_ch.pt'
    torch.save({
        'epoch': epoch,
        'model': model,
        'stats': stats,
    }, f)

    print(stats_vec)
    stats_vec.to_csv(out_dir + '/' + 'stats.csv')
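# Hedged sketch (assumption): train_epoch is a project helper not shown in this
# excerpt. The loop below is one plausible shape for it, assuming each batch is a
# dict of tensors with 'input_ids', 'attention_mask' and 'labels' and that run_model
# only needs an 'accuracy' entry in the returned stats; the real helper may differ.
import torch


def train_epoch(model, train_dataloader, dev_dataloader, optimizer, scheduler):
    device = next(model.parameters()).device

    model.train()
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()
        outputs = model(input_ids=batch['input_ids'],
                        attention_mask=batch['attention_mask'],
                        labels=batch['labels'])
        loss = outputs[0]                       # classification loss
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    # Evaluate on the dev set; run_model uses 'accuracy' to pick the best checkpoint
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for batch in dev_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            logits = model(input_ids=batch['input_ids'],
                           attention_mask=batch['attention_mask'])[0]
            preds = logits.argmax(dim=-1)
            correct += (preds == batch['labels']).sum().item()
            total += batch['labels'].size(0)

    return {'accuracy': correct / total}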
dataset_test = tf.data.Dataset.from_tensor_slices((Xids_test, Xmask_test))

def map_func(input_ids, mask):
    return {'input_ids': input_ids, 'attention_mask': mask}

dataset_test = dataset_test.map(map_func)
dataset_test = dataset_test.batch(32).prefetch(1000)

# Build the model
from transformers import TFDistilBertModel, DistilBertConfig

distil_bert = 'distilbert-base-uncased'

config = DistilBertConfig(dropout=0.2, attention_dropout=0.2)
config.output_hidden_states = False
transformer_model = TFDistilBertModel.from_pretrained(distil_bert, config=config)

input_ids_in = tf.keras.layers.Input(shape=(SEQ_length,), name='input_ids', dtype='int32')
input_masks_in = tf.keras.layers.Input(shape=(SEQ_length,), name='attention_mask', dtype='int32')

embedding_layer = transformer_model(input_ids_in, attention_mask=input_masks_in)[0]
X = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(50, return_sequences=True,
                         dropout=0.1, recurrent_dropout=0.1))(embedding_layer)
X = tf.keras.layers.GlobalMaxPool1D()(X)
X = tf.keras.layers.Dense(50, activation='relu')(X)
X = tf.keras.layers.Dropout(0.2)(X)
X = tf.keras.layers.Dense(1, activation='sigmoid')(X)

model = tf.keras.Model(inputs=[input_ids_in, input_masks_in], outputs=X)

for layer in model.layers[:3]:
    layer.trainable = False
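# Hedged sketch (assumption): Xids_test and Xmask_test are produced earlier in the
# original script; they would typically come from the tokenizer, as shown below.
# SEQ_length and the example texts are illustrative assumptions. Because the dict
# keys emitted by map_func ('input_ids', 'attention_mask') match the Input layer
# names, the batched dataset can be passed straight to model.predict.
from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained(distil_bert)
enc = tokenizer(["sample review one", "sample review two"],   # hypothetical test texts
                padding="max_length", truncation=True,
                max_length=SEQ_length, return_tensors="np")
Xids_test, Xmask_test = enc["input_ids"], enc["attention_mask"]

probs = model.predict(dataset_test)     # sigmoid outputs, shape (num_examples, 1)
preds = (probs > 0.5).astype("int32")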