def tl_disbert_model(param={}):
    trainable = param['Trainable']
    max_seq_len = param['Max_length']

    inputs = Input(shape=(max_seq_len,), dtype='int64', name='inputs')
    masks = Input(shape=(max_seq_len,), dtype='int64', name='masks')

    disBert_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')
    disBert_model.trainable = trainable

    disBert_output = disBert_model(inputs, attention_mask=masks)
    disBert_last_hidden = disBert_output.last_hidden_state
    disBert_CLS_output = disBert_last_hidden[:, 0, :]

    x = Flatten()(disBert_CLS_output)
    x = LayerNormalization()(x)
    x = Dense(param['first_layer'], activation='relu')(x)
    x = Dropout(param['dropout'])(x)
    x = LayerNormalization()(x)
    x = Dense(param['second_layer'], activation='relu')(x)
    x = Dropout(param['dropout'])(x)
    probs = Dense(3, activation='softmax')(x)

    model = keras.Model(inputs=[inputs, masks], outputs=probs)
    model.summary()
    return model
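# A minimal usage sketch for tl_disbert_model. The hyperparameter values, the
# tokenizer choice, and the compile settings below are illustrative assumptions,
# not part of the original code.
from transformers import DistilBertTokenizerFast

# Hypothetical hyperparameters, for illustration only
param = {'Trainable': False, 'Max_length': 128,
         'first_layer': 256, 'second_layer': 64, 'dropout': 0.2}

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
enc = tokenizer(["an example sentence"], padding='max_length', truncation=True,
                max_length=param['Max_length'], return_tensors='np')

model = tl_disbert_model(param)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
probs = model.predict([enc['input_ids'], enc['attention_mask']])  # shape (1, 3)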
def _create_sentence_transformer(self, input_shape):
    input_ids = tf.keras.Input(shape=input_shape, name='input_ids', dtype=tf.int32)
    attention_mask = tf.keras.Input(shape=input_shape, name='attention_mask', dtype=tf.int32)

    transformer_model = TFDistilBertModel.from_pretrained(self.model_name, config=self.model_config)
    word_embedding_layer = transformer_model([input_ids, attention_mask])[0]
    sentence_embedding_layer = PoolingLayer(pooling_type="mean")([word_embedding_layer, attention_mask])

    return tf.keras.Model([input_ids, attention_mask], sentence_embedding_layer)
def build_model():
    """
    This model is built upon the DistilBERT model taken from
    Hugging Face's Transformers library.
    The model has to be compiled before weight loading.
    """
    pretrained_model = TFDistilBertModel.from_pretrained(
        'distilbert-base-uncased', output_attentions=False
    )

    input_ids = tf.keras.layers.Input(
        shape=(None,), dtype=tf.int32, name='input_ids_pl'
    )
    attention_mask = tf.keras.layers.Input(
        shape=(None,), dtype=tf.int32, name='attention_mask_pl'
    )

    # get the output of the '[CLS]' token on the last layer
    bert_output = pretrained_model(
        {'input_ids': input_ids, 'attention_mask': attention_mask},
        return_dict=True
    )['last_hidden_state'][:, 0]

    pre_classification = tf.keras.layers.Dense(128, activation='tanh')(bert_output)
    dropout_1 = tf.keras.layers.Dropout(0.3)(pre_classification)
    classification_output = tf.keras.layers.Dense(2, activation='softmax')(dropout_1)

    model = tf.keras.models.Model(
        inputs=[input_ids, attention_mask],
        outputs=classification_output
    )
    return model
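# A possible calling sequence for build_model, following the docstring's note
# that the model must be compiled before weights are loaded. The tokenizer,
# optimizer settings, and the checkpoint path are assumptions for illustration.
from transformers import DistilBertTokenizerFast

model = build_model()
model.compile(optimizer=tf.keras.optimizers.Adam(2e-5),
              loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# model.load_weights('path/to/checkpoint')  # hypothetical checkpoint path

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
enc = tokenizer(["an example sentence"], return_tensors='np', padding=True)
probs = model.predict([enc['input_ids'], enc['attention_mask']])  # shape (1, 2)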
def extract_embeddings_for_other_clf():
    distil_bert = "distilbert-base-uncased"

    config = DistilBertConfig(dropout=0.2, attention_dropout=0.2)
    config.output_hidden_states = False
    transformer_model = TFDistilBertModel.from_pretrained(distil_bert, config=config)

    input_ids_in = tf.keras.layers.Input(shape=(25,), name="input_token", dtype="int32")
    input_masks_in = tf.keras.layers.Input(shape=(25,), name="masked_token", dtype="int32")

    embedding_layer = transformer_model(input_ids_in, attention_mask=input_masks_in)[0]
    cls_token = embedding_layer[:, 0, :]
    X = tf.keras.layers.BatchNormalization()(cls_token)
    X = tf.keras.layers.Dense(192, activation="relu")(X)
    X = tf.keras.layers.Dropout(0.2)(X)
    X = tf.keras.layers.Dense(3, activation="softmax")(X)
    model = tf.keras.Model(inputs=[input_ids_in, input_masks_in], outputs=X)

    for layer in model.layers[:3]:
        layer.trainable = False

    return model
def __init__(
        self,
        pretrained_model_name_or_path='distilbert-base-uncased',
        reduce_output='sum',
        trainable=True,
        num_tokens=None,
        **kwargs
):
    super(DistilBERTEncoder, self).__init__()
    try:
        from transformers import TFDistilBertModel
    except ModuleNotFoundError:
        logger.error(
            'transformers is not installed. '
            'In order to install all text feature dependencies run '
            'pip install ludwig[text]'
        )
        sys.exit(-1)

    self.transformer = TFDistilBertModel.from_pretrained(
        pretrained_model_name_or_path
    )
    self.reduce_output = reduce_output
    self.reduce_sequence = SequenceReducer(reduce_mode=reduce_output)
    self.transformer.trainable = trainable
    self.transformer.resize_token_embeddings(num_tokens)
def run_distilibert(strategy: tf.distribute.TPUStrategy,
                    x_train: np.array,
                    x_valid: np.array,
                    _y_train: np.array,
                    y_valid: np.array,
                    train_dataset: tf.data.Dataset,
                    valid_dataset: tf.data.Dataset,
                    test_dataset: tf.data.Dataset,
                    max_len: int,
                    epochs: int,
                    batch_size: int) -> tf.keras.models.Model:
    """ create and run distilbert on training and testing data """
    logger.info('build distilbert')
    with strategy.scope():
        transformer_layer = TFDistilBertModel.from_pretrained(MODEL)
        model = build_model(transformer_layer, max_len=max_len)
    model.summary()

    # train given model
    n_steps = x_train.shape[0] // batch_size
    history = model.fit(train_dataset,
                        steps_per_epoch=n_steps,
                        validation_data=valid_dataset,
                        epochs=epochs)
    plot_train_val_loss(history, 'distilbert')

    n_steps = x_valid.shape[0] // batch_size
    _train_history_2 = model.fit(valid_dataset.repeat(),
                                 steps_per_epoch=n_steps,
                                 epochs=epochs * 2)

    scores = model.predict(test_dataset, verbose=1)
    logger.info(f"AUC: {roc_auc(scores, y_valid):.4f}")
    return model
def distilbert_model(input_shape,
                     transformer_model,
                     output_shape=96,
                     output_activation='softmax',
                     optimizer='Adam',
                     optimizer_params={'lr': 1e-5},
                     loss='categorical_crossentropy',
                     metrics=None):
    input_ids = Input((input_shape,), dtype=tf.int32)
    input_mask = Input((input_shape,), dtype=tf.int32)

    transformer_encoder = TFDistilBertModel.from_pretrained(
        transformer_model, from_pt=True, output_hidden_states=True)
    outputs = transformer_encoder.distilbert(input_ids, attention_mask=input_mask)

    x = outputs[0]
    x = GlobalAveragePooling1D()(x)
    output = Dense(output_shape, activation=output_activation)(x)

    model = Model(inputs=[input_ids, input_mask], outputs=output)
    model.compile(loss=loss,
                  metrics=metrics,
                  optimizer=getattr(optimizers, optimizer)(**optimizer_params))
    return model
def create_model(model_config: CommentClassifierConfig,
                 saved_weights_path: str = None,
                 max_seq_length: int = MAX_SEQ_LENGTH) -> tf.keras.Model:
    """
    :param model_config: CommentClassifierConfig
    :param saved_weights_path: If defined, model weights will be loaded
                               from the provided checkpoint path
    :param max_seq_length: Maximum length of the tokenized input to BERT
    :return: Model for text classification using DistilBert transformers
    """
    # Load pre-trained DistilBERT
    bert_config = DistilBertConfig(
        dropout=model_config.bert_dropout,
        attention_dropout=model_config.bert_attention_dropout,
        num_labels=NUM_CLASSES)
    bert_config.output_hidden_states = False
    transformer_model = TFDistilBertModel.from_pretrained(MODEL_NAME, config=bert_config)

    input_ids_in = tf.keras.layers.Input(shape=(max_seq_length,),
                                         name='input_token', dtype='int32')
    input_masks_in = tf.keras.layers.Input(shape=(max_seq_length,),
                                           name='masked_token', dtype='int32')

    embedding_layer = transformer_model(input_ids_in, attention_mask=input_masks_in)[0]
    x = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(
            model_config.lstm_units,
            return_sequences=True,
            dropout=model_config.lstm_dropout,
            recurrent_dropout=model_config.lstm_recurrent_dropout))(embedding_layer)
    x = tf.keras.layers.GlobalMaxPool1D()(x)
    x = tf.keras.layers.Dense(
        model_config.hidden_layer_dim,
        activation=model_config.hidden_layer_activation)(x)
    x = tf.keras.layers.Dropout(model_config.final_layer_dropout)(x)
    x = tf.keras.layers.Dense(
        NUM_CLASSES, activation=model_config.final_layer_activation)(x)

    model = tf.keras.Model(inputs=[input_ids_in, input_masks_in], outputs=x)

    # Use transfer learning only - do not train BERT again
    for layer in model.layers[:3]:
        layer.trainable = False

    # Load weights from a checkpoint, but allow partial matching
    # (e.g. due to a change in the optimizer)
    if saved_weights_path is not None:
        model.load_weights(saved_weights_path).expect_partial()

    return model
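# A sketch of how create_model might be invoked. The CommentClassifierConfig
# field values below, as well as the module-level constants, are assumptions
# standing in for the project's own definitions.
MODEL_NAME = 'distilbert-base-uncased'   # hypothetical stand-in
MAX_SEQ_LENGTH = 128                     # hypothetical stand-in
NUM_CLASSES = 2                          # hypothetical stand-in

model_config = CommentClassifierConfig(
    bert_dropout=0.1, bert_attention_dropout=0.1,
    lstm_units=64, lstm_dropout=0.1, lstm_recurrent_dropout=0.0,
    hidden_layer_dim=128, hidden_layer_activation='relu',
    final_layer_dropout=0.2, final_layer_activation='softmax')

model = create_model(model_config)
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])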
def run_bert_meta_regression_tfmodel():
    """ Run self defined combined model."""
    timestamp = datetime.now().strftime('%Y%m%d-%H%M%S')
    log_dir = os.path.join(os.getenv('OUTPUT_DIR'), timestamp)
    model_plot = f'regression_model_{timestamp}.png'

    tokenizer = AutoTokenizer.from_pretrained(os.getenv('MODEL_NAME'))
    config = AutoConfig.from_pretrained(os.getenv('MODEL_NAME'), num_labels=1)
    distilebert_model = TFDistilBertModel.from_pretrained(
        os.getenv('MODEL_NAME'), config=config)
    print(config, tokenizer, sep='\n')
    # tf.keras.utils.plot_model(distilebert_model, to_file=model_plot, show_shapes=True)

    tc = TopCoder()
    encoded_text = tc.get_bert_encoded_txt_features(tokenizer)
    metadata = tc.get_meta_data_features(encoded_tech=True, softmax_tech=True)
    target = tc.get_target()

    split = int((4 / 5) * len(target))
    dataset = tf.data.Dataset.from_tensor_slices(
        (dict(**encoded_text, meta_input=metadata), target))
    dataset = dataset.shuffle(len(target))
    train_ds, test_ds = dataset.take(split).batch(16), dataset.skip(split).batch(8)
    print(train_ds, test_ds, sep='\n')

    # for i in train_ds.take(2):
    #     pprint(i)
    #     print()
    # for i in test_ds.take(2):
    #     pprint(i)

    # model = TCPMDistilBertRegression.from_pretrained(os.getenv('MODEL_NAME'), config=config)
    tensorboard_cb = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
    model = build_tcpm_model_distilbert_regression(distilebert_model)
    model.summary()
    model.compile(optimizer=tf.keras.optimizers.Adam(2e-6),
                  loss='mse',
                  metrics=['mae', 'mse', mre])

    history = model.fit(
        train_ds,
        epochs=12,
    )
    result = model.evaluate(
        test_ds,
        return_dict=True,
    )
    pprint(result)

    history_df = pd.DataFrame(history.history)
    history_df.to_json(os.path.join(log_dir, 'train_history.json'),
                       orient='index', indent=4)
    with open(os.path.join(log_dir, 'result.json'), 'w') as f:
        json.dump(result, f, indent=4)
def test_TFDistilBertModel(self):
    from transformers import DistilBertTokenizer, TFDistilBertModel
    pretrained_weights = 'distilbert-base-uncased'
    tokenizer = DistilBertTokenizer.from_pretrained(pretrained_weights)
    text, inputs, inputs_onnx = self._prepare_inputs(tokenizer)
    model = TFDistilBertModel.from_pretrained(pretrained_weights)
    predictions = model.predict(inputs)
    onnx_model = keras2onnx.convert_keras(model, model.name)
    self.assertTrue(
        run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx,
                         predictions, self.model_files))
def get_transformer(bert_model_type, output_hidden_states=False):
    config = get_bert_config(bert_model_type, output_hidden_states)
    if bert_model_type in [
            'bert-base-uncased', 'bert-base-cased', 'bert-large-uncased',
            'bert-large-uncased-whole-word-masking',
            'bert-large-uncased-whole-word-masking-finetuned-squad'
    ]:
        return TFBertModel.from_pretrained(BERT_MODEL_FILE[bert_model_type],
                                           config=config)
    elif bert_model_type in [
            'prod-bert-base-uncased', 'tune_bert-base-uncased_nsp'
    ]:
        return TFBertModel.from_pretrained(BERT_MODEL_FILE[bert_model_type],
                                           config=config,
                                           from_pt=True)
    elif bert_model_type in [
            'roberta-base', 'roberta-large', 'roberta-large-mnli',
            'distilroberta-base'
    ]:
        return TFRobertaModel.from_pretrained(BERT_MODEL_FILE[bert_model_type],
                                              config=config)
    elif bert_model_type in ['prod-roberta-base-cased']:
        return TFRobertaModel.from_pretrained(BERT_MODEL_FILE[bert_model_type],
                                              config=config,
                                              from_pt=True)
    elif bert_model_type in ['xlnet-base-cased']:
        return TFXLNetModel.from_pretrained(BERT_MODEL_FILE[bert_model_type],
                                            config=config)
    elif bert_model_type in [
            'albert-base-v1', 'albert-large-v1', 'albert-xlarge-v1',
            'albert-xxlarge-v1'
    ]:
        return TFAlbertModel.from_pretrained(BERT_MODEL_FILE[bert_model_type],
                                             config=config)
    elif bert_model_type in ['gpt2', 'gpt2-medium']:
        return TFGPT2Model.from_pretrained(BERT_MODEL_FILE[bert_model_type],
                                           config=config)
    elif bert_model_type in ['transfo-xl']:
        return TFTransfoXLModel.from_pretrained(
            BERT_MODEL_FILE[bert_model_type], config=config)
    elif bert_model_type in [
            'distilbert-base-uncased',
            'distilbert-base-uncased-distilled-squad'
    ]:
        return TFDistilBertModel.from_pretrained(
            BERT_MODEL_FILE[bert_model_type], config=config)
    else:
        raise ValueError(
            f'`bert_model_type` not understood: {bert_model_type}')
def model_arch_multitask():
    num_labels = 2
    bert = TFDistilBertModel.from_pretrained("distilbert-base-cased")
    dropout = tf.keras.layers.Dropout(0.4)
    answer_logits = tf.keras.layers.Dense(
        10,
        kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
        name="answer_logits",
        activation="softmax")
    classifier = tf.keras.layers.Dense(
        2,
        kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
        name="seq_logits")

    input_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int32)
    attention_mask = tf.keras.layers.Input(shape=(None,), dtype=tf.int32)

    outputs = bert([input_ids, attention_mask])
    answer_output = answer_logits(outputs[0][:, 0, :])

    sequence_output = outputs[0]
    sequence_output = dropout(sequence_output)
    logits = classifier(sequence_output)

    model = tf.keras.models.Model(inputs=[input_ids, attention_mask],
                                  outputs=[logits, answer_output])
    model.compile(loss={'seq_logits': custom_loss_logits,
                        'answer_logits': "categorical_crossentropy"},
                  optimizer=optimizer,
                  loss_weights={"answer_logits": 1.0, "seq_logits": 1.0},
                  metrics=["accuracy"])
    return model
def get_transformer(LM: bool):
    if LM:
        if model_name == 'distilbert-base-cased':
            model = TFDistilBertForMaskedLM.from_pretrained(
                'distilbert-base-cased')
        elif model_name == 'huggingface/CodeBERTa-small-v1':
            model = AutoModelWithLMHead.from_pretrained(
                'huggingface/CodeBERTa-small-v1')
            model = pt_to_tf(model, TFRobertaForMaskedLM)
    else:
        if model_name == 'distilbert-base-cased':
            model = TFDistilBertModel.from_pretrained(
                'distilbert-base-cased',
            )
        elif model_name == 'huggingface/CodeBERTa-small-v1':
            model = AutoModel.from_pretrained('huggingface/CodeBERTa-small-v1')
            model = pt_to_tf(model, TFRobertaModel)
    return model
def create_model(max_seq_len, classes):
    config = DistilBertConfig(dropout=0.2, attention_dropout=0.2)
    config.output_hidden_states = False
    tfm = TFDistilBertModel.from_pretrained('./MODEL/uncased/', config=config)

    input_ids = keras.layers.Input(shape=(max_seq_len,), dtype='int32', name="input_ids")
    bert_output = tfm(input_ids)[0]

    cls_out = keras.layers.Lambda(lambda seq: seq[:, 0, :])(bert_output)
    cls_out = keras.layers.Dropout(0.5)(cls_out)
    logits = keras.layers.Dense(units=768, activation="tanh")(cls_out)
    logits = keras.layers.Dropout(0.5)(logits)
    # chain from the previous 768-unit layer rather than cls_out,
    # so that layer is not silently dropped from the graph
    logits = keras.layers.Dense(units=512, activation="tanh")(logits)
    logits = keras.layers.Dropout(0.5)(logits)
    logits = keras.layers.Dense(units=256, activation="tanh")(logits)
    logits = keras.layers.Dropout(0.5)(logits)
    logits = keras.layers.Dense(units=len(classes), activation="softmax")(logits)

    model = keras.Model(inputs=input_ids, outputs=logits)
    model.build(input_shape=(None, max_seq_len))
    return model
def model_arch_tok_classification():
    num_labels = 2
    max_len = 128
    bert = TFDistilBertModel.from_pretrained("distilbert-base-cased")
    dropout = tf.keras.layers.Dropout(0.4)
    classifier = tf.keras.layers.Dense(
        2,
        kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
        name="seq_logits")

    question_input_ids = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32)
    question_attention_mask = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32)
    question_output = bert([question_input_ids, question_attention_mask])
    question_output = question_output[0][:, 0, :]
    question_output = tf.keras.layers.RepeatVector(max_len)(question_output)

    context_input_ids = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32)
    context_attention_mask = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32)
    outputs = bert([context_input_ids, context_attention_mask])
    sequence_output = outputs[0]

    sequence_output = tf.keras.layers.concatenate([sequence_output, question_output], axis=-1)
    sequence_output = dropout(sequence_output)
    logits = classifier(sequence_output)

    model = tf.keras.models.Model(
        inputs=[question_input_ids, question_attention_mask,
                context_input_ids, context_attention_mask],
        outputs=logits)
    model.compile(loss=custom_loss_logits, optimizer=optimizer)
    return model
def initialize_hugface_model(hugging_face_model):
    # if hugging_face_model == "xlnet":
    #     tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
    #     model = TFXLNetModel.from_pretrained('xlnet-base-cased')
    # elif hugging_face_model == "roberta":
    #     tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    #     model = TFRobertaModel.from_pretrained('roberta-base')
    # elif hugging_face_model == "ernie":
    #     tokenizer = AutoTokenizer.from_pretrained("nghuyong/ernie-2.0-en")
    #     model = TFAutoModel.from_pretrained("nghuyong/ernie-2.0-en")

    # FAST TOKENIZERS
    if hugging_face_model == "distilbert":
        tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
        model = TFDistilBertModel.from_pretrained("distilbert-base-uncased")
    elif hugging_face_model == "bert":
        tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')
        model = TFBertModel.from_pretrained('bert-base-cased')
    else:
        raise ValueError('Invalid embedding type')
    return tokenizer, model
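# A brief usage sketch for the helper above. The input sentence and tokenizer
# options are illustrative, and a transformers v4-style return_dict output
# (.last_hidden_state) is assumed.
tokenizer, model = initialize_hugface_model("distilbert")
enc = tokenizer(["an example sentence"], return_tensors="tf",
                padding=True, truncation=True)
# last_hidden_state has shape (batch_size, seq_len, 768)
hidden_states = model(enc).last_hidden_state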
valid_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_valid, y_valid))
    .batch(BATCH_SIZE)
    .cache()
    .prefetch(AUTO)
)

# test_dataset = (
#     tf.data.Dataset
#     .from_tensor_slices(x_test)
#     .batch(BATCH_SIZE)
# )

###################################################################
# LOAD MODEL
###################################################################
print("loading model ...")
with strategy.scope():
    # transformer_layer = TFAutoModel.from_pretrained(MODEL)
    transformer_layer = TFDistilBertModel.from_pretrained(MODEL)
    model = HelperFns.build_model(transformer_layer, max_len=MAX_LEN)
model.summary()

###################################################################
# TRAINING
###################################################################
print("run training ...")
n_steps = x_train.shape[0] // BATCH_SIZE
train_history = model.fit(train_dataset,
                          steps_per_epoch=n_steps,
                          validation_data=valid_dataset,
                          epochs=EPOCHS)
def build_custom_model(self, validation_data, validation_label):
    # --------------------------------------------------------------------------------
    # Input layer (token indices and attention masks)
    # --------------------------------------------------------------------------------
    input_ids = tf.keras.layers.Input(shape=(self.max_sequence_length,),
                                      dtype=tf.int32,
                                      name='input_ids')
    attention_mask = tf.keras.layers.Input((self.max_sequence_length,),
                                           dtype=tf.int32,
                                           name='attention_mask')

    # --------------------------------------------------------------------------------
    # Base layer
    # --------------------------------------------------------------------------------
    # TFBaseModelOutput.last_hidden_state has shape (batch_size, max_sequence_length, 768).
    # Each sequence has the [CLS]...[SEP] structure of shape (max_sequence_length, 768).
    # Extract the [CLS] embeddings of shape (batch_size, 768) as last_hidden_state[:, 0, :].
    # --------------------------------------------------------------------------------
    base = TFDistilBertModel.from_pretrained(self.model_name)

    # Freeze the base model weights.
    if self.freeze_pretrained_base_model:
        for layer in base.layers:
            layer.trainable = False
    base.summary()

    output = base([input_ids, attention_mask]).last_hidden_state[:, 0, :]

    # --------------------------------------------------------------------------------
    # TODO:
    # Need to verify the effect of regularizers.
    #
    # [bias regularizer]
    # It looks like bias_regularizer adjusts the ROC threshold towards 0.5.
    # Without it, the threshold of the ROC with BinaryCrossentropy loss was approx 0.02.
    # With it, the threshold of the ROC with BinaryCrossentropy loss was approx 0.6.
    # --------------------------------------------------------------------------------
    activation = "sigmoid" if self.num_labels == 1 else "softmax"
    output = tf.keras.layers.Dense(
        units=self.num_labels,
        kernel_initializer='glorot_uniform',
        # https://huggingface.co/transformers/v4.3.3/main_classes/optimizer_schedules.html#adamweightdecay-tensorflow
        # kernel_regularizer=tf.keras.regularizers.l2(l2=self.l2),
        # bias_regularizer=tf.keras.regularizers.l2(l2=self.l2),
        # activity_regularizer=tf.keras.regularizers.l2(l2=self.l2/10.0),
        activation=activation,
        name=activation)(output)

    # --------------------------------------------------------------------------------
    # Loss layer
    # --------------------------------------------------------------------------------
    if self.num_labels == 1:
        # Binary classification
        loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=False)
    else:
        # Categorical classification
        loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)

    # --------------------------------------------------------------------------------
    # Model Metrics
    # --------------------------------------------------------------------------------
    if self.num_labels == 1:
        if self.USE_METRIC_AUC:
            # ROC/AUC
            # AUC is for binary classification. Using it for categorical raises
            # "ValueError: Shapes (None, <num_classes>) and (None, 1) are incompatible"
            # because AUC expects shape (None, 1) as binary input into the loss fn.
self._metric_name = "auc" self._monitor_metric = f"val_{self._metric_name}" self._monitor_mode = 'max' self._metrics = [ tf.keras.metrics.AUC(from_logits=False, name=self._metric_name), tf.keras.metrics.Recall(name="recall"), "accuracy" ] self._callbacks = self.build_custom_model_auc_callbacks( validation_data, validation_label) else: self._metric_name = "recall" # Recall self._monitor_metric = f"val_{self._metric_name}" self._monitor_mode = 'max' self._metrics = [ tf.keras.metrics.Recall(name=self._metric_name), "accuracy" ] self._callbacks = self.build_custom_model_acc_callbacks() else: # Validation loss self._metric_name = "accuracy" self._monitor_metric = "val_loss" self._monitor_mode = 'min' # metrics=[tf.keras.metrics.Accuracy(name=metric_name)] self._metrics = [self._metric_name] self._callbacks = self.build_custom_model_acc_callbacks() # -------------------------------------------------------------------------------- # Build model # -------------------------------------------------------------------------------- # TODO: Replace TIMESTAMP with instance variable name = f"{TIMESTAMP}_{self.model_name.upper()}" self._model = tf.keras.models.Model(inputs=[input_ids, attention_mask], outputs=output, name=name) self.model.compile( # https://huggingface.co/transformers/v4.3.3/main_classes/optimizer_schedules.html#adamweightdecay-tensorflow # optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate), optimizer=transformers.AdamWeightDecay( learning_rate=self.learning_rate), loss=loss_fn, metrics=self._metrics) # -------------------------------------------------------------------------------- # Load model parameters if the saved weight file exits # -------------------------------------------------------------------------------- path_to_h5 = self.model_directory + os.path.sep + "model.h5" if os.path.isfile(path_to_h5) and os.access(path_to_h5, os.R_OK): print( f"\nloading the saved model parameters from {path_to_h5}...\n") self.model.load_weights(path_to_h5)
dataset_test = tf.data.Dataset.from_tensor_slices((Xids_test, Xmask_test))

def map_func(input_ids, mask):
    return {'input_ids': input_ids, 'attention_mask': mask}

dataset_test = dataset_test.map(map_func)
dataset_test = dataset_test.batch(32).prefetch(1000)

# Build the model
from transformers import TFDistilBertModel, DistilBertConfig

distil_bert = 'distilbert-base-uncased'

config = DistilBertConfig(dropout=0.2, attention_dropout=0.2)
config.output_hidden_states = False
transformer_model = TFDistilBertModel.from_pretrained(distil_bert, config=config)

input_ids_in = tf.keras.layers.Input(shape=(SEQ_length,), name='input_ids', dtype='int32')
input_masks_in = tf.keras.layers.Input(shape=(SEQ_length,), name='attention_mask', dtype='int32')

embedding_layer = transformer_model(input_ids_in, attention_mask=input_masks_in)[0]
X = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(50, return_sequences=True,
                         dropout=0.1, recurrent_dropout=0.1))(embedding_layer)
X = tf.keras.layers.GlobalMaxPool1D()(X)
X = tf.keras.layers.Dense(50, activation='relu')(X)
X = tf.keras.layers.Dropout(0.2)(X)
X = tf.keras.layers.Dense(1, activation='sigmoid')(X)
model = tf.keras.Model(inputs=[input_ids_in, input_masks_in], outputs=X)

for layer in model.layers[:3]:
    layer.trainable = False
data = pd.read_csv('/content/drive/My Drive/data/train_E6oV3lV.csv').sample(frac=0.3)
X_train, X_test, y_train, y_test = train_test_split(data.tweet, data.label)

model = Pipeline([
    ('vect', TfidfVectorizer(
        stop_words='english',
        # ngram_range=(1, 3)
    )),
    ('clf', SGDClassifier()),
])
model.fit(X_train, y_train)
print('tfidf f1:', f1_score(y_test, model.predict(X_test), average='binary'))

model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased',
                                          # pad_to_max_length=True,
                                          # max_length=100000
                                          )
pipe = pipeline('feature-extraction', model=model, tokenizer=tokenizer)
features = pipe(X_train.to_list(),
                # pad_to_max_length=True
                )
print()

# model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')
# tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
# pipe = pipeline('feature-extraction', model=model,
        0: 1,
        1: 2,
        2: 3,
        3: 4,
        4: 5
    },
    label2id={
        1: 0,
        2: 1,
        3: 2,
        4: 3,
        5: 4
    },
)
transformer_model = TFDistilBertModel.from_pretrained(
    "distilbert-base-uncased", config=config)

input_ids = tf.keras.layers.Input(shape=(max_seq_length,),
                                  name="input_ids",
                                  dtype="int32")
input_mask = tf.keras.layers.Input(shape=(max_seq_length,),
                                   name="input_mask",
                                   dtype="int32")

embedding_layer = transformer_model.distilbert(
    input_ids, attention_mask=input_mask)[0]
X = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(
        50,
        return_sequences=True,
        dropout=0.1,
from train import train, trainLiar, trainPoliti
from tensorflow.keras import regularizers, initializers, optimizers, callbacks
from tensorflow.keras.layers import *
import pandas as pd
import numpy as np
import keras
import tensorflow as tf
from transformers import TFDistilBertModel, DistilBertConfig

# DistilBERT
config = DistilBertConfig(dropout=0.2,
                          attention_dropout=0.2,
                          output_hidden_states=True)
dbert_model = TFDistilBertModel.from_pretrained(
    'distilbert-base-uncased', config=config)

# Define model using Keras functional API
def buildModel(seq_length, md_length, sco_length, his_length, n_output1, n_output2):
    input_ids1 = Input(shape=(seq_length,), dtype=tf.int32, name="input_ids1")
    attention_mask1 = Input(shape=(seq_length,), dtype=tf.int32, name="attention_mask1")
    input_ids2 = Input(shape=(md_length,), dtype=tf.int32, name="input_ids2")
    attention_mask2 = Input(shape=(md_length,), dtype=tf.int32, name="attention_mask2")
    score = Input(shape=(sco_length,), name="score")
    history = Input(shape=(his_length,), name="history")
import numpy as np
import pandas as pd
import tensorflow as tf
import transformers
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import re
from transformers import TFDistilBertModel, DistilBertTokenizer

model_name = 'distilbert-base-uncased'
pretrained_model = TFDistilBertModel.from_pretrained(model_name)
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

train_path = 'data\\train.csv'
test_path = 'data\\test.csv'
input_column = 'tweet'
label_column = 'label'

# ============= load dataset ===============
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# --- get classes and convert to categorical ---
label_cols = train_df[label_column].unique()
if type(label_cols) == np.ndarray:
    train_df["target"] = train_df[label_column]
    # train_df.pop(label_column)
    num_classes = label_cols.shape[0]
else:
    num_classes = len(label_column)
def feature_extracter_from_texts(self, mashup_api=None):
    """
    Features need to be extracted from the descriptions of both mashups and services;
    this is the whole text feature-extraction process of the right branch.
    If it is shared, it should be wrapped as a new model!
    :param mashup_api: None by default; non-empty only for 'HDP'/'Bert'
    :return: a wrapped model, so it can be shared by mashup and api
    """
    if self.args.text_extracter_mode in fixed_vector_modes and mashup_api is not None:
        if self.args.text_extracter_mode == 'Bert':
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            bertModel = BertModel.from_pretrained("bert-base-uncased")
            if mashup_api == 'mashup':
                if self.mashup_text_feature_extracter is None:  # not computed yet
                    mashup_texts = get_iterable_values(
                        data_repository.get_md().mashup_df,
                        'final_description',
                        return_ele_type='str')
                    dense_mashup_features = bertModel(
                        tokenizer(mashup_texts, return_tensors='tf'))
                    self.mashup_text_feature_extracter = vector_feature_extracter_from_texts(
                        'mashup', dense_mashup_features)
                return self.mashup_text_feature_extracter
            elif mashup_api == 'api':
                if self.api_text_feature_extracter is None:
                    api_texts = get_iterable_values(
                        data_repository.get_md().api_df,
                        'final_description',
                        return_ele_type='str')
                    dense_api_features = bertModel(
                        tokenizer(api_texts, return_tensors='tf'))
                    self.api_text_feature_extracter = vector_feature_extracter_from_texts(
                        'api', dense_api_features)
                return self.api_text_feature_extracter
            else:
                raise TypeError('wrong mashup_api mode!')
        else:
            if self.gd is None:
                self.gd = get_default_gd(
                    tag_times=0, mashup_only=False,
                    strict_train=True)  # process texts with gensim; tags are not added to the text
            self.gd.model_pcs(self.args.text_extracter_mode)

            if mashup_api == 'mashup':
                if self.mashup_text_feature_extracter is None:  # not computed yet
                    self.mashup_text_feature_extracter = vector_feature_extracter_from_texts(
                        'mashup', self.gd.dense_mashup_features)
                return self.mashup_text_feature_extracter
            elif mashup_api == 'api':
                if self.api_text_feature_extracter is None:
                    self.api_text_feature_extracter = vector_feature_extracter_from_texts(
                        'api', self.gd.dense_api_features)
                return self.api_text_feature_extracter
            else:
                raise TypeError('wrong mashup_api mode!')

    elif self.text_feature_extracter is None:  # not computed yet
        if 'trainable_bert' in self.args.text_extracter_mode.lower():
            self.text_feature_extracter = TFDistilBertModel.from_pretrained(
                "distilbert-base-uncased")  # layer
            if self.args.frozen_bert:
                self.text_feature_extracter.trainable = False
        else:
            text_input = Input(shape=(self.args.MAX_SEQUENCE_LENGTH,), dtype='int32')
            text_embedding_layer = self.get_text_embedding_layer()  # parameters still need to be provided externally!
            text_embedded_sequences = text_embedding_layer(text_input)  # converted to 2D
            if self.args.text_extracter_mode in ('inception', 'textCNN'):  # 2D to 3D; the third dimension is the channel
                # print(text_embedded_sequences.shape)
                text_embedded_sequences = Lambda(
                    lambda x: tf.expand_dims(x, axis=3))(
                        text_embedded_sequences)  # tf tensors and Keras tensors are different!!!
                print(text_embedded_sequences.shape)
            if self.args.text_extracter_mode == 'inception':
                x = inception_layer(
                    text_embedded_sequences,
                    self.args.embedding_dim,
                    self.args.inception_channels,
                    self.args.inception_pooling)  # inception processing
                print('built inception layer, done!')
            elif self.args.text_extracter_mode == 'textCNN':
                x = textCNN_feature_extracter_from_texts(
                    text_embedded_sequences, self.args)
            elif self.args.text_extracter_mode == 'LSTM':
                x = LSTM_feature_extracter_from_texts(
                    text_embedded_sequences, self.args)
            else:
                raise TypeError('wrong extracter!')
            # inspect the module's output features before the MLP transformation
            print('text feature after inception/textCNN/LSTM whole_model,', x)

            for FC_unit_num in self.args.inception_fc_unit_nums:
                x = Dense(FC_unit_num,
                          kernel_regularizer=l2(self.args.l2_reg))(x)  # , activation='relu'
                if self.args.inception_MLP_BN:
                    x = BatchNormalization(scale=False)(x)
                    x = PReLU()(x)
                if self.args.inception_MLP_dropout:
                    x = tf.keras.layers.Dropout(0.5)(x)
            self.text_feature_extracter = Model(
                text_input, x, name='text_feature_extracter')
    return self.text_feature_extracter