def test_TFDistilBertForTokenClassification(self):
    from transformers import DistilBertTokenizer, TFDistilBertForTokenClassification
    pretrained_weights = 'distilbert-base-uncased'
    tokenizer = DistilBertTokenizer.from_pretrained(pretrained_weights)
    text, inputs, inputs_onnx = self._prepare_inputs(tokenizer)
    model = TFDistilBertForTokenClassification.from_pretrained(pretrained_weights)
    predictions = model.predict(inputs)
    onnx_model = keras2onnx.convert_keras(model, model.name)
    self.assertTrue(
        run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx,
                         predictions, self.model_files))
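# Not part of the test class above: a minimal, self-contained sketch of saving the
# converted model and running it directly with onnxruntime. It assumes onnxruntime is
# installed, that the exported graph's input names line up with the tokenizer's keys
# ("input_ids", "attention_mask"), possibly with a ":0" suffix, and the file name is
# illustrative.
import numpy as np
import onnx
import onnxruntime as ort
import keras2onnx
from transformers import DistilBertTokenizer, TFDistilBertForTokenClassification

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = TFDistilBertForTokenClassification.from_pretrained('distilbert-base-uncased')

onnx_model = keras2onnx.convert_keras(model, model.name)
onnx.save(onnx_model, "distilbert_token_cls.onnx")

sess = ort.InferenceSession("distilbert_token_cls.onnx")
encoded = tokenizer("Hello, my dog is cute", return_tensors="np")
# Build the feed dict from the session's declared inputs, stripping any ":0" suffix
# and casting to int32, which is what the TF-exported graph expects.
feed = {inp.name: encoded[inp.name.split(":")[0]].astype(np.int32)
        for inp in sess.get_inputs()}
logits = sess.run(None, feed)[0]
print(logits.shape)  # (1, sequence_length, num_labels)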
# val_labels = val_tags
import tensorflow as tf

train_encodings.pop("offset_mapping")  # we don't want to pass this to the model
val_encodings.pop("offset_mapping")
train_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(train_encodings), train_labels))
val_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(val_encodings), val_labels))

# Use tensorflow to train and evaluate
from transformers import TFDistilBertForTokenClassification

model = TFDistilBertForTokenClassification.from_pretrained(
    'distilbert-base-cased', num_labels=len(unique_tags))
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)  # note: 5e-5 is a more typical fine-tuning learning rate
model.compile(optimizer=optimizer, loss=model.compute_loss,
              metrics=["accuracy"])  # can also use any keras loss fn
history = model.fit(train_dataset.shuffle(100).batch(16), epochs=3)

# model.save("E:\Projects\A_Idiom_detection_gihan\idiom_detection_nlp\models\model_files\\")
# import tensorflow as tf
# model = tf.keras.models.load_model("E:\Projects\A_Idiom_detection_gihan\idiom_detection_nlp\models\model_files\\")

# Evaluate the model on the test data using `evaluate`
model_config = {
    'model': "TFDistilBertForTokenClassification_W_NUT",
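# The snippet above breaks off at the model_config dict. A minimal sketch of the
# evaluation step the last comment refers to, assuming the model, tokenizer and
# val_dataset from the preceding code are still in scope; the save directory is
# illustrative. Keras' "accuracy" also counts the -100-masked subword positions,
# so treat it as a rough signal only.
eval_loss, eval_accuracy = model.evaluate(val_dataset.batch(16))
print(f"validation loss: {eval_loss:.4f}, token accuracy: {eval_accuracy:.4f}")

# Persist the fine-tuned weights in the Hugging Face format so they can be reloaded
# later with TFDistilBertForTokenClassification.from_pretrained().
model.save_pretrained("./wnut_token_classifier")
tokenizer.save_pretrained("./wnut_token_classifier")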
def use(self):
    if self.model_type == "classification":
        train_texts, train_labels = self.read_split(f"{self.path}/train")
        train_texts, val_texts, train_labels, val_labels = train_test_split(
            train_texts, train_labels, test_size=0.2)
        tokenizer = DistilBertTokenizerFast.from_pretrained(
            'distilbert-base-uncased')
        train_encodings = tokenizer(train_texts, truncation=True, padding=True)
        val_encodings = tokenizer(val_texts, truncation=True, padding=True)
        train_dataset = tf.data.Dataset.from_tensor_slices(
            (dict(train_encodings), train_labels))
        val_dataset = tf.data.Dataset.from_tensor_slices(
            (dict(val_encodings), val_labels))
        model = TFDistilBertForSequenceClassification.from_pretrained(
            "distilbert-base-uncased")

    if self.model_type == "token_classification":
        texts, tags = self.read_wnut(self.path)
        train_texts, val_texts, train_tags, val_tags = train_test_split(
            texts, tags, test_size=.2)
        unique_tags = set(tag for doc in tags for tag in doc)
        tag2id = {tag: id for id, tag in enumerate(unique_tags)}
        tokenizer = DistilBertTokenizerFast.from_pretrained(
            'distilbert-base-cased')
        train_encodings = tokenizer(train_texts, is_split_into_words=True,
                                    return_offsets_mapping=True,
                                    padding=True, truncation=True)
        val_encodings = tokenizer(val_texts, is_split_into_words=True,
                                  return_offsets_mapping=True,
                                  padding=True, truncation=True)
        train_labels = self.encode_tags(train_tags, train_encodings, tag2id)
        val_labels = self.encode_tags(val_tags, val_encodings, tag2id)
        # the model does not accept offset mappings as an input
        train_encodings.pop("offset_mapping")
        val_encodings.pop("offset_mapping")
        train_dataset = tf.data.Dataset.from_tensor_slices(
            (dict(train_encodings), train_labels))
        val_dataset = tf.data.Dataset.from_tensor_slices(
            (dict(val_encodings), val_labels))
        model = TFDistilBertForTokenClassification.from_pretrained(
            'distilbert-base-cased', num_labels=len(unique_tags))

    if self.model_type == "q+a":
        train_contexts, train_questions, train_answers = self.read_squad(
            f"{self.path}/train-v2.0.json")
        val_contexts, val_questions, val_answers = self.read_squad(
            f"{self.path}/dev-v2.0.json")
        self.add_end_idx(train_answers, train_contexts)
        self.add_end_idx(val_answers, val_contexts)
        tokenizer = DistilBertTokenizerFast.from_pretrained(
            'distilbert-base-uncased')
        train_encodings = tokenizer(train_contexts, train_questions,
                                    truncation=True, padding=True)
        val_encodings = tokenizer(val_contexts, val_questions,
                                  truncation=True, padding=True)
        self.add_token_positions(train_encodings, train_answers)
        self.add_token_positions(val_encodings, val_answers)
        train_dataset = tf.data.Dataset.from_tensor_slices((
            {key: train_encodings[key]
             for key in ['input_ids', 'attention_mask']},
            {key: train_encodings[key]
             for key in ['start_positions', 'end_positions']}))
        val_dataset = tf.data.Dataset.from_tensor_slices((
            {key: val_encodings[key]
             for key in ['input_ids', 'attention_mask']},
            {key: val_encodings[key]
             for key in ['start_positions', 'end_positions']}))
        model = TFDistilBertForQuestionAnswering.from_pretrained(
            "distilbert-base-uncased")
        # Keras expects the labels as a (start_positions, end_positions) tuple
        train_dataset = train_dataset.map(
            lambda x, y: (x, (y['start_positions'], y['end_positions'])))
        val_dataset = val_dataset.map(
            lambda x, y: (x, (y['start_positions'], y['end_positions'])))
        model.distilbert.return_dict = False

    optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
    model.compile(optimizer=optimizer, loss=model.compute_loss)
    model.fit(train_dataset.shuffle(1000).batch(self.batch_size),
              validation_data=val_dataset.batch(self.batch_size),
              epochs=self.epochs)

    try:
        os.mkdir(f"{self.save}")
        model.save_pretrained(self.save)
    except OSError:
        model.save_pretrained(self.save)
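# A minimal sketch (not in the original) of reloading the weights saved by use() and
# tagging one sentence; the directory is illustrative (whatever was passed as self.save),
# and the checkpoint name and tag2id mapping are assumed from the token_classification
# branch above.
import tensorflow as tf
from transformers import DistilBertTokenizerFast, TFDistilBertForTokenClassification

reloaded = TFDistilBertForTokenClassification.from_pretrained("path/to/saved_model")
tok = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')

enc = tok("Empire State Building is in New York", return_tensors="tf")
outputs = reloaded(enc)
pred_ids = tf.argmax(outputs[0], axis=-1).numpy()[0]  # outputs[0] is the logits tensor
# Map the predicted ids back to tag strings with the inverse of tag2id built in use():
# id2tag = {i: t for t, i in tag2id.items()}
# print([id2tag[i] for i in pred_ids])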
# print(pd.DataFrame(nlp(sentence)))  # gives out tokens and labels
sentence = 'Apple and Microsoft plan to form a joint venture for the development of cloud-based computing ' \
           'infrastructure.'

## BERT tokenizer and token classification
nlp = TokenClassificationPipeline(
    model=TFAutoModelForTokenClassification.from_pretrained('distilbert-base-cased'),
    tokenizer=AutoTokenizer.from_pretrained('distilbert-base-cased'),
    framework='tf')
print(pd.DataFrame(nlp(sentence)))

from transformers import DistilBertTokenizer, TFDistilBertForTokenClassification
import tensorflow as tf

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = TFDistilBertForTokenClassification.from_pretrained('distilbert-base-cased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute",
                                         add_special_tokens=True))[None, :]  # Batch size 1
labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()),
                    (-1, tf.size(input_ids)))  # Batch size 1
print(model(input_ids))

import numpy as np
from transformers import AutoTokenizer, pipeline, TFDistilBertModel

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased')
model = TFDistilBertModel.from_pretrained('distilbert-base-cased')
pipe = pipeline('feature-extraction', model=model, tokenizer=tokenizer)
features = pipe('any text data or list of text data', pad_to_max_length=True)
features = np.squeeze(features)

## Sentence classification
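# The heading above announces sentence classification but no code follows. A minimal
# sketch mirroring the token-classification pipeline; the public
# distilbert-base-uncased-finetuned-sst-2-english checkpoint is an assumption, not a
# checkpoint named in the original.
import pandas as pd
from transformers import (AutoTokenizer, TextClassificationPipeline,
                          TFAutoModelForSequenceClassification)

sst2_checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'
clf = TextClassificationPipeline(
    model=TFAutoModelForSequenceClassification.from_pretrained(sst2_checkpoint),
    tokenizer=AutoTokenizer.from_pretrained(sst2_checkpoint),
    framework='tf')
print(pd.DataFrame(clf(sentence)))  # one row with a label and a confidence score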