def test_TFRobertaForSequenceClassification(self):
    from transformers import RobertaConfig, TFRobertaForSequenceClassification
    keras.backend.clear_session()
    # pretrained_weights = 'roberta-base'
    tokenizer_file = 'roberta_roberta-base.pickle'
    tokenizer = self._get_tokenzier(tokenizer_file)
    text, inputs, inputs_onnx = self._prepare_inputs(tokenizer)
    config = RobertaConfig()
    model = TFRobertaForSequenceClassification(config)
    predictions = model.predict(inputs)
    onnx_model = keras2onnx.convert_keras(model, model.name)
    self.assertTrue(
        run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx,
                         predictions, self.model_files))

def getBertModel():
    # def f1(y_true, y_pred):
    #     def recall(y_true, y_pred):
    #         true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    #         possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    #         recall = true_positives / (possible_positives + K.epsilon())
    #         return recall
    #     def precision(y_true, y_pred):
    #         true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    #         predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    #         precision = true_positives / (predicted_positives + K.epsilon())
    #         return precision
    #     precision = precision(y_true, y_pred)
    #     recall = recall(y_true, y_pred)
    #     return 2 * ((precision * recall) / (precision + recall + K.epsilon()))
    bertModel = TFRobertaForSequenceClassification.from_pretrained(
        PRETRAINED_MODEL, num_labels=len(PROP_CLASS))
    optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-08, clipnorm=1.0)
    # loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    # metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
    # loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
    # metric = tf.keras.metrics.CategoricalAccuracy('categorical_accuracy')
    loss = "binary_crossentropy"
    metric = "accuracy"
    # bertModel.compile(optimizer=optimizer, loss=loss, metrics=[metric, f1])
    bertModel.compile(optimizer=optimizer, loss=loss, metrics=[metric])
    return bertModel

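A minimal, hypothetical driver for getBertModel() could look like the sketch below; the tokenizer choice and the texts/labels variables are assumptions rather than part of the original project.

# Hypothetical usage of getBertModel(); `texts` and `labels` are placeholder
# names, and labels are assumed to be multi-hot vectors of length len(PROP_CLASS)
# since the model is compiled with binary cross-entropy.
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained(PRETRAINED_MODEL)
encodings = tokenizer(texts, truncation=True, padding=True, return_tensors="tf")

model = getBertModel()
model.fit(dict(encodings), labels, epochs=3, batch_size=16)
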
def _load_remote_model(self, model_name, tokenizer_kwargs, model_kwargs):
    if model_name not in ModelsByFamily.Supported:
        raise ValueError(f'Model {model_name} not supported.')

    do_lower_case = False
    if 'uncased' in model_name.lower():
        do_lower_case = True
    tokenizer_kwargs.update({'do_lower_case': do_lower_case})

    self._tokenizer = None
    self._model = None

    if model_name in ModelsByFamily.Bert:
        self._tokenizer = BertTokenizer.from_pretrained(
            model_name, **tokenizer_kwargs)
        self._model = TFBertForSequenceClassification.from_pretrained(
            model_name, **model_kwargs)
    elif model_name in ModelsByFamily.Roberta:
        self._tokenizer = RobertaTokenizer.from_pretrained(
            model_name, **tokenizer_kwargs)
        self._model = TFRobertaForSequenceClassification.from_pretrained(
            model_name, **model_kwargs)
    elif model_name in ModelsByFamily.XLNet:
        self._tokenizer = XLNetTokenizer.from_pretrained(
            model_name, **tokenizer_kwargs)
        self._model = TFXLNetForSequenceClassification.from_pretrained(
            model_name, **model_kwargs)
    elif model_name in ModelsByFamily.DistilBert:
        self._tokenizer = DistilBertTokenizer.from_pretrained(
            model_name, **tokenizer_kwargs)
        self._model = TFDistilBertForSequenceClassification.from_pretrained(
            model_name, **model_kwargs)

    assert self._tokenizer and self._model

def build_estimator(self):
    model = TFRobertaForSequenceClassification.from_pretrained(ROBERTA_BASE)
    optimizer = AdamWeightDecay(
        learning_rate=LEARNING_RATE,
        epsilon=EPSILON,
        weight_decay_rate=DECAY,
        beta_1=BETA)
    # Labels are not one-hot vectors, so use sparse categorical cross-entropy and accuracy.
    loss = SparseCategoricalCrossentropy(from_logits=True)
    metric = SparseCategoricalAccuracy('accuracy')
    model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
    self.model = model

def test_TFRobertaForSequenceClassification(self):
    from transformers import RobertaTokenizer, TFRobertaForSequenceClassification
    pretrained_weights = 'roberta-base'
    tokenizer = RobertaTokenizer.from_pretrained(pretrained_weights)
    text, inputs, inputs_onnx = self._prepare_inputs(tokenizer)
    model = TFRobertaForSequenceClassification.from_pretrained(pretrained_weights)
    predictions = model.predict(inputs)
    onnx_model = keras2onnx.convert_keras(model, model.name)
    self.assertTrue(
        run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx,
                         predictions, self.model_files))

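run_onnx_runtime above already handles the numeric comparison; purely for illustration, the converted model could also be exercised directly with onnxruntime along these lines (a sketch, not part of the test suite):

# Sketch only: run the converted graph directly with onnxruntime.
# inputs_onnx is assumed to already map ONNX input names to numpy arrays,
# as prepared by _prepare_inputs above.
import onnxruntime as ort

sess = ort.InferenceSession(onnx_model.SerializeToString())
onnx_logits = sess.run(None, inputs_onnx)[0]
print(onnx_logits.shape)  # should match the shape of the Keras predictions above
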
def load_model(model_name=MODEL_NAME, rm=False):
    root = MODELS_FOLDER
    if model_name == ROBERTA_MODEL:
        model_filename = ROBERTA_FILENAME
        saved_model_path = os.path.join(root, model_filename)
        model = TFRobertaForSequenceClassification.from_pretrained(saved_model_path)
    else:
        model_filename = WORD2VEC_FILENAME
        saved_model_path = os.path.join(root, model_filename)
        # Use Keras' loader explicitly; calling the local load_model() here would recurse.
        model = tf.keras.models.load_model(saved_model_path)
    print(colored(f"=> loaded model {model_filename}", 'green'))
    if rm:
        os.system(f'rm -r {saved_model_path}')
    return model

def main():
    # load the saved model
    model = TFRobertaForSequenceClassification.from_pretrained('reddit_model5')
    list_of_subreddit = ['showerthoughts', 'askmen', 'askreddit', 'jokes', 'worldnews']
    for j in list_of_subreddit:
        # get the top 10 posts of the week from the current subreddit
        top_posts = reddit.subreddit(j).top('week', limit=10)
        comment_list = []
        # save subreddit comments in a dataframe
        for submission in top_posts:
            submission_comm = reddit.submission(id=submission.id)
            for count, top_level_comment in enumerate(submission_comm.comments):
                try:
                    replies_of(top_level_comment, comment_list)
                except Exception:
                    continue
        comment_dataframe = pd.DataFrame(comment_list, columns=['Comments'])
        comment_dataframe['label'] = 0
        print(comment_dataframe)
        # prepare data as expected by the RoBERTa model
        submission_sentences_modified = tf.data.Dataset.from_tensor_slices(
            (comment_dataframe['Comments'], comment_dataframe['label']))
        ds_submission_encoded = encode_examples(submission_sentences_modified).batch(batch_size)
        # predict the sentiment of the Reddit comments
        submission_pre = tf.nn.softmax(model.predict(ds_submission_encoded))
        submission_pre_argmax = tf.math.argmax(submission_pre, axis=1)
        comment_dataframe['label'] = submission_pre_argmax
        negative_comments_count = (comment_dataframe['label'] == 1).sum()
        positive_comments_count = (comment_dataframe['label'] == 0).sum()
        print(f"overall sentiment of subreddit r/{j}: Positive comments: {positive_comments_count}"
              f" Negative comments: {negative_comments_count}")

def start_train(train_encodings, train_labels, val_encodings, val_labels):
    # Create tensorflow dataset objects that can be used for training and validation.
    train_dataset = tf.data.Dataset.from_tensor_slices(
        (dict(train_encodings), train_labels))
    val_dataset = tf.data.Dataset.from_tensor_slices(
        (dict(val_encodings), val_labels))
    K.clear_session()  # reset Keras' global state before building the model
    model = TFRobertaForSequenceClassification.from_pretrained('roberta-large')
    # Set the learning rate. Adam is a stochastic gradient descent method based on
    # adaptive estimates of first- and second-order moments.
    optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
    # compile the model so it is ready to train
    model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])
    # start training; the datasets are already batched, so no batch_size argument is passed
    model.fit(train_dataset.shuffle(1000).batch(16),
              epochs=3,
              validation_data=val_dataset.shuffle(100).batch(16))
    return model

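A hypothetical end-to-end call of start_train(), assuming in-memory text and label lists and a tokenizer matching the 'roberta-large' checkpoint (variable names here are illustrative):

# Illustrative driver for start_train(); `texts` and `labels` are assumed inputs.
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizerFast

train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=5)

tokenizer = RobertaTokenizerFast.from_pretrained('roberta-large')
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

model = start_train(train_encodings, train_labels, val_encodings, val_labels)
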
def run():
    # load and prepare data
    train, test = load_data()
    train, test = prepare_input(train), prepare_input(test, True)
    # train-test split
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        list(train["input"].values),
        list(train["label_numeric"].values),
        test_size=.2,
        random_state=5)
    # tokenize and load the fine-tuned model
    tokenizer = RobertaTokenizerFast.from_pretrained('roberta-large')
    train_encodings, val_encodings = tokenize_data(train_texts, tokenizer), tokenize_data(
        val_texts, tokenizer)
    model = TFRobertaForSequenceClassification.from_pretrained("data/roberta_model")
    # validate, predict on the test set and write the test output
    validate_model(model, tokenizer, val_texts, val_labels)
    predict_on_test(model, tokenizer, test)

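tokenize_data is not shown in this snippet; under the usual fast-tokenizer pattern it would be roughly the following (an assumption, not the project's actual helper):

# Assumed shape of the tokenize_data helper referenced above; the real project
# may pad or truncate differently.
def tokenize_data(texts, tokenizer, max_length=256):
    return tokenizer(texts, truncation=True, padding=True, max_length=max_length)
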
test_path = f"vectors/test_{length}.mm"
mode = 3
batch_size = 250
empty = np.zeros(25000)

# Load data and set up generators.
dev = np.array(
    np.memmap(dev_path, dtype='int32', mode='r', shape=(25000, 3, length)))
test = np.array(
    np.memmap(test_path, dtype='int32', mode='r', shape=(25000, 3, length)))
train = np.array(
    np.memmap(train_path, dtype='int32', mode='r', shape=(50000, 3, length)))
dev = GeneratorBERT(dev, empty, batch_size, mode)
test = GeneratorBERT(test, empty, batch_size, mode)
train = GeneratorBERT(train, empty, batch_size, mode)

model = TFRobertaForSequenceClassification.from_pretrained(
    model_path, config='roberta-base', from_pt=True, num_labels=1000)

# Load the fine-tuned weights.
model.load_weights('bert_model.h5')

print("Predicting.")
predict_vec = np.memmap('vectors/dev_bert.mm', dtype='float32', mode='w+',
                        shape=(25000, 1000))
predict_vec2 = np.memmap('vectors/test_bert.mm', dtype='float32', mode='w+',
                         shape=(25000, 1000))
predict_vec3 = np.memmap('vectors/train_bert.mm', dtype='float32', mode='w+',
                         shape=(50000, 1000))

def main():
    parser = argparse.ArgumentParser(
        description='Script for running text topic classification with the transformers package')
    parser.add_argument(
        '-m',
        '--model',
        choices=[
            'bert-base-uncased', 'bert-large-uncased', 'roberta-base',
            'roberta-large', 'distilbert-base-uncased',
            'google/electra-base-discriminator'
        ],
        help='class of model architecture to use for classification')
    parser.add_argument('-b',
                        '--BATCH_SIZE',
                        default=64,
                        type=int,
                        help='batch size to use per replica')
    parser.add_argument(
        '-l',
        '--SEQUENCE_LENGTH',
        default=128,
        type=int,
        help='maximum sequence length; short sequences are padded, long ones are truncated')
    parser.add_argument(
        '-e',
        '--EPOCHS',
        default=5,
        type=int,
        help='number of passes over the dataset; early stopping with a 2-epoch patience is used')
    args = parser.parse_args()

    if args.model[:4] == 'robe':
        # Use the RoBERTa tokenizer
        TOKENIZER = RobertaTokenizer.from_pretrained(args.model)
    else:
        # Use the BERT tokenizer
        TOKENIZER = BertTokenizer.from_pretrained(args.model)

    train_sentences, train_labels = gather_data(TRAINING_DATA)
    val_sentences, val_labels = gather_data(VAL_DATA)
    print(f'Length of Training Set: {len(train_sentences)}')
    print(f'Length of Test Set: {len(val_sentences)}')
    training_dataset = create_dataset(train_sentences, train_labels,
                                      args.SEQUENCE_LENGTH, TOKENIZER)
    val_dataset = create_dataset(val_sentences, val_labels,
                                 args.SEQUENCE_LENGTH, TOKENIZER)
    print(f'Maximum Sequence Length: {args.SEQUENCE_LENGTH}')

    mirrored_strategy = tf.distribute.MirroredStrategy()
    print(f'Number of devices: {mirrored_strategy.num_replicas_in_sync}')
    BATCH_SIZE_PER_REPLICA = args.BATCH_SIZE
    GLOBAL_BATCH_SIZE = BATCH_SIZE_PER_REPLICA * mirrored_strategy.num_replicas_in_sync
    print(f'Global Batch Size: {GLOBAL_BATCH_SIZE}')
    batched_training_dataset = training_dataset.shuffle(1024).batch(
        GLOBAL_BATCH_SIZE, drop_remainder=True)
    batched_val_dataset = val_dataset.shuffle(1024).batch(
        GLOBAL_BATCH_SIZE, drop_remainder=True)
    # dist_train_dataset = mirrored_strategy.experimental_distribute_dataset(batched_training_dataset)
    # dist_val_dataset = mirrored_strategy.experimental_distribute_dataset(batched_val_dataset)

    with mirrored_strategy.scope():
        if args.model[:4] == 'bert':
            model = TFBertForSequenceClassification.from_pretrained(
                args.model, num_labels=4)
        elif args.model[:4] == 'robe':
            model = TFRobertaForSequenceClassification.from_pretrained(
                args.model, num_labels=4)
        elif args.model[:6] == 'distil':  # 'distil' is six characters, so slice six
            model = TFDistilBertForSequenceClassification.from_pretrained(
                args.model, num_labels=4)
        else:
            model = TFElectraForSequenceClassification.from_pretrained(
                args.model, num_labels=4)
        optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
        loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        METRICS = [tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')]
        model.compile(optimizer=optimizer, loss=loss, metrics=METRICS)

    # Use an early stopping callback and our timing callback
    early_stop = tf.keras.callbacks.EarlyStopping(verbose=1,
                                                  patience=2,
                                                  min_delta=0.005,
                                                  restore_best_weights=True)
    time_callback = TimeHistory()

    history = model.fit(batched_training_dataset,
                        epochs=args.EPOCHS,
                        validation_data=batched_val_dataset,
                        callbacks=[early_stop, time_callback])

    df = pd.DataFrame(history.history)
    df['times'] = time_callback.times
    df.to_pickle(f'{args.model}_BS{args.BATCH_SIZE}_SEQ{args.SEQUENCE_LENGTH}.pkl')
    model.save_pretrained(f'./{args.model}_BS{args.BATCH_SIZE}_SEQ{args.SEQUENCE_LENGTH}/')

def load(self):
    """Loads a model from the path specified by the active ModelConfig and sets the model label."""
    self._model = TFRobertaForSequenceClassification.from_pretrained(
        self._model_path(), config=self._config)

def _create_new_model(self, model_name_str):
    return TFRobertaForSequenceClassification.from_pretrained(
        model_name_str, config=self._config)

def on_epoch_end(self, epoch, logs=None):
    os.mkdir('reddit_model' + str(self.count_n))
    # this folder name must match the folder created above
    self.model.save_pretrained('reddit_model' + str(self.count_n))
    y_val_pred = tf.nn.softmax(self.model.predict(ds_test_encoded))
    y_pred_argmax = tf.math.argmax(y_val_pred, axis=1)
    testing_copy = testing_sentences.copy()
    testing_copy['predicted'] = y_pred_argmax
    f1_s = f1_score(testing_sentences['label'], testing_copy['predicted'])
    print('\n f1 score is :', f1_s)
    self.count_n += 1


metrics = ModelMetrics()

# model initialization
model = TFRobertaForSequenceClassification.from_pretrained("roberta-base")
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08)
# Labels are not one-hot vectors, so use sparse categorical cross-entropy and accuracy.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

model.fit(ds_train_encoded,
          epochs=number_of_epochs,
          validation_data=ds_test_encoded,
          callbacks=[metrics])

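encode_examples is referenced here (and in the Reddit prediction script above) but not defined; a plausible sketch following the common tf.data feeding pattern is shown below. The max_length, the tokenizer variable, and the feature names are assumptions.

# Assumed sketch of encode_examples(): turn (text, label) pairs into a tf.data
# dataset of RoBERTa features. `tokenizer` is assumed to be a RobertaTokenizer
# created elsewhere; max_length and the feature names are guesses.
def encode_examples(ds, max_length=128):
    input_ids_list, attention_mask_list, label_list = [], [], []
    for text, label in ds.as_numpy_iterator():
        enc = tokenizer.encode_plus(text.decode('utf-8'),
                                    max_length=max_length,
                                    padding='max_length',
                                    truncation=True)
        input_ids_list.append(enc['input_ids'])
        attention_mask_list.append(enc['attention_mask'])
        label_list.append([label])
    return tf.data.Dataset.from_tensor_slices((
        {'input_ids': input_ids_list, 'attention_mask': attention_mask_list},
        label_list))
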