Example #1
 def test_TFRobertaForSequenceClassification(self):
     from transformers import RobertaConfig, TFRobertaForSequenceClassification
     keras.backend.clear_session()
     # pretrained_weights = 'roberta-base'
     tokenizer_file = 'roberta_roberta-base.pickle'
     tokenizer = self._get_tokenzier(tokenizer_file)
     text, inputs, inputs_onnx = self._prepare_inputs(tokenizer)
     config = RobertaConfig()
     model = TFRobertaForSequenceClassification(config)
     predictions = model.predict(inputs)
     onnx_model = keras2onnx.convert_keras(model, model.name)
     self.assertTrue(run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx, predictions, self.model_files))
def getBertModel():
    # def f1(y_true, y_pred):
    #     def recall(y_true, y_pred):
    #         true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    #         possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    #         recall = true_positives / (possible_positives + K.epsilon())
    #         return recall

    #     def precision(y_true, y_pred):
    #         true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    #         predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    #         precision = true_positives / (predicted_positives + K.epsilon())
    #         return precision

    #     precision = precision(y_true, y_pred)
    #     recall = recall(y_true, y_pred)
    #     return 2*((precision*recall)/(precision+recall+K.epsilon()))

    bertModel = TFRobertaForSequenceClassification.from_pretrained(PRETRAINED_MODEL, num_labels=len(PROP_CLASS))
    optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-08, clipnorm=1.0)
    # loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    # metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

    # loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
    # metric = tf.keras.metrics.CategoricalAccuracy('categorical_accuracy')
    loss = "binary_crossentropy"
    metric = "accuracy"

    # bertModel.compile(optimizer=optimizer, loss=loss, metrics=[metric, f1])
    bertModel.compile(optimizer=optimizer, loss=loss, metrics=[metric])

    return bertModel
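
getBertModel only builds and compiles the classifier; below is a minimal usage sketch, assuming a recent transformers tokenizer API. train_texts and train_labels are placeholders, and because the model is compiled with binary_crossentropy, the labels are assumed to be multi-hot vectors over PROP_CLASS.

# Hypothetical usage sketch (train_texts / train_labels are assumptions, not from the snippet):
import tensorflow as tf
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained(PRETRAINED_MODEL)
encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=128)
# train_labels: multi-hot arrays of shape (n, len(PROP_CLASS)) to match binary_crossentropy
train_ds = tf.data.Dataset.from_tensor_slices((dict(encodings), train_labels)).batch(16)

bert_model = getBertModel()
bert_model.fit(train_ds, epochs=3)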
Example #3
    def _load_remote_model(self, model_name, tokenizer_kwargs, model_kwargs):
        if model_name not in ModelsByFamily.Supported:
            raise ValueError(f'Model {model_name} not supported.')

        do_lower_case = False
        if 'uncased' in model_name.lower():
            do_lower_case = True
        tokenizer_kwargs.update({'do_lower_case': do_lower_case})

        self._tokenizer = None
        self._model = None

        if model_name in ModelsByFamily.Bert:
            self._tokenizer = BertTokenizer.from_pretrained(
                model_name, **tokenizer_kwargs)
            self._model = TFBertForSequenceClassification.from_pretrained(
                model_name, **model_kwargs)
        elif model_name in ModelsByFamily.Roberta:
            self._tokenizer = RobertaTokenizer.from_pretrained(
                model_name, **tokenizer_kwargs)
            self._model = TFRobertaForSequenceClassification.from_pretrained(
                model_name, **model_kwargs)
        elif model_name in ModelsByFamily.XLNet:
            self._tokenizer = XLNetTokenizer.from_pretrained(
                model_name, **tokenizer_kwargs)
            self._model = TFXLNetForSequenceClassification.from_pretrained(
                model_name, **model_kwargs)
        elif model_name in ModelsByFamily.DistilBert:
            self._tokenizer = DistilBertTokenizer.from_pretrained(
                model_name, **tokenizer_kwargs)
            self._model = TFDistilBertForSequenceClassification.from_pretrained(
                model_name, **model_kwargs)

        assert self._tokenizer and self._model
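
The ModelsByFamily registry used above is not part of this snippet; a hedged sketch of the shape it plausibly has (the concrete model names are assumptions):

# Hypothetical sketch of the ModelsByFamily registry assumed by _load_remote_model:
class ModelsByFamily:
    Bert = {'bert-base-uncased', 'bert-large-uncased'}
    Roberta = {'roberta-base', 'roberta-large'}
    XLNet = {'xlnet-base-cased'}
    DistilBert = {'distilbert-base-uncased'}
    Supported = Bert | Roberta | XLNet | DistilBert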
Example #4
 def build_estimator(self):
     model = TFRobertaForSequenceClassification.from_pretrained(
         ROBERTA_BASE)
     optimizer = AdamWeightDecay(
         learning_rate=LEARNING_RATE, epsilon=EPSILON, weight_decay_rate=DECAY, beta_1=BETA)
     # since we do not have one-hot vectors, we can use sparse categorical cross-entropy and accuracy
     loss = SparseCategoricalCrossentropy(from_logits=True)
     metric = SparseCategoricalAccuracy('accuracy')
     model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
     self.model = model
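
As the comment notes, SparseCategoricalCrossentropy lets the labels stay as plain integer class ids; a minimal hedged sketch follows (the tokenizer, texts, and labels below are assumptions):

# Hypothetical sketch: with the sparse loss/metric above, no one-hot encoding is needed.
import numpy as np
import tensorflow as tf
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained(ROBERTA_BASE)      # ROBERTA_BASE from the snippet
texts = ["first example", "second example", "third example"]    # placeholder data
labels = np.array([0, 1, 1])                                     # integer class ids
features = dict(tokenizer(texts, padding=True, truncation=True))
dataset = tf.data.Dataset.from_tensor_slices((features, labels)).batch(2)
# estimator.build_estimator(); estimator.model.fit(dataset, epochs=1)   # assumed call site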
Example #5
 def test_TFRobertaForSequenceClassification(self):
     from transformers import RobertaTokenizer, TFRobertaForSequenceClassification
     pretrained_weights = 'roberta-base'
     tokenizer = RobertaTokenizer.from_pretrained(pretrained_weights)
     text, inputs, inputs_onnx = self._prepare_inputs(tokenizer)
     model = TFRobertaForSequenceClassification.from_pretrained(
         pretrained_weights)
     predictions = model.predict(inputs)
     onnx_model = keras2onnx.convert_keras(model, model.name)
     self.assertTrue(
         run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx,
                          predictions, self.model_files))
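
_prepare_inputs is a helper of this test class and is not shown; a hedged sketch of what it might do, assuming a recent transformers version and that the ONNX feed reuses the same numpy arrays keyed by input name:

# Hypothetical sketch of the _prepare_inputs helper (not the real test-suite implementation):
import numpy as np

def _prepare_inputs(self, tokenizer, batch_size=3):
    text = ["Hello, my dog is cute."] * batch_size
    encoded = tokenizer(text, padding=True, return_tensors="np")
    inputs = {k: v.astype(np.int32) for k, v in encoded.items()}   # fed to model.predict
    inputs_onnx = dict(inputs)                                     # assumed: same feed for onnxruntime
    return text, inputs, inputs_onnx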
Example #6
def load_model(model_name=MODEL_NAME, rm=False):
    root = MODELS_FOLDER

    if model_name == ROBERTA_MODEL:
        model_filename = ROBERTA_FILENAME
        saved_model_path = os.path.join(root, model_filename)
        model = TFRobertaForSequenceClassification.from_pretrained(
            saved_model_path)
    else:
        model_filename = WORD2VEC_FILENAME
        saved_model_path = os.path.join(root, model_filename)
        # use the Keras loader explicitly; the bare name is shadowed by this function
        model = tf.keras.models.load_model(saved_model_path)

    print(colored(f"=> loaded model {model_filename}", 'green'))

    if rm:
        os.system(f'rm -r {saved_model_path}')
    return model
def main():
    # load saved model
    model = TFRobertaForSequenceClassification.from_pretrained('reddit_model5')

    list_of_subreddit = ['showerthoughts', 'askmen', 'askreddit', 'jokes', 'worldnews']
    for j in list_of_subreddit:
        # get the top 10 posts of the week from the current subreddit
        top_posts = reddit.subreddit(j).top('week', limit=10)
        comment_list = []
        # save subreddit comments in dataframe
        for submission in top_posts:
            submission_comm = reddit.submission(id=submission.id)

            for count, top_level_comment in enumerate(submission_comm.comments):
                try:
                    replies_of(top_level_comment, comment_list)
                except Exception:
                    continue

        comment_dataframe = pd.DataFrame(comment_list, columns=['Comments'])
        comment_dataframe['label'] = 0
        print(comment_dataframe)

        # prepare data as per RoBERTa model input
        submission_sentences_modified = tf.data.Dataset.from_tensor_slices((comment_dataframe['Comments'],
                                                                            comment_dataframe['label']))
        ds_submission_encoded = encode_examples(submission_sentences_modified).batch(batch_size)

        # predict sentiment of Reddit comments
        submission_pre = tf.nn.softmax(model.predict(ds_submission_encoded))
        submission_pre_argmax = tf.math.argmax(submission_pre, axis=1)
        comment_dataframe['label'] = submission_pre_argmax

        negative_comments_count = (comment_dataframe['label'] == 1).sum()
        positive_comments_count = (comment_dataframe['label'] == 0).sum()

        print(f"overall sentiment of subreddit r/{j} are Positive comments: {positive_comments_count}"
              f" Negative comments: {negative_comments_count}")
Example #8
def start_train(train_encodings, train_labels, val_encodings, val_labels):
    # create a tensorflow dataset object that can be used for training
    train_dataset = tf.data.Dataset.from_tensor_slices(
        (dict(train_encodings), train_labels))

    val_dataset = tf.data.Dataset.from_tensor_slices(
        (dict(val_encodings), val_labels))

    K.clear_session()  # reset the Keras global state before building a new model
    model = TFRobertaForSequenceClassification.from_pretrained('roberta-large')
    # this established the learning rate. Adam optimization is a stochastic gradient descent method that is based on
    # adaptive estimation of first-order and second-order moments.
    optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
    #compiles the model to be ready to train
    model.compile(optimizer=optimizer,
                  loss=model.compute_loss,
                  metrics=['accuracy'])
    # start training; the tf.data pipelines are already batched, so batch_size is not passed to fit()
    model.fit(train_dataset.shuffle(1000).batch(16),
              epochs=3,
              validation_data=val_dataset.shuffle(100).batch(16))
    return model
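
A hedged sketch of how start_train could be called; the tokenizer choice matches the roberta-large checkpoint loaded inside the function, while the *_texts and *_labels variables are assumptions:

# Hypothetical call site for start_train (the *_texts / *_labels variables are assumptions):
from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast.from_pretrained('roberta-large')
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
trained_model = start_train(train_encodings, train_labels, val_encodings, val_labels)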
Example #9
def run():
    #load and prepare data
    train, test = load_data()
    train, test = prepare_input(train), prepare_input(test, True)

    #train-test split
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        list(train["input"].values),
        list(train["label_numeric"].values),
        test_size=.2,
        random_state=5)

    #tokenize and train
    tokenizer = RobertaTokenizerFast.from_pretrained('roberta-large')
    train_encodings, val_encodings = tokenize_data(train_texts,
                                                   tokenizer), tokenize_data(
                                                       val_texts, tokenizer)
    model = TFRobertaForSequenceClassification.from_pretrained(
        "data/roberta_model")

    #validate and predict on test and write test output
    validate_model(model, tokenizer, val_texts, val_labels)
    predict_on_test(model, tokenizer, test)
Example #10
test_path = f"vectors/test_{length}.mm"
mode = 3
batch_size = 250

empty = np.zeros(25000)
# Load data and setup generators.
dev = np.array(
    np.memmap(dev_path, dtype='int32', mode='r', shape=(25000, 3, length)))
test = np.array(
    np.memmap(test_path, dtype='int32', mode='r', shape=(25000, 3, length)))
train = np.array(
    np.memmap(train_path, dtype='int32', mode='r', shape=(50000, 3, length)))
dev = GeneratorBERT(dev, empty, batch_size, mode)
test = GeneratorBERT(test, empty, batch_size, mode)
train = GeneratorBERT(train, empty, batch_size, mode)
model = TFRobertaForSequenceClassification.from_pretrained(
    model_path, config='roberta-base', from_pt=True, num_labels=1000)
# Load fine tuned model.
model.load_weights('bert_model.h5')
print("Predicting.")
predict_vec = np.memmap('vectors/dev_bert.mm',
                        dtype='float32',
                        mode='w+',
                        shape=(25000, 1000))
predict_vec2 = np.memmap('vectors/test_bert.mm',
                         dtype='float32',
                         mode='w+',
                         shape=(25000, 1000))
predict_vec3 = np.memmap('vectors/train_bert.mm',
                         dtype='float32',
                         mode='w+',
                         shape=(50000, 1000))
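
The snippet ends after allocating the output memmaps; a hedged continuation that writes the model outputs into them, assuming GeneratorBERT yields Keras-compatible batches and that this transformers version may return the logits as the first element of a tuple:

# Hypothetical continuation: fill the pre-allocated memmaps with the model's outputs.
for vec, gen in ((predict_vec, dev), (predict_vec2, test), (predict_vec3, train)):
    out = model.predict(gen, verbose=1)
    logits = out[0] if isinstance(out, (tuple, list)) else out   # older transformers return tuples
    vec[:] = logits
    vec.flush()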
Example #11
def main():

    parser = argparse.ArgumentParser(
        description=
        'Script for running text topic classification with transformers package'
    )
    parser.add_argument(
        '-m',
        '--model',
        choices=[
            'bert-base-uncased', 'bert-large-uncased', 'roberta-base',
            'roberta-large', 'distilbert-base-uncased',
            'google/electra-base-discriminator'
        ],
        help='Class of Model Architecture to use for classification')
    parser.add_argument('-b',
                        '--BATCH_SIZE',
                        default=64,
                        type=int,
                        help='batch size to use per replica')
    parser.add_argument(
        '-l',
        '--SEQUENCE_LENGTH',
        default=128,
        type=int,
        help=
        'maximum sequence length. short sequences are padded. long are truncated'
    )
    parser.add_argument(
        '-e',
        '--EPOCHS',
        default=5,
        type=int,
        help=
        'the number of passes over the dataset to run. early stopping with 2 epoch patience is used'
    )

    args = parser.parse_args()

    if args.model[:4] == 'robe':
        # Use Roberta tokenizer
        TOKENIZER = RobertaTokenizer.from_pretrained(args.model)
    else:
        # Use Bert tokenizer
        TOKENIZER = BertTokenizer.from_pretrained(args.model)

    train_sentences, train_labels = gather_data(TRAINING_DATA)
    val_sentences, val_labels = gather_data(VAL_DATA)

    print(f'Length of Training Set: {len(train_sentences)}')
    print(f'Length of Test Set: {len(val_sentences)}')

    training_dataset = create_dataset(train_sentences, train_labels,
                                      args.SEQUENCE_LENGTH, TOKENIZER)
    val_dataset = create_dataset(val_sentences, val_labels,
                                 args.SEQUENCE_LENGTH, TOKENIZER)

    print(f'Maximum Sequence Length: {args.SEQUENCE_LENGTH}')

    mirrored_strategy = tf.distribute.MirroredStrategy()
    print(f'Number of devices: {mirrored_strategy.num_replicas_in_sync}')

    BATCH_SIZE_PER_REPLICA = args.BATCH_SIZE
    GLOBAL_BATCH_SIZE = BATCH_SIZE_PER_REPLICA * mirrored_strategy.num_replicas_in_sync
    print(f'Global Batch Size: {GLOBAL_BATCH_SIZE}')

    batched_training_dataset = training_dataset.shuffle(1024).batch(
        GLOBAL_BATCH_SIZE, drop_remainder=True)
    batched_val_dataset = val_dataset.shuffle(1024).batch(GLOBAL_BATCH_SIZE,
                                                          drop_remainder=True)

    #dist_train_dataset = mirrored_strategy.experimental_distribute_dataset(batched_training_dataset)
    #dist_val_dataset = mirrored_strategy.experimental_distribute_dataset(batched_val_dataset)

    with mirrored_strategy.scope():
        if args.model[:4] == 'bert':
            model = TFBertForSequenceClassification.from_pretrained(
                args.model, num_labels=4)
        elif args.model[:4] == 'robe':
            model = TFRobertaForSequenceClassification.from_pretrained(
                args.model, num_labels=4)
        elif args.model[:6] == 'distil':  # 6 chars so 'distilbert-*' actually matches this branch
            model = TFDistilBertForSequenceClassification.from_pretrained(
                args.model, num_labels=4)
        else:
            model = TFElectraForSequenceClassification.from_pretrained(
                args.model, num_labels=4)

        optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
        loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        METRICS = [tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')]
        model.compile(optimizer=optimizer, loss=loss, metrics=METRICS)

    # Use an early stopping callback and our timing callback
    early_stop = tf.keras.callbacks.EarlyStopping(verbose=1,
                                                  patience=2,
                                                  min_delta=0.005,
                                                  restore_best_weights=True)

    time_callback = TimeHistory()

    history = model.fit(batched_training_dataset,
                        epochs=args.EPOCHS,
                        validation_data=batched_val_dataset,
                        callbacks=[early_stop, time_callback])

    df = pd.DataFrame(history.history)
    df['times'] = time_callback.times

    df.to_pickle(
        f'{args.model}_BS{args.BATCH_SIZE}_SEQ{args.SEQUENCE_LENGTH}.pkl')
    model.save_pretrained(
        f'./{args.model}_BS{args.BATCH_SIZE}_SEQ{args.SEQUENCE_LENGTH}/')
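
gather_data and create_dataset are project helpers that are not shown; a hedged sketch of create_dataset, assuming it tokenizes each sentence to SEQUENCE_LENGTH with a recent transformers tokenizer and pairs it with its label:

# Hypothetical sketch of the create_dataset helper used above (actual implementation may differ):
import tensorflow as tf

def create_dataset(sentences, labels, sequence_length, tokenizer):
    encodings = tokenizer(list(sentences),
                          max_length=sequence_length,
                          padding='max_length',
                          truncation=True)
    return tf.data.Dataset.from_tensor_slices((dict(encodings), list(labels)))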
Example #12
 def load(self):
     """
     Loads a model from the path specified by the active ModelConfig and sets the model label
     """
     self._model = TFRobertaForSequenceClassification.from_pretrained(
         self._model_path(), config=self._config)
Example #13
 def _create_new_model(self, model_name_str):
     return TFRobertaForSequenceClassification.from_pretrained(
         model_name_str, config=self._config)
    def on_epoch_end(self, epoch, logs=None):

        os.mkdir('reddit_model' + str(self.count_n))
        self.model.save_pretrained(
            'reddit_model' + str(self.count_n)
        )  # this folder name must match the directory created above

        y_val_pred = tf.nn.softmax(self.model.predict(ds_test_encoded))
        y_pred_argmax = tf.math.argmax(y_val_pred, axis=1)
        testing_copy = testing_sentences.copy()
        testing_copy['predicted'] = y_pred_argmax
        f1_s = f1_score(testing_sentences['label'], testing_copy['predicted'])
        print('\n f1 score is :', f1_s)
        self.count_n += 1


metrics = ModelMetrics()

# model initialization
model = TFRobertaForSequenceClassification.from_pretrained("roberta-base")
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate,
                                     epsilon=1e-08)

# since we do not have one-hot vectors, we can use sparse categorical cross-entropy and accuracy
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
model.fit(ds_train_encoded,
          epochs=number_of_epochs,
          validation_data=ds_test_encoded,
          callbacks=[metrics])