Example #1
    # overwrite batch size until we have tf_glue.py
    args.per_device_train_batch_size = 16
    args.per_device_eval_batch_size = 16

    # Set up logging
    logger = logging.getLogger(__name__)

    logging.basicConfig(
        level=logging.getLevelName("INFO"),
        handlers=[logging.StreamHandler(sys.stdout)],
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )

    # Load model and tokenizer
    model = TFAutoModelForSequenceClassification.from_pretrained(args.model_name_or_path)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)

    # Load dataset
    train_dataset, test_dataset = load_dataset("imdb", split=["train", "test"])
    train_dataset = train_dataset.shuffle().select(
        range(5000)
    )  # shrink the train dataset to 5k samples
    test_dataset = test_dataset.shuffle().select(
        range(500)
    )  # shrink the test dataset to 500 samples

    # Preprocess train dataset
    train_dataset = train_dataset.map(
        lambda e: tokenizer(e["text"], truncation=True, padding="max_length"), batched=True
    )
Example #2

def main():
    # region Argument parsing
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TFTrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )
    output_dir = Path(training_args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    # endregion

    # region Checkpoints
    # Detecting last checkpoint.
    checkpoint = None
    if (len(os.listdir(training_args.output_dir)) > 0
            and not training_args.overwrite_output_dir):
        if (output_dir / CONFIG_NAME).is_file() and (
                output_dir / TF2_WEIGHTS_NAME).is_file():
            checkpoint = output_dir
            logger.info(
                f"Checkpoint detected, resuming training from checkpoint in {training_args.output_dir}. To avoid this"
                " behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )
        else:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to continue regardless.")

    # endregion

    # region Logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO)

    logger.info(f"Training/evaluation parameters {training_args}")
    # endregion

    # region Loading data
    # For CSV/JSON files, this script will use the 'label' field as the label and the 'sentence1' and optionally
    # 'sentence2' fields as inputs if they exist. If not, the first two fields not named label are used if at least two
    # columns are provided. Note that the term 'sentence' can be slightly misleading, as these fields often contain
    # more than a single grammatical sentence when the task requires it.
    #
    # If the CSVs/JSONs contain only one non-label column, the script does single-sentence classification on this
    # single column. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
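    # As a hedged illustration of the CSV layout described above (not part of
    # the original script), a minimal train.csv might look like:
    #
    #     sentence1,sentence2,label
    #     "A man is playing guitar.","Someone plays an instrument.",entailment
    #     "The sky is green.","The sky is blue.",contradiction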
    data_files = {
        "train": data_args.train_file,
        "validation": data_args.validation_file,
        "test": data_args.test_file
    }
    data_files = {
        key: file
        for key, file in data_files.items() if file is not None
    }

    for key in data_files.keys():
        logger.info(f"Loading a local file for {key}: {data_files[key]}")

    if data_args.input_file_extension == "csv":
        # Loading a dataset from local csv files
        datasets = load_dataset("csv",
                                data_files=data_files,
                                cache_dir=model_args.cache_dir)
    else:
        # Loading a dataset from local json files
        datasets = load_dataset("json",
                                data_files=data_files,
                                cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset at
    # https://huggingface.co/docs/datasets/loading_datasets.html.
    # endregion

    # region Label preprocessing
    # If you've passed us a training set, we try to infer your labels from it
    if "train" in datasets:
        # By default we assume that if your label column looks like a float then you're doing regression,
        # and if not then you're doing classification. This is something you may want to change!
        is_regression = datasets["train"].features["label"].dtype in [
            "float32", "float64"
        ]
        if is_regression:
            num_labels = 1
        else:
            # A useful fast method:
            # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique
            label_list = datasets["train"].unique("label")
            label_list.sort()  # Let's sort it for determinism
            num_labels = len(label_list)
    # If you haven't passed a training set, we read label info from the saved model (this happens later)
    else:
        num_labels = None
        label_list = None
        is_regression = None
    # endregion

    # region Load model config and tokenizer
    if checkpoint is not None:
        config_path = training_args.output_dir
    elif model_args.config_name:
        config_path = model_args.config_name
    else:
        config_path = model_args.model_name_or_path
    if num_labels is not None:
        config = AutoConfig.from_pretrained(
            config_path,
            num_labels=num_labels,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        config = AutoConfig.from_pretrained(
            config_path,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    # endregion

    # region Dataset preprocessing
    # Again, we try to have some nice defaults but don't hesitate to tweak to your use case.
    column_names = {
        col
        for cols in datasets.column_names.values() for col in cols
    }
    non_label_column_names = [name for name in column_names if name != "label"]
    if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names:
        sentence1_key, sentence2_key = "sentence1", "sentence2"
    elif "sentence1" in non_label_column_names:
        sentence1_key, sentence2_key = "sentence1", None
    else:
        if len(non_label_column_names) >= 2:
            sentence1_key, sentence2_key = non_label_column_names[:2]
        else:
            sentence1_key, sentence2_key = non_label_column_names[0], None

    if data_args.max_seq_length > tokenizer.model_max_length:
        logger.warning(
            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
        )
    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    # Ensure that our labels match the model's, if it has some pre-specified
    if "train" in datasets:
        if not is_regression and config.label2id != PretrainedConfig(
                num_labels=num_labels).label2id:
            label_name_to_id = config.label2id
            if sorted(label_name_to_id.keys()) == sorted(label_list):
                label_to_id = label_name_to_id  # Use the model's labels
            else:
                logger.warning(
                    "Your model seems to have been trained with labels, but they don't match the dataset: "
                    f"model labels: {sorted(label_name_to_id.keys())}, dataset labels: {sorted(label_list)}."
                    "\nIgnoring the model labels as a result."
                )
                label_to_id = {v: i for i, v in enumerate(label_list)}
        elif not is_regression:
            label_to_id = {v: i for i, v in enumerate(label_list)}
        else:
            label_to_id = None
        # Now we've established our label2id, let's overwrite the model config with it.
        config.label2id = label_to_id
        if config.label2id is not None:
            config.id2label = {id: label for label, id in label_to_id.items()}
        else:
            config.id2label = None
    else:
        label_to_id = config.label2id  # Just load the data from the model

    if "validation" in datasets and config.label2id is not None:
        validation_label_list = datasets["validation"].unique("label")
        for val_label in validation_label_list:
            assert val_label in label_to_id, f"Label {val_label} is in the validation set but not the training set!"

    def preprocess_function(examples):
        # Tokenize the texts
        args = ((examples[sentence1_key], ) if sentence2_key is None else
                (examples[sentence1_key], examples[sentence2_key]))
        result = tokenizer(*args, max_length=max_seq_length, truncation=True)

        # Map labels to IDs
        if config.label2id is not None and "label" in examples:
            result["label"] = [(config.label2id[l] if l != -1 else -1)
                               for l in examples["label"]]
        return result

    datasets = datasets.map(preprocess_function,
                            batched=True,
                            load_from_cache_file=not data_args.overwrite_cache)
    # endregion

    with training_args.strategy.scope():
        # region Load pretrained model
        # Set seed before initializing model
        set_seed(training_args.seed)
        #
        # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
        # download model & vocab.
        if checkpoint is None:
            model_path = model_args.model_name_or_path
        else:
            model_path = checkpoint
        model = TFAutoModelForSequenceClassification.from_pretrained(
            model_path,
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
        # endregion

        # region Optimizer, loss and compilation
        optimizer = tf.keras.optimizers.Adam(
            learning_rate=training_args.learning_rate,
            beta_1=training_args.adam_beta1,
            beta_2=training_args.adam_beta2,
            epsilon=training_args.adam_epsilon,
            clipnorm=training_args.max_grad_norm,
        )
        if is_regression:
            loss_fn = tf.keras.losses.MeanSquaredError()
            metrics = []
        else:
            loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
                from_logits=True)
            metrics = ["accuracy"]
        model.compile(optimizer=optimizer, loss=loss_fn, metrics=metrics)
        # endregion

        # region Convert data to a tf.data.Dataset

        tf_data = dict()
        max_samples = {
            "train": data_args.max_train_samples,
            "validation": data_args.max_val_samples,
            "test": data_args.max_test_samples,
        }
        for key in ("train", "validation", "test"):
            if key not in datasets:
                tf_data[key] = None
                continue
            if key in ("train", "validation"):
                assert "label" in datasets[
                    key].features, f"Missing labels from {key} data!"
            if key == "train":
                shuffle = True
                batch_size = training_args.per_device_train_batch_size
                drop_remainder = True  # Saves us worrying about scaling gradients for the last batch
            else:
                shuffle = False
                batch_size = training_args.per_device_eval_batch_size
                drop_remainder = False
            samples_limit = max_samples[key]
            dataset = datasets[key]
            if samples_limit is not None:
                dataset = dataset.select(range(samples_limit))
            if isinstance(
                    training_args.strategy,
                    tf.distribute.TPUStrategy) or data_args.pad_to_max_length:
                logger.info(
                    "Padding all batches to max length because argument was set or we're on TPU."
                )
                dataset_mode = "constant_batch"
            else:
                dataset_mode = "variable_batch"
            data = convert_dataset_for_tensorflow(
                dataset,
                non_label_column_names,
                batch_size=batch_size,
                dataset_mode=dataset_mode,
                drop_remainder=drop_remainder,
                shuffle=shuffle,
            )
            tf_data[key] = data
        # endregion

        # region Training and validation
        if tf_data["train"] is not None:
            callbacks = [
                SavePretrainedCallback(output_dir=training_args.output_dir)
            ]
            model.fit(
                tf_data["train"],
                validation_data=tf_data["validation"],
                epochs=int(training_args.num_train_epochs),
                callbacks=callbacks,
            )
        elif tf_data["validation"] is not None:
            # If there's a validation dataset but no training set, just evaluate the metrics
            logger.info("Computing metrics on validation data...")
            if is_regression:
                loss = model.evaluate(tf_data["validation"])
                logger.info(f"Loss: {loss:.5f}")
            else:
                loss, accuracy = model.evaluate(tf_data["validation"])
                logger.info(
                    f"Loss: {loss:.5f}, Accuracy: {accuracy * 100:.4f}%")
        # endregion

        # region Prediction
        if tf_data["test"] is not None:
            logger.info("Doing predictions on test dataset...")
            predictions = model.predict(tf_data["test"])["logits"]
            predicted_class = (np.squeeze(predictions) if is_regression
                               else np.argmax(predictions, axis=1))
            output_test_file = os.path.join(training_args.output_dir,
                                            "test_results.txt")
            with open(output_test_file, "w") as writer:
                writer.write("index\tprediction\n")
                for index, item in enumerate(predicted_class):
                    if is_regression:
                        writer.write(f"{index}\t{item:3.3f}\n")
                    else:
                        item = config.id2label[item]
                        writer.write(f"{index}\t{item}\n")
            logger.info(f"Wrote predictions to {output_test_file}!")
        # endregion

    # region Prediction losses
    # This section is outside the scope() because it's very quick to compute, but behaves badly inside it
    if "test" in datasets and "label" in datasets["test"].features:
        print("Computing prediction loss on test labels...")
        labels = datasets["test"]["label"]
        loss = float(loss_fn(labels, predictions).numpy())
        print(f"Test loss: {loss:.4f}")
Example #3
    def train(self):
        import tensorflow_addons as tfa
        import tensorflow as tf

        self.scheduler = None
        # Encoding train_texts and val_texts

        # x-digit code -> unique number
        self.classes = utils.Bidict(
            {c: i
             for i, c in enumerate(self.df_train['cat1'].unique())})

        # Convert datasets to transformer format
        train_dataset = self._create_transformer_dataset(self.df_train)

        val_data = train_dataset
        if settings.VALID_PER_CLASS:
            val_data = self._create_transformer_dataset(self.df_valid)
        val_data = val_data.batch(settings.MINIBATCH)

        # Fine-tune the pre-trained model
        if self.train_with_tensorflow_directly:
            from transformers import TFAutoModelForSequenceClassification
            model = TFAutoModelForSequenceClassification.from_pretrained(
                self.pretrained_model_name,
                num_labels=len(self.classes),
                from_pt=True)

            # In AdamW, the weight decay value had no effect in our tests.
            # Hard to find a stable tuning; harder to beat vanilla Adam.
            # optimizer = AdamW(
            #     learning_rate=0.00005,
            #     # 0.1 (default but might be too large), 0.01
            #     weight_decay=0.002
            # )
            if 1:
                learning_rate = settings.LEARNING_RATE

                if learning_rate is None:
                    if settings.FULL_TEXT:
                        # 5e-5 can fail on L1 e.g. Seed 7
                        # 1e-5 looks OK (TBC) on L1
                        # but not on L3 (no convergence)
                        # 5e-6 very slow
                        # Candidates: 5E-06 (40%), 7E-06 (49%)
                        # FOR L3
                        # learning_rate = 7E-06
                        # FOR L1
                        # learning_rate = 5e-6
                        # legal-small
                        ## learning_rate = 7e-6
                        # legal-base
                        learning_rate = 1e-5
                    else:
                        # learning_rate = 5e-5
                        learning_rate = 3e-5

                print(f'\n LR = {learning_rate:0.0E}')
                from transformers import AdamWeightDecay
                optimizer = AdamWeightDecay(
                    learning_rate=learning_rate,
                    # 0.1 (default but might be too large), 0.01
                    # weight_decay_rate=0.0
                )
            else:
                if settings.FULL_TEXT:
                    learning_rate = 1e-3
                else:
                    learning_rate = 7e-3

                print(f'\n LR = {learning_rate:0.0E}')
                optimizer = tfa.optimizers.SGDW(
                    learning_rate=learning_rate,
                    momentum=0.0,
                    weight_decay=0.00
                    # 0.1 (default but might be too large), 0.01
                    # weight_decay_rate=0.5
                )

            # optimizer = tf.keras.optimizers.MomentumOptimizer(
            #     learning_rate=self.learning_rate
            # )
            model.compile(optimizer=optimizer,
                          loss=model.compute_loss,
                          metrics=['accuracy'])

            callbacks = None

            self.scheduler = None
            if self.scheduler_class:
                self.scheduler = self.scheduler_class()
                callbacks = [self.scheduler]

                self.scheduler.pretrain(model, train_dataset)

            class_weight = None
            if settings.CLASS_WEIGHT:
                class_sizes = self.df_train.cat1.value_counts()
                class_sizes_max = class_sizes.max()
                class_weight = {
                    i: class_sizes_max / class_sizes[c]
                    for c, i in self.classes.items()
                }

            # Note: the dataset is already batched above; Keras rejects a
            # separate `batch_size` argument when the input is a tf.data.Dataset.
            self.history = model.fit(
                train_dataset.batch(settings.MINIBATCH),
                epochs=self.max_epochs,
                validation_data=val_data,
                callbacks=callbacks,
                class_weight=class_weight,
            )
            self.model = model

            self.render_training()
        else:
            # TODO: fix this. Almost immediate but random results
            #  compared to TF method above.
            from transformers import (TFTrainer, TFTrainingArguments,
                                      TFAutoModelForSequenceClassification)
            # from transformers import TFAutoModel

            training_args = TFTrainingArguments(
                output_dir=self.results_path,  # output directory
                num_train_epochs=self.max_epochs,  # total number of training epochs
                per_device_train_batch_size=settings.MINIBATCH,
                # batch size per device during training
                per_device_eval_batch_size=64,  # batch size for evaluation
                warmup_steps=500,
                # number of warmup steps for learning rate scheduler
                weight_decay=0.01,  # strength of weight decay
                logging_dir=self.logs_path,  # directory for storing logs
            )

            print('h2')

            with training_args.strategy.scope():
                # trainer_model = TFSequenceClassification.from_pretrained(
                #     Transformers.pretrained_model, num_labels=len(self.classes)
                # )
                trainer_model = TFAutoModelForSequenceClassification.from_pretrained(
                    Transformers.pretrained_model_name,
                    num_labels=len(self.classes))

            print('h3')

            trainer = TFTrainer(
                model=trainer_model,
                args=training_args,
                train_dataset=train_dataset,
                eval_dataset=train_dataset,
            )

            print('h4')

            trainer.train()

            # print(trainer.evaluate())

            self.model = trainer_model
Example #4
from transformers import (TFDistilBertForSequenceClassification,
                          TFAutoModelForSequenceClassification,
                          DistilBertTokenizerFast)

# # Configurations

# In[2]:

model_name = 'distilbert-base-uncased'
max_sequence_length = 256

# In[3]:

# model = TFDistilBertForSequenceClassification.from_pretrained(model_name)
model = TFAutoModelForSequenceClassification.from_pretrained(model_name)

tokenizer = DistilBertTokenizerFast.from_pretrained(
    model_name,
    truncation=True,
    padding=True,
    max_length=max_sequence_length,
    return_tensors="tf")

# # Tokenize
#
# Note that you need to convert the result of the `tokenizer`, which is a
# `transformers.tokenization_utils_base.BatchEncoding` instance, into a
# dictionary before feeding it into the model.
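# A hedged sketch of that conversion (`sample_texts` is a hypothetical stand-in,
# not part of the original notebook):

sample_texts = ["an example sentence", "another example"]
encoded = tokenizer(sample_texts,
                    truncation=True,
                    padding=True,
                    max_length=max_sequence_length,
                    return_tensors="tf")
outputs = model(dict(encoded))  # BatchEncoding -> plain dict before the model call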

# In[4]:

tokenized = tokenizer(
Example #5
    def copy_model_files(self, force=False):
        modified = False

        src_path = self.checkpoint_path

        d = None
        try:
            if force or not (self.git_path / "tf_model.h5").exists() or not (
                    self.git_path / "pytorch_model.bin").exists():
                d = TemporaryDirectory()
                if self.task in self.QA_TASKS:
                    model = QASparseXP.compile_model(src_path,
                                                     dest_path=d.name)
                elif self.task in self.GLUE_TASKS:
                    model = GlueSparseXP.compile_model(src_path,
                                                       dest_path=d.name)
                elif self.task in self.SUMMARIZATION_TASKS:
                    model = SummarizationSparseXP.compile_model(
                        src_path, dest_path=d.name)
                else:
                    raise Exception(f"Unknown task {self.task}")

                model = optimize_model(model, "heads")
                model.save_pretrained(d.name)
                src_path = d.name
            if force or not (self.git_path / "tf_model.h5").exists():
                with TemporaryDirectory() as d2:
                    if self.task in self.QA_TASKS:
                        QASparseXP.final_fine_tune_bertarize(
                            src_path, d2, remove_head_pruning=True)
                        tf_model = TFAutoModelForQuestionAnswering.from_pretrained(
                            d2, from_pt=True)
                    elif self.task in self.GLUE_TASKS:
                        GlueSparseXP.final_fine_tune_bertarize(
                            src_path, d2, remove_head_pruning=True)
                        tf_model = TFAutoModelForSequenceClassification.from_pretrained(
                            d2, from_pt=True)
                    elif self.task in self.SUMMARIZATION_TASKS:
                        SummarizationSparseXP.final_fine_tune_bertarize(
                            src_path, d2, remove_head_pruning=True)
                        tf_model = TFAutoModelForSeq2SeqLM.from_pretrained(
                            d2, from_pt=True)
                    else:
                        raise Exception(f"Unknown task {self.task}")

                    tf_model.save_pretrained(self.git_path)
                    modified = True

            if force or not (self.git_path / "pytorch_model.bin").exists():
                if self.task in self.QA_TASKS:
                    model = AutoModelForQuestionAnswering.from_pretrained(
                        src_path)
                elif self.task in self.GLUE_TASKS:
                    model = AutoModelForSequenceClassification.from_pretrained(
                        src_path)
                elif self.task in self.SUMMARIZATION_TASKS:
                    model = AutoModelForSeq2SeqLM.from_pretrained(src_path)
                else:
                    raise Exception(f"Unknown task {self.task}")
                model.save_pretrained(self.git_path)
                modified = True

            src_path = Path(src_path)
            to_copy = self.get_copy_list()

            for files, dest in to_copy:
                dest.mkdir(exist_ok=True)
                for file in files:
                    if force or not (dest / file).exists():
                        shutil.copyfile(str(src_path / file), str(dest / file))
                        modified = True
        finally:
            if d is not None:
                d.cleanup()

        # Reload the config, this may have been changed by compilation / optimization (pruned_heads, gelu_patch, layer_norm_patch)
        with (self.git_path / "config.json").open() as f:
            self.checkpoint_info["config"] = json.load(f)

        return modified
Example #6

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(
    range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(
    range(1000))
full_train_dataset = tokenized_datasets["train"]
full_eval_dataset = tokenized_datasets["test"]

import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification
from datetime import datetime

model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased",
                                                             num_labels=2)

tf_train_dataset = small_train_dataset.remove_columns(
    ["text"]).with_format("tensorflow")
tf_eval_dataset = small_eval_dataset.remove_columns(
    ["text"]).with_format("tensorflow")

train_features = {
    x: tf_train_dataset[x].to_tensor()
    for x in tokenizer.model_input_names
}
train_tf_dataset = tf.data.Dataset.from_tensor_slices(
    (train_features, tf_train_dataset["label"]))
train_tf_dataset = train_tf_dataset.shuffle(len(tf_train_dataset)).batch(8)

# Completed by mirroring the train_features block above, which the snippet
# was truncated in the middle of:
eval_features = {
    x: tf_eval_dataset[x].to_tensor()
    for x in tokenizer.model_input_names
}
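# A hedged continuation, mirroring the Hugging Face quickstart this snippet is
# based on: build the eval tf.data.Dataset, then compile and train the model.
eval_tf_dataset = tf.data.Dataset.from_tensor_slices(
    (eval_features, tf_eval_dataset["label"]))
eval_tf_dataset = eval_tf_dataset.batch(8)

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)
model.fit(train_tf_dataset, validation_data=eval_tf_dataset, epochs=3)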
Example #7
app = Flask(__name__)
CORS(app)
load_dotenv()
""" Load all the models and such """

default_sentiment = pipeline("sentiment-analysis")
bart_summarization = pipeline("summarization")
t5_summarization = pipeline("summarization",
                            model="t5-small",
                            tokenizer="t5-small",
                            framework="tf")

distilbert_tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased',
                                                     use_fast=True)
distilbert_regression_model = TFAutoModelForSequenceClassification.from_pretrained(
    "models/distilbert/regression")

# make a directory to download lightfm models to
if not os.path.exists('lightfm'):
    os.makedirs('lightfm')

# get lightFM model and dataset objects
lightFM_model = get_lightfm_model()
lightFM_dataset = get_lightfm_dataset()
""" End of loading models and such """
""" Define prediction and helper function """


class Summarization:
    def __init__(self, model):
        self.model = model
Example #8
    def __init__(self):
        self.config = AutoConfig.from_pretrained(CZERT_MODEL_PATH, num_labels=1)
        self.tokenizer = BertTokenizerFast(os.path.join(CZERT_MODEL_PATH, "vocab.txt"),
                                           strip_accents=False,
                                           do_lower_case=False)
        self.model = TFAutoModelForSequenceClassification.from_pretrained(
            CZERT_MODEL_PATH, config=self.config)
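# A hedged, self-contained sketch of the same pattern (the checkpoint path and
# input text are hypothetical; num_labels=1 matches the regression-style setup):
import os
from transformers import (AutoConfig, BertTokenizerFast,
                          TFAutoModelForSequenceClassification)

CZERT_MODEL_PATH = "path/to/czert"  # hypothetical checkpoint location
config = AutoConfig.from_pretrained(CZERT_MODEL_PATH, num_labels=1)
tokenizer = BertTokenizerFast(os.path.join(CZERT_MODEL_PATH, "vocab.txt"),
                              strip_accents=False, do_lower_case=False)
model = TFAutoModelForSequenceClassification.from_pretrained(CZERT_MODEL_PATH,
                                                             config=config)

enc = tokenizer("Ukázkový vstupní text.", return_tensors="tf", truncation=True)
score = model(dict(enc)).logits[0, 0]  # num_labels=1 yields one score per input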
Example #9
    def __call__(self, text_inputs):
        raw_outputs = self.model(text_inputs)
        outputs = []
        for output in raw_outputs:
            score = output["score"]
            if output["label"] == "POSITIVE":
                outputs.append([1 - score, score])
            else:
                outputs.append([score, 1 - score])
        return np.array(outputs)


# Create the model: a French sentiment analysis model.
# see https://github.com/TheophileBlard/french-sentiment-analysis-with-bert
model = TFAutoModelForSequenceClassification.from_pretrained(
    "tblard/tf-allocine")
tokenizer = AutoTokenizer.from_pretrained("tblard/tf-allocine")
pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

model_wrapper = HuggingFaceSentimentAnalysisPipelineWrapper(pipeline)

# Create the recipe: PWWS uses a WordNet transformation.
recipe = PWWSRen2019.build(model_wrapper)
# WordNet defaults to English. Set the default language to French ('fra').
#
# See
# "Building a free French wordnet from multilingual resources",
# E. L. R. A. (ELRA) (ed.),
# Proceedings of the Sixth International Language Resources and Evaluation (LREC’08).

recipe.transformation.language = "fra"
Example #10
    def _load_local_model(self, path):
        self._model = TFAutoModelForSequenceClassification.from_pretrained(path)
        self._tokenizer = AutoTokenizer.from_pretrained(path)
Example #11
def main():
    # region Argument parsing
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TFTrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
    # information sent is the one passed as arguments along with your Python/PyTorch versions.
    send_example_telemetry("run_text_classification",
                           model_args,
                           data_args,
                           framework="tensorflow")

    output_dir = Path(training_args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    # endregion

    # region Checkpoints
    # Detecting last checkpoint.
    checkpoint = None
    if (len(os.listdir(training_args.output_dir)) > 0
            and not training_args.overwrite_output_dir):
        if (output_dir / CONFIG_NAME).is_file() and (
                output_dir / TF2_WEIGHTS_NAME).is_file():
            checkpoint = output_dir
            logger.info(
                f"Checkpoint detected, resuming training from checkpoint in {training_args.output_dir}. To avoid this"
                " behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )
        else:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to continue regardless.")

    # endregion

    # region Logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO)

    logger.info(f"Training/evaluation parameters {training_args}")
    # endregion

    # region Loading data
    # For CSV/JSON files, this script will use the 'label' field as the label and the 'sentence1' and optionally
    # 'sentence2' fields as inputs if they exist. If not, the first two fields not named label are used if at least two
    # columns are provided. Note that the term 'sentence' can be slightly misleading, as these fields often contain
    # more than a single grammatical sentence when the task requires it.
    #
    # If the CSVs/JSONs contain only one non-label column, the script does single-sentence classification on this
    # single column. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    data_files = {
        "train": data_args.train_file,
        "validation": data_args.validation_file,
        "test": data_args.test_file
    }
    data_files = {
        key: file
        for key, file in data_files.items() if file is not None
    }

    for key in data_files.keys():
        logger.info(f"Loading a local file for {key}: {data_files[key]}")

    if data_args.input_file_extension == "csv":
        # Loading a dataset from local csv files
        datasets = load_dataset(
            "csv",
            data_files=data_files,
            cache_dir=model_args.cache_dir,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        # Loading a dataset from local json files
        datasets = load_dataset("json",
                                data_files=data_files,
                                cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset at
    # https://huggingface.co/docs/datasets/loading_datasets.html.
    # endregion

    # region Label preprocessing
    # If you've passed us a training set, we try to infer your labels from it
    if "train" in datasets:
        # By default we assume that if your label column looks like a float then you're doing regression,
        # and if not then you're doing classification. This is something you may want to change!
        is_regression = datasets["train"].features["label"].dtype in [
            "float32", "float64"
        ]
        if is_regression:
            num_labels = 1
        else:
            # A useful fast method:
            # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique
            label_list = datasets["train"].unique("label")
            label_list.sort()  # Let's sort it for determinism
            num_labels = len(label_list)
    # If you haven't passed a training set, we read label info from the saved model (this happens later)
    else:
        num_labels = None
        label_list = None
        is_regression = None
    # endregion

    # region Load model config and tokenizer
    if checkpoint is not None:
        config_path = training_args.output_dir
    elif model_args.config_name:
        config_path = model_args.config_name
    else:
        config_path = model_args.model_name_or_path
    if num_labels is not None:
        config = AutoConfig.from_pretrained(
            config_path,
            num_labels=num_labels,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        config = AutoConfig.from_pretrained(
            config_path,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    # endregion

    # region Dataset preprocessing
    # Again, we try to have some nice defaults but don't hesitate to tweak to your use case.
    column_names = {
        col
        for cols in datasets.column_names.values() for col in cols
    }
    non_label_column_names = [name for name in column_names if name != "label"]
    if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names:
        sentence1_key, sentence2_key = "sentence1", "sentence2"
    elif "sentence1" in non_label_column_names:
        sentence1_key, sentence2_key = "sentence1", None
    else:
        if len(non_label_column_names) >= 2:
            sentence1_key, sentence2_key = non_label_column_names[:2]
        else:
            sentence1_key, sentence2_key = non_label_column_names[0], None

    if data_args.max_seq_length > tokenizer.model_max_length:
        logger.warning(
            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
        )
    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    # Ensure that our labels match the model's, if it has some pre-specified
    if "train" in datasets:
        if not is_regression and config.label2id != PretrainedConfig(
                num_labels=num_labels).label2id:
            label_name_to_id = config.label2id
            if sorted(label_name_to_id.keys()) == sorted(label_list):
                label_to_id = label_name_to_id  # Use the model's labels
            else:
                logger.warning(
                    "Your model seems to have been trained with labels, but they don't match the dataset: "
                    f"model labels: {sorted(label_name_to_id.keys())}, dataset labels:"
                    f" {sorted(label_list)}.\nIgnoring the model labels as a result."
                )
                label_to_id = {v: i for i, v in enumerate(label_list)}
        elif not is_regression:
            label_to_id = {v: i for i, v in enumerate(label_list)}
        else:
            label_to_id = None
        # Now we've established our label2id, let's overwrite the model config with it.
        config.label2id = label_to_id
        if config.label2id is not None:
            config.id2label = {id: label for label, id in label_to_id.items()}
        else:
            config.id2label = None
    else:
        label_to_id = config.label2id  # Just load the data from the model

    if "validation" in datasets and config.label2id is not None:
        validation_label_list = datasets["validation"].unique("label")
        for val_label in validation_label_list:
            assert val_label in label_to_id, f"Label {val_label} is in the validation set but not the training set!"

    def preprocess_function(examples):
        # Tokenize the texts
        args = ((examples[sentence1_key], ) if sentence2_key is None else
                (examples[sentence1_key], examples[sentence2_key]))
        result = tokenizer(*args, max_length=max_seq_length, truncation=True)

        # Map labels to IDs
        if config.label2id is not None and "label" in examples:
            result["label"] = [(config.label2id[l] if l != -1 else -1)
                               for l in examples["label"]]
        return result

    datasets = datasets.map(preprocess_function,
                            batched=True,
                            load_from_cache_file=not data_args.overwrite_cache)

    # endregion

    with training_args.strategy.scope():
        # region Load pretrained model
        # Set seed before initializing model
        set_seed(training_args.seed)
        #
        # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
        # download model & vocab.
        if checkpoint is None:
            model_path = model_args.model_name_or_path
        else:
            model_path = checkpoint
        model = TFAutoModelForSequenceClassification.from_pretrained(
            model_path,
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
        # endregion

        # region Convert data to a tf.data.Dataset
        dataset_options = tf.data.Options()
        dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
        num_replicas = training_args.strategy.num_replicas_in_sync

        tf_data = dict()
        max_samples = {
            "train": data_args.max_train_samples,
            "validation": data_args.max_val_samples,
            "test": data_args.max_test_samples,
        }
        for key in ("train", "validation", "test"):
            if key not in datasets:
                tf_data[key] = None
                continue
            if ((key == "train" and not training_args.do_train)
                    or (key == "validation" and not training_args.do_eval)
                    or (key == "test" and not training_args.do_predict)):
                tf_data[key] = None
                continue
            if key in ("train", "validation"):
                assert "label" in datasets[
                    key].features, f"Missing labels from {key} data!"
            if key == "train":
                shuffle = True
                batch_size = training_args.per_device_train_batch_size * num_replicas
            else:
                shuffle = False
                batch_size = training_args.per_device_eval_batch_size * num_replicas
            samples_limit = max_samples[key]
            dataset = datasets[key]
            if samples_limit is not None:
                dataset = dataset.select(range(samples_limit))

            # model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in
            # training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also
            # use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names
            # yourself if you use this method, whereas they are automatically inferred from the model input names when
            # using model.prepare_tf_dataset()
            # For more info see the docs:
            # https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset
            # https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset

            data = model.prepare_tf_dataset(
                dataset,
                shuffle=shuffle,
                batch_size=batch_size,
                tokenizer=tokenizer,
            )
            data = data.with_options(dataset_options)
            tf_data[key] = data
        # endregion

        # region Optimizer, loss and compilation

        if training_args.do_train:
            num_train_steps = len(tf_data["train"]) * training_args.num_train_epochs
            if training_args.warmup_steps > 0:
                num_warmup_steps = training_args.warmup_steps
            elif training_args.warmup_ratio > 0:
                num_warmup_steps = int(num_train_steps *
                                       training_args.warmup_ratio)
            else:
                num_warmup_steps = 0

            optimizer, schedule = create_optimizer(
                init_lr=training_args.learning_rate,
                num_train_steps=num_train_steps,
                num_warmup_steps=num_warmup_steps,
                adam_beta1=training_args.adam_beta1,
                adam_beta2=training_args.adam_beta2,
                adam_epsilon=training_args.adam_epsilon,
                weight_decay_rate=training_args.weight_decay,
                adam_global_clipnorm=training_args.max_grad_norm,
            )
        else:
            optimizer = None
        if is_regression:
            metrics = []
        else:
            metrics = ["accuracy"]
        model.compile(optimizer=optimizer, metrics=metrics)
        # endregion

        # region Preparing push_to_hub and model card
        push_to_hub_model_id = training_args.push_to_hub_model_id
        model_name = model_args.model_name_or_path.split("/")[-1]
        if not push_to_hub_model_id:
            push_to_hub_model_id = f"{model_name}-finetuned-text-classification"

        model_card_kwargs = {
            "finetuned_from": model_args.model_name_or_path,
            "tasks": "text-classification"
        }

        if training_args.push_to_hub:
            callbacks = [
                PushToHubCallback(
                    output_dir=training_args.output_dir,
                    model_id=push_to_hub_model_id,
                    organization=training_args.push_to_hub_organization,
                    token=training_args.push_to_hub_token,
                    tokenizer=tokenizer,
                    **model_card_kwargs,
                )
            ]
        else:
            callbacks = []
        # endregion

        # region Training and validation
        if tf_data["train"] is not None:
            model.fit(
                tf_data["train"],
                validation_data=tf_data["validation"],
                epochs=int(training_args.num_train_epochs),
                callbacks=callbacks,
            )
        if tf_data["validation"] is not None:
            logger.info("Computing metrics on validation data...")
            if is_regression:
                loss = model.evaluate(tf_data["validation"])
                logger.info(f"Eval loss: {loss:.5f}")
            else:
                loss, accuracy = model.evaluate(tf_data["validation"])
                logger.info(
                    f"Eval loss: {loss:.5f}, Eval accuracy: {accuracy * 100:.4f}%"
                )
            if training_args.output_dir is not None:
                output_eval_file = os.path.join(training_args.output_dir,
                                                "all_results.json")
                eval_dict = {"eval_loss": loss}
                if not is_regression:
                    eval_dict["eval_accuracy"] = accuracy
                with open(output_eval_file, "w") as writer:
                    writer.write(json.dumps(eval_dict))
        # endregion

        # region Prediction
        if tf_data["test"] is not None:
            logger.info("Doing predictions on test dataset...")
            predictions = model.predict(tf_data["test"])["logits"]
            predicted_class = (np.squeeze(predictions) if is_regression
                               else np.argmax(predictions, axis=1))
            output_test_file = os.path.join(training_args.output_dir,
                                            "test_results.txt")
            with open(output_test_file, "w") as writer:
                writer.write("index\tprediction\n")
                for index, item in enumerate(predicted_class):
                    if is_regression:
                        writer.write(f"{index}\t{item:3.3f}\n")
                    else:
                        item = config.id2label[item]
                        writer.write(f"{index}\t{item}\n")
            logger.info(f"Wrote predictions to {output_test_file}!")
        # endregion

        if training_args.output_dir is not None and not training_args.push_to_hub:
            # If we're not pushing to hub, at least save a local copy when we're done
            model.save_pretrained(training_args.output_dir)
Example #12
def main():
    # region Argument parsing
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TFTrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
    # information sent is the one passed as arguments along with your Python/PyTorch versions.
    send_example_telemetry("run_glue",
                           model_args,
                           data_args,
                           framework="tensorflow")

    if not (training_args.do_train or training_args.do_eval
            or training_args.do_predict):
        exit(
            "Must specify at least one of --do_train, --do_eval or --do_predict!"
        )
    # endregion

    # region Checkpoints
    checkpoint = None
    if (os.path.isdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        checkpoint = get_last_checkpoint(training_args.output_dir)
        if checkpoint is None and len(os.listdir(
                training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )
    # endregion

    # region Logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank)
                    else logging.WARN)

    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info(f"Training/evaluation parameters {training_args}")
    # endregion

    # region Dataset and labels
    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Downloading and loading a dataset from the hub. In distributed training, the load_dataset function guarantees
    # that only one local process can concurrently download the dataset.
    datasets = load_dataset(
        "glue",
        data_args.task_name,
        cache_dir=model_args.cache_dir,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    # See more about loading any type of standard or custom dataset at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    is_regression = data_args.task_name == "stsb"
    if not is_regression:
        label_list = datasets["train"].features["label"].names
        num_labels = len(label_list)
    else:
        num_labels = 1

    if data_args.predict_file is not None:
        logger.info("Preparing user-supplied file for predictions...")

        data_files = {"data": data_args.predict_file}

        for key in data_files.keys():
            logger.info(f"Loading a local file for {key}: {data_files[key]}")

        if data_args.predict_file.endswith(".csv"):
            # Loading a dataset from local csv files
            user_dataset = load_dataset("csv",
                                        data_files=data_files,
                                        cache_dir=model_args.cache_dir)
        else:
            # Loading a dataset from local json files
            user_dataset = load_dataset("json",
                                        data_files=data_files,
                                        cache_dir=model_args.cache_dir)
        needed_keys = task_to_keys[data_args.task_name]
        for key in needed_keys:
            assert key in user_dataset["data"].features, (
                f"Your supplied predict_file is missing the {key} key!")
        datasets["user_data"] = user_dataset["data"]
    # endregion

    # region Load model config and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    # endregion

    # region Dataset preprocessing
    sentence1_key, sentence2_key = task_to_keys[data_args.task_name]

    # Padding strategy
    if data_args.pad_to_max_length:
        padding = "max_length"
    else:
        # We will pad later, dynamically at batch creation, to the max sequence length in each batch
        padding = False

    # Some models have set the order of the labels to use, so let's make sure we do use it.
    label_to_id = None
    if config.label2id != PretrainedConfig(
            num_labels=num_labels).label2id and not is_regression:
        # Some have all caps in their config, some don't.
        label_name_to_id = {k.lower(): v for k, v in config.label2id.items()}
        if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)):
            label_to_id = {
                i: int(label_name_to_id[label_list[i]])
                for i in range(num_labels)
            }
        else:
            logger.warning(
                "Your model seems to have been trained with labels, but they don't match the dataset: "
                f"model labels: {sorted(label_name_to_id.keys())}, dataset labels: {sorted(label_list)}."
                "\nIgnoring the model labels as a result."
            )
            label_to_id = {label: i for i, label in enumerate(label_list)}
    if label_to_id is not None:
        config.label2id = label_to_id
        config.id2label = {id: label for label, id in config.label2id.items()}
    elif data_args.task_name is not None and not is_regression:
        config.label2id = {l: i for i, l in enumerate(label_list)}
        config.id2label = {id: label for label, id in config.label2id.items()}

    if data_args.max_seq_length > tokenizer.model_max_length:
        logger.warning(
            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
        )
    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    def preprocess_function(examples):
        # Tokenize the texts
        args = ((examples[sentence1_key], ) if sentence2_key is None else
                (examples[sentence1_key], examples[sentence2_key]))
        result = tokenizer(*args,
                           padding=padding,
                           max_length=max_seq_length,
                           truncation=True)

        return result

    datasets = datasets.map(preprocess_function,
                            batched=True,
                            load_from_cache_file=not data_args.overwrite_cache)

    if data_args.pad_to_max_length:
        data_collator = DefaultDataCollator(return_tensors="tf")
    else:
        data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf")
    # endregion

    # region Metric function
    metric = evaluate.load("glue", data_args.task_name)

    def compute_metrics(preds, label_ids):
        preds = preds["logits"]
        preds = np.squeeze(preds) if is_regression else np.argmax(preds,
                                                                  axis=1)
        result = metric.compute(predictions=preds, references=label_ids)
        if len(result) > 1:
            result["combined_score"] = np.mean(list(result.values())).item()
        return result

    # endregion

    with training_args.strategy.scope():
        # region Load pretrained model
        if checkpoint is None:
            model_path = model_args.model_name_or_path
        else:
            model_path = checkpoint
        model = TFAutoModelForSequenceClassification.from_pretrained(
            model_path,
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
        # endregion

        # region Convert data to a tf.data.Dataset
        dataset_options = tf.data.Options()
        dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
        num_replicas = training_args.strategy.num_replicas_in_sync

        tf_data = dict()
        max_samples = {
            "train": data_args.max_train_samples,
            "validation": data_args.max_eval_samples,
            "validation_matched": data_args.max_eval_samples,
            "validation_mismatched": data_args.max_eval_samples,
            "test": data_args.max_predict_samples,
            "test_matched": data_args.max_predict_samples,
            "test_mismatched": data_args.max_predict_samples,
            "user_data": None,
        }
        for key in datasets.keys():
            if key == "train" or key.startswith("validation"):
                assert "label" in datasets[
                    key].features, f"Missing labels from {key} data!"
            if key == "train":
                shuffle = True
                batch_size = training_args.per_device_train_batch_size * num_replicas
            else:
                shuffle = False
                batch_size = training_args.per_device_eval_batch_size * num_replicas
            samples_limit = max_samples[key]
            dataset = datasets[key]
            if samples_limit is not None:
                dataset = dataset.select(range(samples_limit))

            # model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in
            # training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also
            # use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names
            # yourself if you use this method, whereas they are automatically inferred from the model input names when
            # using model.prepare_tf_dataset()
            # For more info see the docs:
            # https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset
            # https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset
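            # For comparison, a roughly equivalent lower-level call (a sketch only; the
            # column and label names here are assumptions) would be:
            # data = dataset.to_tf_dataset(
            #     columns=tokenizer.model_input_names,
            #     label_cols=["label"],
            #     shuffle=shuffle,
            #     batch_size=batch_size,
            #     collate_fn=data_collator,
            # )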
            data = model.prepare_tf_dataset(
                dataset,
                shuffle=shuffle,
                batch_size=batch_size,
                collate_fn=data_collator,
                tokenizer=tokenizer,
            )
            data = data.with_options(dataset_options)
            tf_data[key] = data
        # endregion

        # region Optimizer, loss and compilation
        if training_args.do_train:
            num_train_steps = len(
                tf_data["train"]) * training_args.num_train_epochs
            if training_args.warmup_steps > 0:
                num_warmup_steps = training_args.warmup_steps
            elif training_args.warmup_ratio > 0:
                num_warmup_steps = int(num_train_steps *
                                       training_args.warmup_ratio)
            else:
                num_warmup_steps = 0

            optimizer, schedule = create_optimizer(
                init_lr=training_args.learning_rate,
                num_train_steps=num_train_steps,
                num_warmup_steps=num_warmup_steps,
                adam_beta1=training_args.adam_beta1,
                adam_beta2=training_args.adam_beta2,
                adam_epsilon=training_args.adam_epsilon,
                weight_decay_rate=training_args.weight_decay,
                adam_global_clipnorm=training_args.max_grad_norm,
            )
        else:
            optimizer = "adam"  # Just write anything because we won't be using it
        if is_regression:
            metrics = []
        else:
            metrics = ["accuracy"]
        model.compile(optimizer=optimizer,
                      metrics=metrics,
                      jit_compile=training_args.xla)
        # endregion

        # region Preparing push_to_hub and model card
        push_to_hub_model_id = training_args.push_to_hub_model_id
        model_name = model_args.model_name_or_path.split("/")[-1]
        if not push_to_hub_model_id:
            push_to_hub_model_id = f"{model_name}-finetuned-glue"

        model_card_kwargs = {
            "finetuned_from": model_args.model_name_or_path,
            "tasks": "text-classification"
        }
        model_card_kwargs["task_name"] = data_args.task_name

        if training_args.push_to_hub:
            callbacks = [
                PushToHubCallback(
                    output_dir=training_args.output_dir,
                    model_id=push_to_hub_model_id,
                    organization=training_args.push_to_hub_organization,
                    token=training_args.push_to_hub_token,
                    tokenizer=tokenizer,
                    **model_card_kwargs,
                )
            ]
        else:
            callbacks = []
        # endregion

        # region Training and validation
        if training_args.do_train:
            if training_args.do_eval and data_args.task_name != "mnli":
                # Do both evaluation and training in the Keras fit loop, unless the task is MNLI
                # because MNLI has two validation sets
                validation_data = tf_data["validation"]
            else:
                validation_data = None
            model.fit(
                tf_data["train"],
                validation_data=validation_data,
                epochs=int(training_args.num_train_epochs),
                callbacks=callbacks,
            )
        # endregion

        # region Evaluation
        if training_args.do_eval:
            # We normally do validation as part of the Keras fit loop, but we run it independently
            # if there was no fit() step (because we didn't train the model) or if the task is MNLI,
            # because MNLI has a separate validation-mismatched validation set

            # In this example, we compute advanced metrics only at the end of training, and only compute
            # loss and accuracy on the validation set each epoch, but
            # if you'd like to compute metrics every epoch that are too complex to be written as
            # standard Keras metrics, you can use our KerasMetricCallback. See
            # https://huggingface.co/docs/transformers/main/en/main_classes/keras_callbacks
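            # A minimal sketch of that approach (unverified wiring; the callback passes
            # its metric_fn a (predictions, labels) tuple, so adapt as needed):
            # from transformers.keras_callbacks import KerasMetricCallback
            # metric_callback = KerasMetricCallback(
            #     metric_fn=lambda eval_preds: compute_metrics(eval_preds[0], eval_preds[1]),
            #     eval_dataset=tf_data["validation"],
            # )
            # and then include metric_callback in the callbacks list passed to model.fit().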
            logger.info("*** Evaluate ***")

            # Loop to handle MNLI double evaluation (matched, mis-matched)
            if data_args.task_name == "mnli":
                tasks = ["mnli", "mnli-mm"]
                tf_datasets = [
                    tf_data["validation_matched"],
                    tf_data["validation_mismatched"]
                ]
                raw_datasets = [
                    datasets["validation_matched"],
                    datasets["validation_mismatched"]
                ]
            else:
                tasks = [data_args.task_name]
                tf_datasets = [tf_data["validation"]]
                raw_datasets = [datasets["validation"]]

            for raw_dataset, tf_dataset, task in zip(raw_datasets, tf_datasets,
                                                     tasks):
                eval_predictions = model.predict(tf_dataset)
                eval_metrics = compute_metrics(eval_predictions,
                                               raw_dataset["label"])
                print(f"Evaluation metrics ({task}):")
                print(eval_metrics)
                if training_args.output_dir is not None:
                    output_eval_file = os.path.join(training_args.output_dir,
                                                    "all_results.json")
                    with open(output_eval_file, "w") as writer:
                        writer.write(json.dumps(eval_metrics))

        # endregion

        # region Prediction
        if training_args.do_predict or data_args.predict_file:
            logger.info("*** Predict ***")

            # Loop to handle MNLI double evaluation (matched, mis-matched)
            tasks = []
            tf_datasets = []
            raw_datasets = []
            if training_args.do_predict:
                if data_args.task_name == "mnli":
                    tasks.extend(["mnli", "mnli-mm"])
                    tf_datasets.extend(
                        [tf_data["test_matched"], tf_data["test_mismatched"]])
                    raw_datasets.extend([
                        datasets["test_matched"], datasets["test_mismatched"]
                    ])
                else:
                    tasks.append(data_args.task_name)
                    tf_datasets.append(tf_data["test"])
                    raw_datasets.append(datasets["test"])
            if data_args.predict_file:
                tasks.append("user_data")
                tf_datasets.append(tf_data["user_data"])
                raw_datasets.append(datasets["user_data"])

            for raw_dataset, tf_dataset, task in zip(raw_datasets, tf_datasets,
                                                     tasks):
                test_predictions = model.predict(tf_dataset)
                if "label" in raw_dataset:
                    test_metrics = compute_metrics(test_predictions,
                                                   raw_dataset["label"])
                    print(f"Test metrics ({task}):")
                    print(test_metrics)

                if is_regression:
                    predictions_to_write = np.squeeze(
                        test_predictions["logits"])
                else:
                    predictions_to_write = np.argmax(
                        test_predictions["logits"], axis=1)

                output_predict_file = os.path.join(
                    training_args.output_dir, f"predict_results_{task}.txt")
                with open(output_predict_file, "w") as writer:
                    logger.info(
                        f"***** Writing prediction results for {task} *****")
                    writer.write("index\tprediction\n")
                    for index, item in enumerate(predictions_to_write):
                        if is_regression:
                            writer.write(f"{index}\t{item:3.3f}\n")
                        else:
                            item = model.config.id2label[item]
                            writer.write(f"{index}\t{item}\n")
        # endregion

        if training_args.output_dir is not None and not training_args.push_to_hub:
            # If we're not pushing to hub, at least save a local copy when we're done
            model.save_pretrained(training_args.output_dir)
Example #13
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from transformers import pipeline
import os

tokenizer = AutoTokenizer.from_pretrained("tblard/tf-allocine")

# Cache the model locally so later runs load it from disk instead of re-downloading
path = './model/tf_model.h5'

if os.path.exists(path):
    model = TFAutoModelForSequenceClassification.from_pretrained(
        './model/', local_files_only=True)
else:
    model = TFAutoModelForSequenceClassification.from_pretrained(
        "tblard/tf-allocine")
    model.save_pretrained('./model/')

nlp = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
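
# Quick sanity check (our own example sentence; tblard/tf-allocine is a French
# sentiment model, so the input should be French):
print(nlp("Ce film était vraiment super !"))
# Expected: a list with one dict holding a 'label' and a 'score'.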
Example #14
def build_model(transformer, max_len):
    # Header and CLS-token extraction reconstructed from the commented-out variant
    # further below; the original snippet began mid-function.
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    cls_token = sequence_output[:, 0, :]
    x = Dropout(0.5)(cls_token)  # the original dropout output was accidentally discarded
    x = Dense(32, activation=gelu)(x)
    x = BatchNormalization()(x)
    x = Dropout(0.3)(x)
    out = Dense(1, activation='sigmoid')(x)
    model = Model(inputs=input_word_ids, outputs=out)
    model.compile(Adam(learning_rate=1e-5), loss='binary_crossentropy',
                  metrics=['accuracy', AUC(name='auc')])
    return model


%%time
from transformers import TFAutoModel
from transformers import TFAutoModelForSequenceClassification
with strategy.scope():
    # build_model() expects a base encoder whose first output is the hidden states,
    # so TFAutoModel is the right class here; the original also loaded a
    # TFAutoModelForSequenceClassification into the same variable, which merely
    # overwrote the first model.
    transformer_layer = TFAutoModel.from_pretrained('jplu/tf-xlm-roberta-base')
    model = build_model(transformer_layer, max_len=MAX_LEN)
model.summary()



# def build_model_working(transformer, max_len):
#     """
#     from transformers import TFAutoModelForSequenceClassification
#     transformer = TFAutoModelForSequenceClassification.from_pretrained('jplu/tf-xlm-roberta-base')
#     """
#     input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
#     sequence_output = transformer(input_word_ids)[0]
#     out = Dense(1, activation='sigmoid')(sequence_output)
#     model = Model(inputs=input_word_ids, outputs=out)
#     model.compile(Adam(lr=1e-3), loss='binary_crossentropy', metrics=['accuracy',AUC(name='auc')])
Example #15
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TFTrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(
        "n_replicas: %s, distributed training: %s, 16-bits training: %s",
        training_args.n_replicas,
        bool(training_args.n_replicas > 1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )

    train_dataset, eval_dataset, test_ds, label2id = get_tfds(
        train_file=data_args.train_file,
        eval_file=data_args.dev_file,
        test_file=data_args.test_file,
        tokenizer=tokenizer,
        label_column_id=data_args.label_column_id,
        max_seq_length=data_args.max_seq_length,
    )

    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        num_labels=len(label2id),
        label2id=label2id,
        id2label={id: label
                  for label, id in label2id.items()},
        finetuning_task="text-classification",
        cache_dir=model_args.cache_dir,
    )

    with training_args.strategy.scope():
        model = TFAutoModelForSequenceClassification.from_pretrained(
            model_args.model_name_or_path,
            from_pt=bool(".bin" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )

    def compute_metrics(p: EvalPrediction) -> Dict:
        preds = np.argmax(p.predictions, axis=1)

        return {"acc": (preds == p.label_ids).mean()}

    # Initialize our Trainer
    trainer = TFTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        trainer.train()
        trainer.save_model()
        tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        result = trainer.evaluate()
        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results.txt")

        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")

            for key, value in result.items():
                logger.info("  %s = %s", key, value)
                writer.write("%s = %s\n" % (key, value))

            results.update(result)

    return results
Example #16
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (ModelArguments, GlueDataTrainingArguments, TFTrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(
        "n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.n_gpu,
        bool(training_args.n_gpu > 1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    try:
        num_labels = glue_tasks_num_labels["mnli" if data_args.task_name ==
                                           "mnli-mm" else data_args.task_name]
        output_mode = glue_output_modes[data_args.task_name]
    except KeyError:
        raise ValueError("Task not found: %s" % (data_args.task_name))

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )

    with training_args.strategy.scope():
        model = TFAutoModelForSequenceClassification.from_pretrained(
            model_args.model_name_or_path,
            from_pt=bool(".bin" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )

    # Get datasets
    train_dataset = (get_tfds(task_name=data_args.task_name,
                              tokenizer=tokenizer,
                              max_seq_length=data_args.max_seq_length)
                     if training_args.do_train else None)
    eval_dataset = (get_tfds(task_name=data_args.task_name,
                             tokenizer=tokenizer,
                             max_seq_length=data_args.max_seq_length,
                             mode=Split.dev)
                    if training_args.do_eval else None)

    def compute_metrics(p: EvalPrediction) -> Dict:
        if output_mode == "classification":
            preds = np.argmax(p.predictions, axis=1)
        elif output_mode == "regression":
            preds = np.squeeze(p.predictions)
        return glue_compute_metrics(data_args.task_name, preds, p.label_ids)

    # Initialize our Trainer
    trainer = TFTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        trainer.train()
        trainer.save_model()
        tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        result = trainer.evaluate()
        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results.txt")

        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")

            for key, value in result.items():
                logger.info("  %s = %s", key, value)
                writer.write("%s = %s\n" % (key, value))

            results.update(result)

    return results
Example #17
"""helper functions for the flask API."""
import numpy as np

from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased',
                                          use_fast=True)

model = TFAutoModelForSequenceClassification.from_pretrained("spentaur/yelp")


def infer(review: str) -> float:
    """returns raw model sentiment prediction for a review."""
    return model(tokenizer.encode(review,
                                  return_tensors='tf'))[0].numpy()[0][0]


def squish(x: float) -> float:
    """converts model inferences to value between zero and one."""
    return np.clip(0.25 * x + 0.5, 0, 1)
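
# Example usage (the review text is our own):
# squish(infer("Great food, but the service was painfully slow."))
# -> a float in [0, 1], where values near 1 mean a more positive review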
Example #18
    def _load_remote_model(self, model_name, tokenizer_kwargs, model_kwargs):
        do_lower_case = False
        if 'uncased' in model_name.lower():
            do_lower_case = True
        tokenizer_kwargs.update({'do_lower_case': do_lower_case})

        self._tokenizer = None
        self._model = None
        self._config = None

        self._tokenizer = AutoTokenizer.from_pretrained(
            model_name, **tokenizer_kwargs)
        self._config = AutoConfig.from_pretrained(model_name)

        temporary_path = self._get_temporary_path()
        make_dir(temporary_path)

        # TensorFlow model
        try:
            self._model = TFAutoModelForSequenceClassification.from_pretrained(
                model_name, from_pt=False)

        # PyTorch model
        except TypeError:
            try:
                self._model = TFAutoModelForSequenceClassification.from_pretrained(
                    model_name, from_pt=True)

            # Loading a TF model from a PyTorch checkpoint is not supported when using a model identifier name
            except OSError:
                model = AutoModel.from_pretrained(model_name)
                model.save_pretrained(temporary_path)
                self._model = TFAutoModelForSequenceClassification.from_pretrained(
                    temporary_path, from_pt=True)

        # Clean the model's last layer if the provided properties are different
        clean_last_layer = False
        for key, value in model_kwargs.items():
            if not hasattr(self._model.config, key):
                clean_last_layer = True
                break

            if getattr(self._model.config, key) != value:
                clean_last_layer = True
                break

        if clean_last_layer:
            model_family = self._get_model_family()
            try:
                getattr(self._model,
                        model_family).save_pretrained(temporary_path)
                self._model = self._model.__class__.from_pretrained(
                    temporary_path, from_pt=False, **model_kwargs)

            # The model is itself the main layer
            except AttributeError:
                # TensorFlow model
                try:
                    self._model = self._model.__class__.from_pretrained(
                        model_name, from_pt=False, **model_kwargs)

                # PyTorch Model
                except (OSError, TypeError):
                    model = AutoModel.from_pretrained(model_name)
                    model.save_pretrained(temporary_path)
                    self._model = self._model.__class__.from_pretrained(
                        temporary_path, from_pt=True, **model_kwargs)

        remove_dir(temporary_path)
        assert self._tokenizer and self._model
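
    # Hypothetical call (the enclosing class and its helpers are not shown here):
    # self._load_remote_model("bert-base-uncased", tokenizer_kwargs={},
    #                         model_kwargs={"num_labels": 3})
    # model_kwargs values that differ from the checkpoint's config (e.g. num_labels)
    # are what trigger the clean_last_layer reload above.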
Example #19
def main():
    # region Argument parsing
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TFTrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    if not (training_args.do_train or training_args.do_eval
            or training_args.do_predict):
        exit(
            "Must specify at least one of --do_train, --do_eval or --do_predict!"
        )
    # endregion

    # region Checkpoints
    checkpoint = None
    if os.path.isdir(
            training_args.output_dir
    ) and training_args.do_train and not training_args.overwrite_output_dir:
        checkpoint = get_last_checkpoint(training_args.output_dir)
        if checkpoint is None and len(os.listdir(
                training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )
    # endregion

    # region Logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank
                                                    ) else logging.WARN)

    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info(f"Training/evaluation parameters {training_args}")
    # endregion

    # region Dataset and labels
    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Downloading and loading a dataset from the hub. In distributed training, the load_dataset function guarantee
    # that only one local process can concurrently download the dataset.
    datasets = load_dataset("glue",
                            data_args.task_name,
                            cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    is_regression = data_args.task_name == "stsb"
    if not is_regression:
        label_list = datasets["train"].features["label"].names
        num_labels = len(label_list)
    else:
        num_labels = 1

    if data_args.predict_file is not None:
        logger.info("Preparing user-supplied file for predictions...")

        data_files = {"data": data_args.predict_file}

        for key in data_files.keys():
            logger.info(f"Loading a local file for {key}: {data_files[key]}")

        if data_args.predict_file.endswith(".csv"):
            # Loading a dataset from local csv files
            user_dataset = load_dataset("csv",
                                        data_files=data_files,
                                        cache_dir=model_args.cache_dir)
        else:
            # Loading a dataset from local json files
            user_dataset = load_dataset("json",
                                        data_files=data_files,
                                        cache_dir=model_args.cache_dir)
        needed_keys = task_to_keys[data_args.task_name]
        for key in needed_keys:
            assert key in user_dataset[
                "data"].features, f"Your supplied predict_file is missing the {key} key!"
        datasets["user_data"] = user_dataset["data"]
    # endregion

    # region Load model config and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    # endregion

    # region Dataset preprocessing
    sentence1_key, sentence2_key = task_to_keys[data_args.task_name]
    non_label_column_names = [
        name for name in datasets["train"].column_names if name != "label"
    ]

    # Padding strategy
    if data_args.pad_to_max_length:
        padding = "max_length"
    else:
        # We will pad later, dynamically at batch creation, to the max sequence length in each batch
        padding = False

    # Some models have set the order of the labels to use, so let's make sure we do use it.
    label_to_id = None
    if config.label2id != PretrainedConfig(
            num_labels=num_labels).label2id and not is_regression:
        # Some have all caps in their config, some don't.
        label_name_to_id = {k.lower(): v for k, v in config.label2id.items()}
        if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)):
            label_to_id = {
                i: int(label_name_to_id[label_list[i]])
                for i in range(num_labels)
            }
        else:
            logger.warning(
                "Your model seems to have been trained with labels, but they don't match the dataset: "
                f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}."
                "\nIgnoring the model labels as a result."
            )
            label_to_id = {label: i for i, label in enumerate(label_list)}
    if label_to_id is not None:
        config.label2id = label_to_id
        config.id2label = {id: label for label, id in config.label2id.items()}
    elif data_args.task_name is not None and not is_regression:
        config.label2id = {l: i for i, l in enumerate(label_list)}
        config.id2label = {id: label for label, id in config.label2id.items()}

    if data_args.max_seq_length > tokenizer.model_max_length:
        logger.warning(
            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
        )
    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    def preprocess_function(examples):
        # Tokenize the texts
        args = ((examples[sentence1_key], ) if sentence2_key is None else
                (examples[sentence1_key], examples[sentence2_key]))
        result = tokenizer(*args,
                           padding=padding,
                           max_length=max_seq_length,
                           truncation=True)

        return result

    datasets = datasets.map(preprocess_function,
                            batched=True,
                            load_from_cache_file=not data_args.overwrite_cache)

    # endregion

    # region Metric function
    metric = load_metric("glue", data_args.task_name)

    def compute_metrics(preds, label_ids):
        preds = preds["logits"]
        preds = np.squeeze(preds) if is_regression else np.argmax(preds,
                                                                  axis=1)
        result = metric.compute(predictions=preds, references=label_ids)
        if len(result) > 1:
            result["combined_score"] = np.mean(list(result.values())).item()
        return result

    # endregion

    with training_args.strategy.scope():
        # region Load pretrained model
        if checkpoint is None:
            model_path = model_args.model_name_or_path
        else:
            model_path = checkpoint
        model = TFAutoModelForSequenceClassification.from_pretrained(
            model_path,
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
        # endregion

        # region Optimizer, loss and compilation
        optimizer = tf.keras.optimizers.Adam(
            learning_rate=training_args.learning_rate,
            beta_1=training_args.adam_beta1,
            beta_2=training_args.adam_beta2,
            epsilon=training_args.adam_epsilon,
            clipnorm=training_args.max_grad_norm,
        )
        if is_regression:
            loss_fn = tf.keras.losses.MeanSquaredError()
            metrics = []
        else:
            loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
                from_logits=True)
            metrics = ["accuracy"]
        model.compile(optimizer=optimizer, loss=loss_fn, metrics=metrics)
        # endregion

        # region Convert data to a tf.data.Dataset
        tf_data = dict()
        if isinstance(
                training_args.strategy,
                tf.distribute.TPUStrategy) or data_args.pad_to_max_length:
            logger.info(
                "Padding all batches to max length because argument was set or we're on TPU."
            )
            dataset_mode = "constant_batch"
        else:
            dataset_mode = "variable_batch"
        max_samples = {
            "train": data_args.max_train_samples,
            "validation": data_args.max_eval_samples,
            "validation_matched": data_args.max_eval_samples,
            "validation_mismatched": data_args.max_eval_samples,
            "test": data_args.max_predict_samples,
            "test_matched": data_args.max_predict_samples,
            "test_mismatched": data_args.max_predict_samples,
            "user_data": None,
        }
        for key in datasets.keys():
            if key == "train" or key.startswith("validation"):
                assert "label" in datasets[
                    key].features, f"Missing labels from {key} data!"
            if key == "train":
                shuffle = True
                batch_size = training_args.per_device_train_batch_size
                drop_remainder = True  # Saves us worrying about scaling gradients for the last batch
            else:
                shuffle = False
                batch_size = training_args.per_device_eval_batch_size
                drop_remainder = False
            samples_limit = max_samples[key]
            dataset = datasets[key]
            if samples_limit is not None:
                dataset = dataset.select(range(samples_limit))
            data = convert_dataset_for_tensorflow(
                dataset,
                non_label_column_names,
                batch_size=batch_size,
                dataset_mode=dataset_mode,
                drop_remainder=drop_remainder,
                shuffle=shuffle,
            )
            tf_data[key] = data
        # endregion

        # region Training and validation
        if training_args.do_train:
            callbacks = [
                SavePretrainedCallback(output_dir=training_args.output_dir)
            ]
            if training_args.do_eval and data_args.task_name != "mnli":
                # Do both evaluation and training in the Keras fit loop, unless the task is MNLI
                # because MNLI has two validation sets
                validation_data = tf_data["validation"]
            else:
                validation_data = None
            model.fit(
                tf_data["train"],
                validation_data=validation_data,
                epochs=int(training_args.num_train_epochs),
                callbacks=callbacks,
            )
        # endregion

        # region Evaluation
        if training_args.do_eval:
            # We normally do validation as part of the Keras fit loop, but we run it independently
            # if there was no fit() step (because we didn't train the model) or if the task is MNLI,
            # because MNLI has a separate validation-mismatched validation set
            logger.info("*** Evaluate ***")

            # Loop to handle MNLI double evaluation (matched, mis-matched)
            if data_args.task_name == "mnli":
                tasks = ["mnli", "mnli-mm"]
                tf_datasets = [
                    tf_data["validation_matched"],
                    tf_data["validation_mismatched"]
                ]
                raw_datasets = [
                    datasets["validation_matched"],
                    datasets["validation_mismatched"]
                ]
            else:
                tasks = [data_args.task_name]
                tf_datasets = [tf_data["validation"]]
                raw_datasets = [datasets["validation"]]

            for raw_dataset, tf_dataset, task in zip(raw_datasets, tf_datasets,
                                                     tasks):
                eval_predictions = model.predict(tf_dataset)
                eval_metrics = compute_metrics(eval_predictions,
                                               raw_dataset["label"])
                print(f"Evaluation metrics ({task}):")
                print(eval_metrics)

        # endregion

        # region Prediction
        if training_args.do_predict or data_args.predict_file:
            logger.info("*** Predict ***")

            # Loop to handle MNLI double evaluation (matched, mis-matched)
            tasks = []
            tf_datasets = []
            raw_datasets = []
            if training_args.do_predict:
                if data_args.task_name == "mnli":
                    tasks.extend(["mnli", "mnli-mm"])
                    tf_datasets.extend(
                        [tf_data["test_matched"], tf_data["test_mismatched"]])
                    raw_datasets.extend([
                        datasets["test_matched"], datasets["test_mismatched"]
                    ])
                else:
                    tasks.append(data_args.task_name)
                    tf_datasets.append(tf_data["test"])
                    raw_datasets.append(datasets["test"])
            if data_args.predict_file:
                tasks.append("user_data")
                tf_datasets.append(tf_data["user_data"])
                raw_datasets.append(datasets["user_data"])

            for raw_dataset, tf_dataset, task in zip(raw_datasets, tf_datasets,
                                                     tasks):
                test_predictions = model.predict(tf_dataset)
                if "label" in raw_dataset:
                    test_metrics = compute_metrics(test_predictions,
                                                   raw_dataset["label"])
                    print(f"Test metrics ({task}):")
                    print(test_metrics)

                if is_regression:
                    predictions_to_write = np.squeeze(
                        test_predictions["logits"])
                else:
                    predictions_to_write = np.argmax(
                        test_predictions["logits"], axis=1)

                output_predict_file = os.path.join(
                    training_args.output_dir, f"predict_results_{task}.txt")
                with open(output_predict_file, "w") as writer:
                    logger.info(
                        f"***** Writing prediction results for {task} *****")
                    writer.write("index\tprediction\n")
                    for index, item in enumerate(predictions_to_write):
                        if is_regression:
                            writer.write(f"{index}\t{item:3.3f}\n")
                        else:
                            item = model.config.id2label[item]
                            writer.write(f"{index}\t{item}\n")
Example #20
# -*- coding: utf-8 -*-
"""
Created on Mon Feb 15 03:40:33 2021

@author: nashe
"""


import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # or any {'0', '1', '2'}

from transformers import TextClassificationPipeline, TFAutoModelForSequenceClassification, AutoTokenizer

MODEL_DIR = r"D:\Fine-tuned Models\NLP\bert\tf-distilbert-base-uncased\epoch_1_loss_0.39"

# Text classification pipeline
model = TFAutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
tokenizer = AutoTokenizer.from_pretrained(r"D:\Models\NLP\bert\tf-distilbert-base-uncased")

pipeline = TextClassificationPipeline(model=model,
                                      tokenizer=tokenizer,
                                      framework='tf',
                                      device=0)

result = pipeline("It was a good watch. But a little boring.")[0]
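
# The pipeline returns a list of dicts; [0] selects the top prediction,
# e.g. {'label': ..., 'score': ...}
print(result)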