Example #1
def main():
    """Fine-tune on summarization data"""

    # need this to save a fine-tuned model
    if os.path.isdir(args.model_dir):
        shutil.rmtree(args.model_dir)
    os.mkdir(args.model_dir)

    # import data provider (e.g. dtr, rel, or events)
    data = importlib.import_module(args.data_reader)

    # load pretrained T5 tokenizer
    tokenizer = T5Tokenizer.from_pretrained(args.model_name)

    # load a pretrained T5 model
    model = T5ForConditionalGeneration.from_pretrained(args.model_name)

    train_dataset = data.Data(xmi_dir=args.xmi_dir,
                              tokenizer=tokenizer,
                              max_input_length=args.max_input_length,
                              max_output_length=args.max_output_length,
                              partition='train',
                              n_files=args.n_files,
                              xml_ref_dir=None,
                              xml_out_dir=None)

    val_dataset = data.Data(xmi_dir=args.xmi_dir,
                            tokenizer=tokenizer,
                            max_input_length=args.max_input_length,
                            max_output_length=args.max_output_length,
                            partition='dev',
                            n_files=args.n_files,
                            xml_ref_dir=None,
                            xml_out_dir=None)

    training_args = Seq2SeqTrainingArguments(
        output_dir='./Results',
        num_train_epochs=args.n_epochs,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        learning_rate=args.learning_rate,
        warmup_steps=100,
        weight_decay=0.01,
        logging_dir='./Logs')

    trainer = Seq2SeqTrainer(model=model,
                             args=training_args,
                             train_dataset=train_dataset,
                             eval_dataset=val_dataset)

    trainer.train()
    trainer.save_model(args.model_dir)
    trainer.evaluate()
Example #2
def train_evaluate(model, collate_fn, train_dataset, val_dataset, **kwargs):
    train_args = Seq2SeqTrainingArguments(**{
        **default_training_args,
        **kwargs
    })
    trainer = Seq2SeqTrainer(model=model,
                             args=train_args,
                             data_collator=collate_fn,
                             train_dataset=train_dataset,
                             eval_dataset=val_dataset)
    trainer.train()
    results = trainer.evaluate()
    return results
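# Example #2 assumes a module-level `default_training_args` dict that supplies
# baseline Seq2SeqTrainingArguments fields, with per-call overrides passed via
# **kwargs. A minimal sketch of that pattern (the dict contents below are
# illustrative assumptions, not taken from the original source):
default_training_args = dict(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
)
# e.g. override only the learning rate for a single run:
# results = train_evaluate(model, collate_fn, train_dataset, val_dataset,
#                          learning_rate=1e-4)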
Example #3
def instantiate_trainer(config):
    verbosity = config.get("verbosity", logging.INFO)
    t_logging.set_verbosity(verbosity)
    logger.setLevel(verbosity)

    # debug (see torch.autograd.detect_anomaly)
    set_detect_anomaly(bool(config.get("debug", False)))

    # model
    model_args = dict(name="ViT-B/32",
                      jit=False,
                      training=True,
                      Class="CLIPDecoder")
    model_args.update(config.get("model", {}))
    model_args["Class"] = getattr(clip.model, model_args["Class"])
    logger.info(f"loading model from pre-trained CLIP {model_args}...")
    model, image_preprocess = load(**model_args)

    # data
    train_dataset, eval_dataset = get_datasets(
        image_preprocess=image_preprocess, **config.get("dataset", {}))

    # training
    criterion_args = config.get("criterion", {})
    # get criterion class (e.g. nn.NLLLoss) by name
    CriterionClass = getattr(nn, criterion_args.pop("Class", "NLLLoss"))
    criterion = CriterionClass(**criterion_args)
    learner_args = config.get("learner", {})
    LearnerClass = getattr(sys.modules[__name__],
                           learner_args.pop("Class", "LanguageModel"))
    learner = LearnerClass(model, criterion)
    training_args = Seq2SeqTrainingArguments(**config.get("training", {}))
    trainer = CLIPTrainer(model=learner,
                          args=training_args,
                          data_collator=collate_batch,
                          train_dataset=train_dataset,
                          eval_dataset=eval_dataset,
                          compute_metrics=compute_metrics)
    # training callbacks
    for callback in config.get("callbacks", []):
        CallbackClass = getattr(trainer_callback, callback.pop("Class"))
        trainer.add_callback(CallbackClass(**callback))

    return trainer, training_args, config
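# A hypothetical config dict for instantiate_trainer() above; the keys mirror
# the config.get(...) calls in the function, and all values are illustrative
# assumptions rather than settings from the original project:
example_config = {
    "verbosity": logging.INFO,
    "debug": False,
    "model": {"name": "ViT-B/32", "Class": "CLIPDecoder"},
    "dataset": {},
    "criterion": {"Class": "NLLLoss"},
    "learner": {"Class": "LanguageModel"},
    "training": {"output_dir": "./results", "num_train_epochs": 1},
    "callbacks": [{"Class": "PrinterCallback"}],
}
# trainer, training_args, config = instantiate_trainer(example_config)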
Example #4
    def generate(self, dataset):
        huggingface_model = self.convert_to_huggingface()
        huggingface_model.config.decoder_start_token_id = self.tokenizer.cls_token_id
        huggingface_model.config.eos_token_id = self.tokenizer.sep_token_id
        huggingface_model.config.pad_token_id = self.tokenizer.pad_token_id
        huggingface_model.config.vocab_size = huggingface_model.config.encoder.vocab_size
        huggingface_model.config.add_cross_attention = True
        huggingface_model.config.no_repeat_ngram_size = 3
        huggingface_model.config.early_stopping = True
        huggingface_model.config.length_penalty = 2.0
        huggingface_model.config.num_beams = 4
        util_args = Seq2SeqTrainingArguments(predict_with_generate=True,
                                             output_dir='./tmp')
        util = Seq2SeqTrainer(
            args=util_args,
            model=huggingface_model,
            compute_metrics=lambda pred: compute_metrics(pred, self.tokenizer),
            eval_dataset=dataset,
            tokenizer=self.tokenizer,
        )
        return util.predict(dataset)
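# Decoding the PredictionOutput returned by generate() above (a sketch; with
# predict_with_generate=True the predictions field holds generated token ids):
# prediction_output = self.generate(dataset)
# summaries = self.tokenizer.batch_decode(prediction_output.predictions,
#                                         skip_special_tokens=True)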
Example #5
batch_size = 1  # change to 16 for full training
bert2bert = EncoderDecoderModel.from_encoder_decoder_pretrained(
    'bert-base-multilingual-cased', 'bert-base-multilingual-cased'
)  # initialize Bert2Bert from pre-trained checkpoints

bert2bert.config.decoder_start_token_id = tokenizer.bos_token_id
bert2bert.config.eos_token_id = tokenizer.eos_token_id
bert2bert.config.pad_token_id = tokenizer.pad_token_id
bert2bert.config.vocab_size = bert2bert.config.decoder.vocab_size

training_args = Seq2SeqTrainingArguments(
    output_dir="./",
    overwrite_output_dir=True,
    do_train=True,
    per_device_train_batch_size=batch_size,
    num_train_epochs=1,
    logging_steps=500,  # set to 1000 for full training
    save_steps=10000,  # set to 500 for full training
    warmup_steps=1000,  # set to 2000 for full training
    fp16=True)

trainer = Seq2SeqTrainer(
    model=bert2bert,
    args=training_args,
    train_dataset=train_data,
    #eval_dataset=valid_data,
    #compute_metrics=metric,
)

trainer.train()
Example #6
def main():
    parser = argparse.ArgumentParser()

    # Input and output configs
    parser.add_argument("--data_folder", default=None, type=str, required=True,
                        help="the folder to save the processed data")
    parser.add_argument("--last_utterance_only", default=False, required=False, action="store_true",
                        help="Train with the whole context or the last utterance only")

    args = parser.parse_args()
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
        handlers=[
            logging.StreamHandler()
        ]
    )

    tasks = ['mantis', 'msdialog', 'ubuntu_dstc8']
    #Downloading Conversation Response Ranking
    for task in tasks:
        if not os.path.isdir(args.data_folder+task):
            logging.info("Starting downloader for task {}".format(task))
            dataDownloader = downloader.DataDownloader(task, args.data_folder)
            dataDownloader.download_and_preprocess()
    
    all_df = []
    for task in tasks:
        train = pd.read_csv(args.data_folder+task+"/train.tsv", sep="\t")
        train['task'] = task
        replace = train.shape[0]<80000
        train = train.sample(80000, replace=replace)
        all_df.append(train)
    all_df = pd.concat(all_df)

    def preprocess_response(r):
        # some tokens that only appear in MSDialogue are removed here
        r = r.replace("<<<AGENT>>>:", "")
        r = r.replace("PERSON_PLACEHOLDER", "")
        r = r.replace("AGENT", "")
        return r

    def preprocess_context(r):
        #removes beginning of context and keeps only last utterance.
        if 'msdialog' in r['task']:
            context = r['context'].split("[TURN_SEP]")[-1].split("[UTTERANCE_SEP]")[0].strip()
        else:
            context = r['context'].split("[TURN_SEP]")[-1].split("[UTTERANCE_SEP]")[-2].strip()
        return context

    all_df["response"] = all_df.apply(lambda r,f=preprocess_response: f(r['response']), axis=1)
    if args.last_utterance_only:
        all_df["context"] = all_df.apply(lambda r,f=preprocess_context: f(r), axis=1)

    dataset = Dataset.from_pandas(all_df)

    # all_df["len_context"] = all_df.apply(lambda r: len(r['context'].split(" ")), axis=1)
    # all_df["len_response"] = all_df.apply(lambda r: len(r['response'].split(" ")), axis=1)

    model_checkpoint = "t5-base" #["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"]
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

    max_input_length = 100
    if args.last_utterance_only:
        max_target_length = 100    
    else:
        max_target_length = 400

    col_from = "response"
    col_to = "context"

    def preprocess_function(examples):
        inputs = [preprocess_response(doc) for doc in examples[col_from]]
        model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

        # Setup the tokenizer for targets
        with tokenizer.as_target_tokenizer():
            labels = tokenizer([t for t in examples[col_to]], max_length=max_target_length, truncation=True)

        model_inputs["labels"] = labels["input_ids"]
        return model_inputs
    
    tokenized_datasets = dataset.map(preprocess_function, batched=True)

    model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
    batch_size = 5
    train_args = Seq2SeqTrainingArguments(
        "response2context_lu_{}".format(args.last_utterance_only),
        learning_rate=2e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=2,        
        predict_with_generate=True,
        seed=42
    )
    
    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

    trainer = Seq2SeqTrainer(
        model,
        train_args,
        train_dataset=tokenized_datasets,
        data_collator=data_collator,
        tokenizer=tokenizer
    )
    print("Fine-tuning T5.")
    trainer.train()
    if args.last_utterance_only:
        model.save_pretrained("{}/{}_response2context_last_utt_only".format(args.data_folder, model_checkpoint))
    else:
        model.save_pretrained("{}/{}_response2context".format(args.data_folder, model_checkpoint))
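# Sketch of using the checkpoint saved above to generate a context for a
# response (the path and generation settings are illustrative assumptions):
# model = AutoModelForSeq2SeqLM.from_pretrained(
#     "{}/{}_response2context".format(args.data_folder, model_checkpoint))
# inputs = tokenizer("Have you tried restarting the service?",
#                    return_tensors="pt", truncation=True, max_length=100)
# output_ids = model.generate(**inputs, max_length=400, num_beams=4)
# print(tokenizer.decode(output_ids[0], skip_special_tokens=True))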
Example #7
def main():
    args = parse_args()

    # this requires having preprocessed framenet
    # using `frame.cli:preprocess-framenet
    paths = data_paths(args.data)
    dataset = load_dataset('json', data_files=paths)
    metric = load_metric("rouge")

    train_test = dataset["train"].train_test_split(test_size=0.1)
    test_valid = train_test["test"].train_test_split(test_size=0.5)

    datasets = DatasetDict({
        "train": train_test["train"],
        "test": test_valid["test"],
        "valid": test_valid["train"]
    })

    tokenizer = AutoTokenizer.from_pretrained(args.model)

    # the family of t5 models expects input sentences to be prefixed with `"summarize: "`
    if args.model in ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"]:
        prefix = "summarize: "
    else:
        prefix = ""

    # HuggingFace tends to use closures here; I would prefer this to be
    # refactored into the library, but am going this route for simplicity
    def preprocess_function(examples):
        """Tokenize the data for Seq2Seq

        Maps over all the examples in the dataset 
        to tokenize both the input framenet sentences
        and the target frame definitions.

        Args:
            examples: samples in the dataset
        """
        inputs = [prefix + sent for sent in examples["sentence"]]

        model_inputs = tokenizer(inputs,
                                 max_length=args.max_input_length,
                                 truncation=True)

        with tokenizer.as_target_tokenizer():
            labels = tokenizer(examples["frame_definition"],
                               max_length=args.max_target_length,
                               truncation=True)

        model_inputs["labels"] = labels["input_ids"]

        return model_inputs

    # again with the closures - this one requires an instance of the tokenizer
    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        decoded_preds = tokenizer.batch_decode(predictions,
                                               skip_special_tokens=True)
        # Replace -100 in the labels as we can't decode them.
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels,
                                                skip_special_tokens=True)

        # Rouge expects a newline after each sentence
        decoded_preds = [
            "\n".join(nltk.sent_tokenize(pred.strip()))
            for pred in decoded_preds
        ]
        decoded_labels = [
            "\n".join(nltk.sent_tokenize(label.strip()))
            for label in decoded_labels
        ]

        result = metric.compute(predictions=decoded_preds,
                                references=decoded_labels,
                                use_stemmer=True)
        # Extract a few results
        result = {
            key: value.mid.fmeasure * 100
            for key, value in result.items()
        }

        # Add mean generated length
        prediction_lens = [
            np.count_nonzero(pred != tokenizer.pad_token_id)
            for pred in predictions
        ]
        result["gen_len"] = np.mean(prediction_lens)

        return {k: round(v, 4) for k, v in result.items()}

    tokenized_datasets = datasets.map(preprocess_function, batched=True)

    model = AutoModelForSeq2SeqLM.from_pretrained(args.model)
    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

    training_args = Seq2SeqTrainingArguments(
        "./results/summarization",
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=args.epochs,
        predict_with_generate=True,
        fp16=True,
    )

    trainer = Seq2SeqTrainer(model,
                             training_args,
                             train_dataset=tokenized_datasets["train"],
                             eval_dataset=tokenized_datasets["valid"],
                             data_collator=data_collator,
                             tokenizer=tokenizer,
                             compute_metrics=compute_metrics)

    trainer.train()
Example #8
    def test_finetune_bert2bert(self):
        """
        Currently fails with:

        ImportError: To be able to use this metric, you need to install the following dependencies['absl', 'nltk', 'rouge_score']
        """

        bert2bert = EncoderDecoderModel.from_encoder_decoder_pretrained(
            "prajjwal1/bert-tiny", "prajjwal1/bert-tiny")
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

        bert2bert.config.vocab_size = bert2bert.config.encoder.vocab_size
        bert2bert.config.eos_token_id = tokenizer.sep_token_id
        bert2bert.config.decoder_start_token_id = tokenizer.cls_token_id
        bert2bert.config.max_length = 128

        train_dataset = datasets.load_dataset("cnn_dailymail",
                                              "3.0.0",
                                              split="train[:1%]")
        val_dataset = datasets.load_dataset("cnn_dailymail",
                                            "3.0.0",
                                            split="validation[:1%]")

        train_dataset = train_dataset.select(range(32))
        val_dataset = val_dataset.select(range(16))

        rouge = datasets.load_metric("rouge")

        batch_size = 4

        def _map_to_encoder_decoder_inputs(batch):
            # Tokenizer will automatically set [BOS] <text> [EOS]
            inputs = tokenizer(batch["article"],
                               padding="max_length",
                               truncation=True,
                               max_length=512)
            outputs = tokenizer(batch["highlights"],
                                padding="max_length",
                                truncation=True,
                                max_length=128)
            batch["input_ids"] = inputs.input_ids
            batch["attention_mask"] = inputs.attention_mask

            batch["decoder_input_ids"] = outputs.input_ids
            batch["labels"] = outputs.input_ids.copy()
            batch["labels"] = [[
                -100 if token == tokenizer.pad_token_id else token
                for token in labels
            ] for labels in batch["labels"]]
            batch["decoder_attention_mask"] = outputs.attention_mask

            assert all([len(x) == 512 for x in inputs.input_ids])
            assert all([len(x) == 128 for x in outputs.input_ids])

            return batch

        def _compute_metrics(pred):
            labels_ids = pred.label_ids
            pred_ids = pred.predictions

            # all unnecessary tokens are removed
            pred_str = tokenizer.batch_decode(pred_ids,
                                              skip_special_tokens=True)
            label_str = tokenizer.batch_decode(labels_ids,
                                               skip_special_tokens=True)

            rouge_output = rouge.compute(predictions=pred_str,
                                         references=label_str,
                                         rouge_types=["rouge2"])["rouge2"].mid

            return {
                "rouge2_precision": round(rouge_output.precision, 4),
                "rouge2_recall": round(rouge_output.recall, 4),
                "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
            }

        # map train dataset
        train_dataset = train_dataset.map(
            _map_to_encoder_decoder_inputs,
            batched=True,
            batch_size=batch_size,
            remove_columns=["article", "highlights"],
        )
        train_dataset.set_format(
            type="torch",
            columns=[
                "input_ids", "attention_mask", "decoder_input_ids",
                "decoder_attention_mask", "labels"
            ],
        )

        # same for validation dataset
        val_dataset = val_dataset.map(
            _map_to_encoder_decoder_inputs,
            batched=True,
            batch_size=batch_size,
            remove_columns=["article", "highlights"],
        )
        val_dataset.set_format(
            type="torch",
            columns=[
                "input_ids", "attention_mask", "decoder_input_ids",
                "decoder_attention_mask", "labels"
            ],
        )

        output_dir = self.get_auto_remove_tmp_dir()

        training_args = Seq2SeqTrainingArguments(
            output_dir=output_dir,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            predict_with_generate=True,
            evaluation_strategy="steps",
            do_train=True,
            do_eval=True,
            warmup_steps=0,
            eval_steps=2,
            logging_steps=2,
        )

        # instantiate trainer
        trainer = Seq2SeqTrainer(
            model=bert2bert,
            args=training_args,
            compute_metrics=_compute_metrics,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=tokenizer,
        )

        # start training
        trainer.train()
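# The docstring above lists the rouge metric's missing dependencies; installing
# them resolves the ImportError (absl is published on PyPI as absl-py):
# pip install absl-py nltk rouge_score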
Example #9
#     eval_steps=500,  # 4 or set to 8000 for full training
#     warmup_steps=500,  # 1 or set to 2000 for full training
#     max_steps=2500,  # 16 or comment for full training
#     overwrite_output_dir=True,
#     save_total_limit=3,
#     fp16=torch.cuda.is_available(),
# )

training_args = Seq2SeqTrainingArguments(
    output_dir='./',
    evaluation_strategy='steps',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    predict_with_generate=True,
    logging_steps=50,  # 2 or set to 1000 for full training
    save_steps=50,  # 16 or set to 500 for full training
    eval_steps=50,  # 4 or set to 8000 for full training
    warmup_steps=50,  # 1 or set to 2000 for full training
    max_steps=850,  # 16 or comment for full training
    overwrite_output_dir=True,
    save_total_limit=3,
    fp16=torch.cuda.is_available(),
)

# instantiate trainer
trainer = Seq2SeqTrainer(
    model=ed_model,
    tokenizer=input_tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
Example #10
def train_translationmodel(args):
    dataset_properties = json.load(open(os.path.join(args.data_dir, "dataset_properties.json")))
    special_tokens = dataset_properties["special_tokens"]
    target_vocab = dataset_properties["target_vocab"]

    target_model = os.path.join(args.model_root_dir, args.run_id, args.target_model_name)
    output_dir = os.path.join(args.model_root_dir, args.run_id, args.translation_model_name)
    logging_dir = os.path.join(output_dir, "logs")
    if not args.resume:
        checkpoint = None
        os.mkdir(output_dir)
        # copy info about dataset b/c we'll need that when running the dockerized model (among others, it contains the target vocab)
        copyfile(os.path.join(args.data_dir, "dataset_properties.json"), os.path.join(output_dir, "dataset_properties.json"))
    else:
        checkpoint = get_last_checkpoint(output_dir)
        print(f"trying to resume training from {checkpoint} in {output_dir}")

    # use mixed precision training on CUDA devices, otherwise disable it so that code can run on CPUs
    fp16 = torch.cuda.is_available()

    bert2arsenal = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", target_model)
    source_tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
    source_tokenizer.add_special_tokens({"additional_special_tokens": special_tokens})

    # save it for later use so that we don't have to download anything at runtime
    source_tokenizer.save_pretrained(os.path.join(output_dir, "source_tokenizer"))

    # only needed to get the id of the EOS token in the target language
    target_tokenizer = PreTrainedArsenalTokenizer(target_vocab=target_vocab)

    # Due to the additional special tokens, encoder token embeddings need to be resized.
    # The target model has been created specifically for the "Effigy Arsenal Language", so it already has the correct dimensions
    bert2arsenal.encoder.resize_token_embeddings(len(source_tokenizer))
    bert2arsenal.config.decoder_start_token_id = source_tokenizer.cls_token_id
    bert2arsenal.config.eos_token_id = target_tokenizer.sep_token_id

    # not sure whether these settings are relevant (at least they shouldn't be harmful)
    bert2arsenal.config.encoder.eos_token_id = source_tokenizer.sep_token_id
    bert2arsenal.config.decoder.eos_token_id = target_tokenizer.sep_token_id

    bert2arsenal.config.pad_token_id = source_tokenizer.pad_token_id
    bert2arsenal.config.vocab_size = bert2arsenal.encoder.vocab_size
    bert2arsenal.config.encoder.vocab_size = bert2arsenal.encoder.vocab_size

    # the model has min/max length settings in three places: for the main model (EncoderDecoder) and both encoder
    # and decoder as submodels. Settings in the latter two parts seem to be completely irrelevant (unless one would
    # try to use the trained encoder or decoder parts from the translation model in isolation).
    bert2arsenal.config.max_length = dataset_properties["decoder_max_len"]
    bert2arsenal.config.min_length = dataset_properties["decoder_min_len"]

    # Don't prevent any n-gram repetitions! This would have a significant negative influence on
    # the translations (especially for longer sentences), because the correct CSTs may contain n-gram repetitions
    bert2arsenal.config.no_repeat_ngram_size = 0
    bert2arsenal.config.early_stopping = True
    bert2arsenal.config.length_penalty = 2.0
    bert2arsenal.config.num_beams = 4
    # bert2arsenal.config.add_cross_attention
    # bert2arsenal.config.num_return_sequences = 5 # this can be used to set the number of return sequences

    print(f"model config:\n{bert2arsenal.config}")

    training_args = Seq2SeqTrainingArguments(
        predict_with_generate=True,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        fp16=fp16,
        output_dir=output_dir,
        logging_dir=logging_dir,
        logging_steps=args.logging_steps,
        save_steps=args.save_steps,
        save_total_limit=args.save_total_limit,
        warmup_steps=args.warmup_steps,  # number of warmup steps for learning rate scheduler
        weight_decay=args.weight_decay,  # strength of weight decay
        num_train_epochs=args.translation_epochs,
    )

    bert2arsenal.config.to_json_file(os.path.join(output_dir, "model_config.json"))
    with open(os.path.join(output_dir, "training_args.json"), "w") as f:
        f.write(str(training_args.to_json_string()))

    train_data = datasets.Dataset.load_from_disk(os.path.join(args.data_dir, args.train_dataset_name))

    train_data.set_format(
        type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
    )

    trainer = Seq2SeqTrainer(
        model=bert2arsenal,
        args=training_args,
        train_dataset=train_data,
        tokenizer=source_tokenizer
    )
    print(f"start training at {datetime.now().strftime('%b%d_%H-%M-%S')}")
    trainer.train(resume_from_checkpoint=checkpoint)
    trainer.save_model()
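# Sketch of reloading the artifacts saved above for inference; the paths follow
# the save calls in train_translationmodel() and the input text is illustrative:
# bert2arsenal = EncoderDecoderModel.from_pretrained(output_dir)
# source_tokenizer = BertTokenizerFast.from_pretrained(
#     os.path.join(output_dir, "source_tokenizer"))
# input_ids = source_tokenizer("some requirement sentence",
#                              return_tensors="pt").input_ids
# generated_ids = bert2arsenal.generate(input_ids)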
Example #11
def main(args):
    df = pd.read_csv(args.input_fname,
                     encoding='utf-8')[[args.source_lang, args.target_lang]]
    logging.info(f'Loaded {df.shape}')

    #convert to dictionary
    j = {'translation': []}
    for i in df.itertuples():
        j['translation'] += [{args.source_lang: i[1], args.target_lang: i[2]}]

    train_dataset = Dataset.from_dict(j)
    raw_datasets = train_dataset.train_test_split(test_size=args.valid_pct,
                                                  seed=args.seed)
    logging.info(f'Datasets created {raw_datasets}')

    tokenizer = MarianTokenizer.from_pretrained(args.output_dir)
    logging.info(f'Tokenizer loaded from {args.output_dir}')

    #tokenize datasets
    tokenized_datasets = raw_datasets.map(
        partial(preprocess_function,
                tokenizer=tokenizer,
                max_input_length=args.max_input_length,
                max_target_length=args.max_target_length,
                source_lang=args.source_lang,
                target_lang=args.target_lang),
        batched=True,
    )
    logging.info(f'Tokenized datasets: {tokenized_datasets}')

    #filter those with too few tokens
    tokenized_datasets = tokenized_datasets.filter(
        lambda example: len(example['translation']['zh']) > 2)
    tokenized_datasets = tokenized_datasets.filter(
        lambda example: len(example['translation']['th']) > 2)
    logging.info(
        f'Tokenized datasets after filtering out sequences with 2 or fewer tokens: {tokenized_datasets}'
    )

    config = MarianConfig.from_pretrained(args.output_dir)
    model = MarianMTModel(config)
    logging.info(f'Loaded model from {args.output_dir}')

    training_args = Seq2SeqTrainingArguments(
        args.output_dir,
        evaluation_strategy="epoch",
        load_best_model_at_end=True,
        learning_rate=args.learning_rate,
        warmup_ratio=args.warmup_ratio,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        weight_decay=args.weight_decay,
        save_total_limit=args.save_total_limit,
        num_train_epochs=args.num_train_epochs,
        predict_with_generate=True,
        fp16=args.fp16,
        seed=args.seed,
    )
    logging.info(f'Training config {training_args}')

    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
    trainer = Seq2SeqTrainer(
        model,
        training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=partial(compute_metrics,
                                tokenizer=tokenizer,
                                metric=metric,
                                metric_tokenize=args.metric_tokenize),
    )
    logging.info(f'Trainer created')

    trainer.train()

    model.save_pretrained(f"{args.output_dir}_best")
    tokenizer.save_pretrained(f"{args.output_dir}_best")
    logging.info(f'Best model saved')

    model.cpu()
    src_text = ['我爱你', '国王有很多心事。我明白']
    translated = model.generate(
        **tokenizer(src_text, return_tensors="pt", padding=True))
    print([tokenizer.decode(t, skip_special_tokens=True) for t in translated])
Example #12
def main():
    """Fine-tune on summarization data"""

    # need this to save a fine-tuned model
    if os.path.isdir(args.model_dir):
        shutil.rmtree(args.model_dir)
    os.mkdir(args.model_dir)

    # import data provider (e.g. dtr, rel, or events)
    data = importlib.import_module(args.data_reader)

    # load pretrained T5 tokenizer
    tokenizer = T5Tokenizer.from_pretrained(args.model_name)

    # load a pretrained T5 model
    model = T5ForConditionalGeneration.from_pretrained(args.model_name)

    train_dataset = data.Data(xml_dir=args.xml_train_dir,
                              text_dir=args.text_train_dir,
                              out_dir=args.xml_out_dir,
                              xml_regex=args.xml_regex,
                              tokenizer=tokenizer,
                              max_input_length=args.max_input_length,
                              max_output_length=args.max_output_length)

    test_dataset = data.Data(xml_dir=args.xml_test_dir,
                             text_dir=args.text_test_dir,
                             out_dir=args.xml_out_dir,
                             xml_regex=args.xml_regex,
                             tokenizer=tokenizer,
                             max_input_length=args.max_input_length,
                             max_output_length=args.max_output_length)

    training_args = Seq2SeqTrainingArguments(
        output_dir='./Results',
        num_train_epochs=args.n_epochs,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        learning_rate=args.learning_rate,
        warmup_steps=100,
        weight_decay=0.01,
        logging_dir='./Logs',
        disable_tqdm=True,
        predict_with_generate=True,
        load_best_model_at_end=True)

    trainer = Seq2SeqTrainer(model=model,
                             args=training_args,
                             train_dataset=train_dataset,
                             eval_dataset=test_dataset)

    trainer.train()
    trainer.save_model(args.model_dir)
    print('done training...')

    results = trainer.predict(test_dataset=test_dataset,
                              max_length=args.max_output_length,
                              num_beams=1)

    predictions = tokenizer.batch_decode(results.predictions,
                                         skip_special_tokens=True,
                                         clean_up_tokenization_spaces=True)

    for prediction in predictions:
        print(prediction)
Example #13
# print(text_tokenizer.convert_ids_to_tokens(inp))
# input_ids = torch.tensor(inp).unsqueeze(0)  # Batch size 1

# outp = code_tokenizer.encode('i += 2 ;')
# print(outp.tokens)
# decoder_input_ids = torch.tensor(outp.ids).unsqueeze(0)
# print(input_ids, input_ids.shape)
# print(decoder_input_ids, decoder_input_ids.shape)

training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy="steps",
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
    fp16=True, 
    output_dir="./checkpoints/",
    logging_steps=4000,
    save_steps=1000,
    eval_steps=4000,
    warmup_steps=100,
    save_total_limit=5,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    # compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=eval_data,
)
trainer.train(resume_from_checkpoint='./checkpoints-new/checkpoint-3000')
Example #14
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }


args = Seq2SeqTrainingArguments(
    output_dir="output",
    evaluation_strategy="steps",
    weight_decay=0.01,
    logging_dir='./logs/',
    logging_steps=100,
    learning_rate=5e-05,
    warmup_steps=200,
    eval_steps=500,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    save_steps=3000,
    seed=0,
    load_best_model_at_end=True,
    predict_with_generate=True,
)
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
Example #15
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--base_model",
                        default='t5-base',
                        type=str,
                        required=False,
                        help="Base model to fine tune.")
    parser.add_argument("--triples_path",
                        default=None,
                        type=str,
                        required=True,
                        help="Triples.tsv path")
    parser.add_argument("--output_model_path",
                        default=None,
                        type=str,
                        required=True,
                        help="Path for trained model and checkpoints.")
    parser.add_argument("--save_every_n_steps",
                        default=0,
                        type=int,
                        required=False,
                        help="Save every N steps. (recommended 10000)")
    parser.add_argument("--logging_steps",
                        default=100,
                        type=int,
                        required=False,
                        help="Logging steps parameter.")
    parser.add_argument("--per_device_train_batch_size",
                        default=8,
                        type=int,
                        required=False,
                        help="Per device batch size parameter.")
    parser.add_argument("--gradient_accumulation_steps",
                        default=16,
                        type=int,
                        required=False,
                        help="Gradient accumulation parameter.")
    parser.add_argument("--learning_rate",
                        default=3e-4,
                        type=float,
                        required=False,
                        help="Learning rate parameter.")
    parser.add_argument("--epochs",
                        default=10,
                        type=int,
                        required=False,
                        help="Number of epochs to train")

    device = torch.device('cuda')
    torch.manual_seed(123)
    args = parser.parse_args()

    model = AutoModelForSeq2SeqLM.from_pretrained(args.base_model)
    tokenizer = AutoTokenizer.from_pretrained('t5-base')

    train_samples = []
    with open(args.triples_path, 'r', encoding="utf-8") as fIn:
        for num, line in enumerate(fIn):
            if num > 6.4e5 * args.epochs:
                break
            query, positive, negative = line.split("\t")
            train_samples.append((query, positive, 'true'))
            train_samples.append((query, negative, 'false'))

    def smart_batching_collate_text_only(batch):
        texts = [example['text'] for example in batch]
        tokenized = tokenizer(texts,
                              padding=True,
                              truncation='longest_first',
                              return_tensors='pt',
                              max_length=512)
        tokenized['labels'] = tokenizer(
            [example['labels'] for example in batch],
            return_tensors='pt')['input_ids']

        for name in tokenized:
            tokenized[name] = tokenized[name].to(device)

        return tokenized

    dataset_train = MonoT5Dataset(train_samples)

    if args.save_every_n_steps:
        steps = args.save_every_n_steps
        strategy = 'steps'
    else:
        steps = 1
        strategy = 'epoch'

    train_args = Seq2SeqTrainingArguments(
        output_dir=args.output_model_path,
        do_train=True,
        save_strategy=strategy,
        save_steps=steps,
        logging_steps=args.logging_steps,
        per_device_train_batch_size=args.per_device_train_batch_size,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        learning_rate=args.learning_rate,
        weight_decay=5e-5,
        num_train_epochs=1,
        warmup_steps=1000,
        adafactor=True,
        seed=1,
        disable_tqdm=False,
        load_best_model_at_end=False,
        predict_with_generate=True,
        dataloader_pin_memory=False,
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=train_args,
        train_dataset=dataset_train,
        tokenizer=tokenizer,
        data_collator=smart_batching_collate_text_only,
    )

    trainer.train()

    trainer.save_model(args.output_model_path)
    trainer.save_state()
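# Example #15 assumes a MonoT5Dataset wrapper whose items are dicts with
# 'text' and 'labels' keys, since that is what smart_batching_collate_text_only
# reads. A minimal sketch under that assumption (the prompt format follows the
# usual monoT5 convention and is not taken from the original source):
from torch.utils.data import Dataset

class MonoT5Dataset(Dataset):
    def __init__(self, samples):
        # samples: list of (query, document, 'true'/'false') tuples
        self.samples = samples

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        query, document, label = self.samples[idx]
        return {
            "text": f"Query: {query} Document: {document} Relevant:",
            "labels": label,
        }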
Example #16
def train(args):
    logger.info("Loading tokenizer...\n")
    global tokenizer
    global model_name
    model_name = args.model_name
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    logger.info("Loading pretrained model\n")
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    logger.info("Pretrained model loaded\n")

    logger.info("Fetching and tokenizing data for training")
    train_dataset = load_and_tokenize_dataset(
        args.train_data_dir,
        "train",
        args.text_column,
        args.target_column,
        args.max_source,
        args.max_target,
    )

    logger.info("Tokenizing data for training loaded")

    eval_dataset = load_and_tokenize_dataset(
        args.train_data_dir,
        "validation",
        args.text_column,
        args.target_column,
        args.max_source,
        args.max_target,
    )
    test_dataset = load_and_tokenize_dataset(
        args.train_data_dir,
        "test",
        args.text_column,
        args.target_column,
        args.max_source,
        args.max_target,
    )

    logger.info("Defining training arguments\n")
    training_args = Seq2SeqTrainingArguments(
        output_dir=args.model_dir,
        num_train_epochs=args.epoch,
        per_device_train_batch_size=args.train_batch_size,
        per_device_eval_batch_size=args.eval_batch_size,
        learning_rate=args.lr,
        warmup_steps=args.warmup_steps,
        weight_decay=args.weight_decay,
        logging_dir=args.log_dir,
        logging_strategy=args.logging_strategy,
        load_best_model_at_end=True,
        adafactor=True,
        do_train=True,
        do_eval=True,
        do_predict=True,
        save_total_limit=3,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        predict_with_generate=True,
        metric_for_best_model="eval_loss",
        seed=7,
    )

    logger.info("Defining seq2seq Trainer")
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    logger.info("Starting Training")
    trainer.train()
    logger.info("Model trained successfully")
    trainer.save_model()
    logger.info("Model saved successfully")

    # Evaluation
    logger.info("*** Evaluate on test set***")

    logger.info(trainer.predict(test_dataset))

    logger.info("Removing unused checkpoints to save space in container")
    os.system(f"rm -rf {args.model_dir}/checkpoint-*/")
Example #17
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

# model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

batch_size = 16
args = Seq2SeqTrainingArguments(
    "test-summarization",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions,
                                           skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
Example #18
    # print(f'fuse 12: {(fuse_12_weight == fused_model.model.encoder.layers[-1].fuse_layer.weight.data.detach()).all()}')

# Freeze M2M layers before 12th encoder layer
modules = [fused_model.model.shared, *fused_model.model.encoder.layers[:11]]
for module in modules:
    for param in module.parameters():
        param.requires_grad = False

# Train
batch_size = args.batch_size
trainer_args = Seq2SeqTrainingArguments(
    args.checkpoint_path,
    evaluation_strategy="steps",
    learning_rate=args.learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=args.weight_decay,
    save_total_limit=3,
    num_train_epochs=args.num_train_epochs,
    predict_with_generate=True,
    fp16=True,
)
data_collator = DataCollatorForSeq2Seq(m2m_tokenizer, model=fused_model)

trainer = Seq2SeqTrainer(
    fused_model,
    trainer_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
    tokenizer=m2m_tokenizer,
)
Example #19
    def test_finetune_bert2bert(self):
        bert2bert = EncoderDecoderModel.from_encoder_decoder_pretrained("prajjwal1/bert-tiny", "prajjwal1/bert-tiny")
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

        bert2bert.config.vocab_size = bert2bert.config.encoder.vocab_size
        bert2bert.config.eos_token_id = tokenizer.sep_token_id
        bert2bert.config.decoder_start_token_id = tokenizer.cls_token_id
        bert2bert.config.max_length = 128

        train_dataset = datasets.load_dataset("cnn_dailymail", "3.0.0", split="train[:1%]")
        val_dataset = datasets.load_dataset("cnn_dailymail", "3.0.0", split="validation[:1%]")

        train_dataset = train_dataset.select(range(32))
        val_dataset = val_dataset.select(range(16))

        batch_size = 4

        def _map_to_encoder_decoder_inputs(batch):
            # Tokenizer will automatically set [BOS] <text> [EOS]
            inputs = tokenizer(batch["article"], padding="max_length", truncation=True, max_length=512)
            outputs = tokenizer(batch["highlights"], padding="max_length", truncation=True, max_length=128)
            batch["input_ids"] = inputs.input_ids
            batch["attention_mask"] = inputs.attention_mask

            batch["decoder_input_ids"] = outputs.input_ids
            batch["labels"] = outputs.input_ids.copy()
            batch["labels"] = [
                [-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in batch["labels"]
            ]
            batch["decoder_attention_mask"] = outputs.attention_mask

            assert all([len(x) == 512 for x in inputs.input_ids])
            assert all([len(x) == 128 for x in outputs.input_ids])

            return batch

        def _compute_metrics(pred):
            labels_ids = pred.label_ids
            pred_ids = pred.predictions

            # all unnecessary tokens are removed
            pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
            label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

            accuracy = sum([int(pred_str[i] == label_str[i]) for i in range(len(pred_str))]) / len(pred_str)

            return {"accuracy": accuracy}

        # map train dataset
        train_dataset = train_dataset.map(
            _map_to_encoder_decoder_inputs,
            batched=True,
            batch_size=batch_size,
            remove_columns=["article", "highlights"],
        )
        train_dataset.set_format(
            type="torch",
            columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
        )

        # same for validation dataset
        val_dataset = val_dataset.map(
            _map_to_encoder_decoder_inputs,
            batched=True,
            batch_size=batch_size,
            remove_columns=["article", "highlights"],
        )
        val_dataset.set_format(
            type="torch",
            columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
        )

        output_dir = self.get_auto_remove_tmp_dir()

        training_args = Seq2SeqTrainingArguments(
            output_dir=output_dir,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            predict_with_generate=True,
            evaluation_strategy="steps",
            do_train=True,
            do_eval=True,
            warmup_steps=0,
            eval_steps=2,
            logging_steps=2,
        )

        # instantiate trainer
        trainer = Seq2SeqTrainer(
            model=bert2bert,
            args=training_args,
            compute_metrics=_compute_metrics,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=tokenizer,
        )

        # start training
        trainer.train()
Example #20
                 for i, pred in enumerate(predictions)]
    references = [{'id': str(i), 'reference': ref.strip().lower()} \
                for i, ref in enumerate(references)]'''

model = MT5ForConditionalGeneration.from_pretrained('mt5small')
'''device = torch.device("cpu")
model.to(device)
print(next(model.parameters()).device)'''

training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    warmup_steps=WARMUP_STEPS,
    gradient_accumulation_steps=8,
#    weight_decay=WEIGHT_DECAY,
    logging_dir='./logs/',
    evaluation_strategy="epoch",
    logging_steps=LOGGING_STEPS,
    learning_rate=LEARNING_RATE,
    predict_with_generate=True,
)

model.get_output_embeddings().weight.requires_grad=False
model.get_input_embeddings().weight.requires_grad=False

trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=dataset['train'],