Example #1
def main(config):
    os.environ["WANDB_WATCH"] = "False"  # To disable Huggingface logging

    auto_generated_dir = os.getcwd()
    log.info(f"Work dir: {auto_generated_dir}")
    os.chdir(hydra.utils.get_original_cwd())

    wandb_run = init_wandb(auto_generated_dir, config)

    args_train = TrainingArguments(output_dir=auto_generated_dir)
    args_train = update_config(args_train, config.training)

    args_data = DataTrainingArguments(task_name=config.data.task_name,
                                      data_dir=config.data.data_dir)
    args_data = update_config(args_data, config.data)

    train_eval_glue_model(config, args_train, args_data, auto_generated_dir)
Example #2
    def test_custom_optimizer(self):
        train_dataset = RegressionDataset()
        args = TrainingArguments("./regression")
        model = RegressionModel()
        optimizer = torch.optim.SGD(model.parameters(), lr=1.0)
        lr_scheduler = torch.optim.lr_scheduler.LambdaLR(
            optimizer, lr_lambda=lambda x: 1.0)
        trainer = Trainer(model,
                          args,
                          train_dataset=train_dataset,
                          optimizers=(optimizer, lr_scheduler))
        trainer.train()

        self.assertTrue(torch.abs(trainer.model.a - 1.8950) < 1e-4)
        self.assertTrue(torch.abs(trainer.model.b - 2.5656) < 1e-4)
        self.assertEqual(
            trainer.optimizer.state_dict()["param_groups"][0]["lr"], 1.0)
Example #3
def load_training_arguments():
    training_arguments = TrainingArguments(output_dir=recognizer_dir,
                                           group_by_length=True,
                                           per_device_train_batch_size=16,
                                           gradient_accumulation_steps=2,
                                           evaluation_strategy='steps',
                                           num_train_epochs=30,
                                           gradient_checkpointing=True,
                                           fp16=True,
                                           save_steps=400,
                                           eval_steps=400,
                                           logging_steps=400,
                                           learning_rate=3e-4,
                                           warmup_steps=500,
                                           save_total_limit=2,
                                           push_to_hub=False)
    return training_arguments
Example #4
def _get_training_args(dataclass_args, output_path):
    """
    :param dataclass_args: a dataclass of arguments for training
    :param output_path: A string to a temporary directory
    :return: A TrainingArguments object
    """
    return TrainingArguments(
        output_dir=output_path,
        learning_rate=dataclass_args.learning_rate,
        weight_decay=dataclass_args.weight_decay,
        adam_beta1=dataclass_args.adam_beta1,
        adam_beta2=dataclass_args.adam_beta2,
        adam_epsilon=dataclass_args.adam_epsilon,
        max_grad_norm=dataclass_args.max_grad_norm,
        num_train_epochs=dataclass_args.num_train_epochs,
        report_to=["none"],
        per_device_train_batch_size=dataclass_args.batch_size)
Example #5
    def test_model_init(self):
        train_dataset = RegressionDataset()
        args = TrainingArguments("./regression", learning_rate=0.1)
        trainer = Trainer(args=args,
                          train_dataset=train_dataset,
                          model_init=lambda: RegressionModel())
        trainer.train()
        self.check_trained_model(trainer.model)

        # Re-training should restart from scratch, thus lead to the same results.
        trainer.train()
        self.check_trained_model(trainer.model)

        # Re-training should restart from scratch, thus lead to the same results, and the new seed should be used.
        trainer.args.seed = 314
        trainer.train()
        self.check_trained_model(trainer.model, alternate_seed=True)
Example #6
def get_regression_trainer(a=0, b=0, train_len=64, eval_len=64, **kwargs):
    train_dataset = RegressionDataset(length=train_len)
    eval_dataset = RegressionDataset(length=eval_len)
    model = RegressionModel(a, b)
    compute_metrics = kwargs.pop("compute_metrics", None)
    data_collator = kwargs.pop("data_collator", None)
    optimizers = kwargs.pop("optimizers", (None, None))
    args = TrainingArguments("./regression", **kwargs)
    return Trainer(
        model,
        args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
        optimizers=optimizers,
    )
Example #7
    def test_trainer_eval_mrpc(self):
        MODEL_ID = "bert-base-cased-finetuned-mrpc"
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
        model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
        data_args = GlueDataTrainingArguments(
            task_name="mrpc",
            data_dir="./tests/fixtures/tests_samples/MRPC",
            overwrite_cache=True)
        eval_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="dev")

        training_args = TrainingArguments(output_dir="./examples",
                                          no_cuda=True)
        trainer = Trainer(model=model,
                          args=training_args,
                          eval_dataset=eval_dataset)
        result = trainer.evaluate()
        self.assertLess(result["eval_loss"], 0.2)
Example #8
    def train(self,
              num_train_epochs=10,
              learning_rate=1e-5,
              weight_decay=1e-2,
              per_device_train_batch_size=16,
              per_device_eval_batch_size=16):
        def compute_metrics(p):
            predictions, labels = p
            true_labels, true_predictions = self.process_pred_labels(
                predictions, labels)
            results = self.metric.compute(predictions=true_predictions,
                                          references=true_labels)
            return {
                "precision": results["overall_precision"],
                "recall": results["overall_recall"],
                "f1": results["overall_f1"],
                "accuracy": results["overall_accuracy"],
            }

        if self.pretrained:
            tokenized_datasets = self.datasets.map(
                self.tokenize_and_align_labels, batched=True)
            train_args = TrainingArguments(
                "{}/exp".format(self.classifier_dir),
                evaluation_strategy="epoch",
                learning_rate=learning_rate,
                per_device_train_batch_size=per_device_train_batch_size,
                per_device_eval_batch_size=per_device_eval_batch_size,
                num_train_epochs=num_train_epochs,
                weight_decay=weight_decay,
                load_best_model_at_end=True)

            trainer = Trainer(self.model,
                              train_args,
                              train_dataset=tokenized_datasets["train"],
                              eval_dataset=tokenized_datasets["validation"],
                              data_collator=self.data_collator,
                              tokenizer=self.tokenizer,
                              compute_metrics=compute_metrics)
            # fine-tune the model
            trainer.train()
            trainer.save_model(self.classifier_dir)
            print("Trainer is saved to ", self.classifier_dir)
        else:
            print("Classifier is disabled. No need to train!")
Example #9
def train_model():
    set_seed(1)

    model_name = 'bert-base-uncased'
    max_length = 512

    tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)

    (train_x, test_x, train_y, test_y), label_names = preprocess_dataset()

    train_encodings = tokenizer(train_x, truncation=True, padding=True, max_length=max_length)
    test_encodings = tokenizer(test_x, truncation=True, padding=True, max_length=max_length)

    train_dataset = TorchDataset(train_encodings, train_y)
    test_dataset = TorchDataset(test_encodings, test_y)

    model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(label_names))#.to('cuda')

    training_arguments = TrainingArguments(
        output_dir=os.path.join(os.path.dirname(os.path.abspath(__file__)), 'results'),
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=20,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir=os.path.join(os.path.dirname(os.path.abspath(__file__)), 'logs'),
        load_best_model_at_end=True,
        logging_steps=200,
        evaluation_strategy='steps'
    )

    trainer = Trainer(
        model=model,
        args=training_arguments,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics
    )

    trainer.train()
    trainer.evaluate()

    model_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'models', 'political_tweets_bert-base-uncased_3')
    model.save_pretrained(model_path)
    tokenizer.save_pretrained(model_path)
Example #10
def main():
    args = parse_arguments()
    if args.input_model is None:
        model = GPT2LMHeadModel.from_pretrained("antoiloui/belgpt2")
    else:
        print('Loading pre-trained model')
        model = GPT2LMHeadModel.from_pretrained(args.input_model)

    tokenizer = GPT2Tokenizer.from_pretrained("antoiloui/belgpt2")

    training_args = TrainingArguments(
        output_dir=args.output_dir + '_checkpoint',  # output directory
        num_train_epochs=3,  # total number of training epochs
        per_device_train_batch_size=64,  # batch size per device during training
        warmup_steps=100,  # number of warmup steps for learning rate scheduler
        weight_decay=0.01,  # strength of weight decay
        logging_dir='./logs_hyca',  # directory for storing logs
        logging_steps=100,
    )
    special_tokens_dict = {
        'bos_token': '<BOS>',
        'eos_token': '<EOS>',
        'pad_token': '<PAD>'
    }
    tokenizer.add_special_tokens(special_tokens_dict)

    model.resize_token_embeddings(len(tokenizer))
    dataset = LineByLineTextDataset(tokenizer=tokenizer,
                                    file_path=args.input_file,
                                    block_size=32)

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=False)

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )
    if args.input_model is not None:
        trainer.train(resume_from_checkpoint=args.input_model + '_checkpoint')
    else:
        trainer.train()
    model.save_pretrained(args.output_dir)
Example #11
def train(X_train, y_train, y_column_name, model_name=None):
    eval_dataset = y_train[y_column_name]

    model_args = ModelArguments(model_name_or_path="distilbert-base-cased", )
    global data_args
    data_args = DataTrainingArguments(task_name="mnli",
                                      data_dir="../../datasets/Newswire")
    num_labels = glue_tasks_num_labels[data_args.task_name]
    training_args = TrainingArguments(
        output_dir=model_name,
        overwrite_output_dir=True,
        do_train=True,
        do_eval=True,
        per_gpu_train_batch_size=32,
        per_gpu_eval_batch_size=128,
        num_train_epochs=1,
        logging_steps=500,
        logging_first_step=True,
        save_steps=1000,
        evaluate_during_training=True,
    )

    config = AutoConfig.from_pretrained(
        model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, )
    model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        config=config,
    )

    train_dataset = GlueDataset(data_args,
                                tokenizer=tokenizer,
                                limit_length=100_000)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )
    trainer.train()
Example #12
def get_trainer(train_dataset, collator, model):
    training_args = TrainingArguments(
        output_dir='output/bash',
        overwrite_output_dir=True,
        do_train=True,
        no_cuda=cfg('device') == 'cpu',
        num_train_epochs=cfg('epochs'),
        per_device_train_batch_size=cfg('batch_size'),
        gradient_accumulation_steps=cfg('grad_acc'),
        logging_steps=5,
        save_steps=0,
        seed=random.randint(0, 2**32 - 1))
    trainer = MTrainer(model=model,
                       args=training_args,
                       data_collator=collator,
                       train_dataset=train_dataset,
                       prediction_loss_only=True)
    return trainer
Example #13
def get_regression_trainer(a=0, b=0, double_output=False, train_len=64, eval_len=64, **kwargs):
    label_names = kwargs.get("label_names", None)
    train_dataset = RegressionDataset(length=train_len, label_names=label_names)
    eval_dataset = RegressionDataset(length=eval_len, label_names=label_names)
    model = RegressionModel(a, b, double_output)
    compute_metrics = kwargs.pop("compute_metrics", None)
    data_collator = kwargs.pop("data_collator", None)
    optimizers = kwargs.pop("optimizers", (None, None))
    args = TrainingArguments("./regression", **kwargs)
    return Trainer(
        model,
        args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
        optimizers=optimizers,
    )
Example #14
def get_train_args(lr=1e-4):
    train_root_path = Path('experiments/transformers/bert') / bert_model_size_type / tokenizer_type
    p = train_root_path / f'bert-{bert_model_size_type}-{tokenizer_type}-{data_source_name}-{vocab_size}-05-64'
    # p = train_root_path / f'bert-{bert_model_size_type}-{tokenizer_type}-{data_source_name}-{vocab_size}-05-128'
    # p = train_root_path / f'bert-{bert_model_size_type}-{tokenizer_type}-{data_source_name}-{vocab_size}-05'
    p.mkdir(parents=True, exist_ok=True)
    return TrainingArguments(
        output_dir=str(p),
        overwrite_output_dir=True,
        num_train_epochs=5,
        per_device_train_batch_size=48,
        gradient_accumulation_steps=5,
        save_total_limit=0,
        save_steps=0,
        learning_rate=lr,
        # fp16=True,
        dataloader_num_workers=8
    )
Example #15
def train_from_feedback(epochs=1):
    train_data = load_feedback()
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=epochs,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
    )

    trainer = Trainer(model=model,
                      args=training_args,
                      compute_metrics=compute_metrics,
                      train_dataset=train_data)

    trainer.train()
    model.save_pretrained(SAVE_LOCATION)
    return "MODEL UPDATED"
Example #16
    def train_no_evaluate(self) -> None:
        """Train a BERT-based model, using the training set to train.
        """
        assert self.train_dataset is not None, "train_file was not provided!"

        self.trainer = Trainer(
            model=self.model,
            args=TrainingArguments(
                do_train=True,
                output_dir=self.output_dir,
                overwrite_output_dir=True,
                num_train_epochs=self.num_train_epochs,
            ),
            train_dataset=self.train_dataset,
        )
        self.trainer.train(model_path=self.model_path)
        self.trainer.save_model()
        self.tokenizer.save_pretrained(self.trainer.args.output_dir)
Example #17
    def test_custom_optimizer(self):
        train_dataset = RegressionDataset()
        args = TrainingArguments("./regression")
        model = RegressionModel()
        optimizer = torch.optim.SGD(model.parameters(), lr=1.0)
        lr_scheduler = torch.optim.lr_scheduler.LambdaLR(
            optimizer, lr_lambda=lambda x: 1.0)
        trainer = Trainer(model,
                          args,
                          train_dataset=train_dataset,
                          optimizers=(optimizer, lr_scheduler))
        trainer.train()

        (a, b) = self.default_trained_model
        self.assertFalse(torch.allclose(trainer.model.a, a))
        self.assertFalse(torch.allclose(trainer.model.b, b))
        self.assertEqual(
            trainer.optimizer.state_dict()["param_groups"][0]["lr"], 1.0)
Example #18
    def test_parallel_training(self):
        tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name, use_fast=False)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelWithHeads.from_config(self.config())

        model.add_adapter("mrpc1")
        model.add_adapter("mrpc2")
        self.add_head(model, "mrpc1", num_labels=2)
        self.add_head(model, "mrpc2", num_labels=3)
        model.active_adapters = Parallel("mrpc1", "mrpc2")
        model.train_adapter(Parallel("mrpc1", "mrpc2"))
        # model.eval()

        # all weights of the mrpc1 adapter should be trainable
        for k, v in filter_parameters(model, "adapters.mrpc1.").items():
            self.assertTrue(v.requires_grad, k)
        # mrpc2 is also trained in the Parallel setup, so its weights should be trainable as well
        for k, v in filter_parameters(model, "adapters.mrpc2.").items():
            self.assertTrue(v.requires_grad, k)
        # weights of the base model should be frozen (check on some examples)
        for k, v in filter_parameters(model, "encoder.layer.0.attention").items():
            self.assertFalse(v.requires_grad, k)

        state_dict_pre = copy.deepcopy(model.state_dict())

        train_dataset = self.dataset(tokenizer)
        training_args = TrainingArguments(
            output_dir="./examples", do_train=True, learning_rate=0.1, max_steps=10, no_cuda=True
        )

        # train
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
        )
        trainer.train()

        for ((k1, v1), (k2, v2)) in zip(state_dict_pre.items(), model.state_dict().items()):
            if "mrpc" in k1:
                self.assertFalse(torch.equal(v1, v2), k1)
            else:
                self.assertTrue(torch.equal(v1, v2))
Example #19
def finetune(tag):
    """fine-tune gpt2 on the given caption dataset"""
    global tokenizer
    config = AutoConfig.from_pretrained('gpt2')
    model = AutoModelWithLMHead.from_pretrained('gpt2', config=config)
    block_size = tokenizer.max_len
    # https://github.com/huggingface/transformers/blob/448c467256332e4be8c122a159b482c1ef039b98/src/transformers/data/datasets/language_modeling.py
    try:
        train_dataset = TextDataset(
            tokenizer=tokenizer,
            file_path=f'./text/training_text/{tag}.txt',
            block_size=block_size,
            overwrite_cache=True)
        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                        mlm=False)
        epochs = 8
        training_args = TrainingArguments(output_dir='logging/output',
                                          overwrite_output_dir=True,
                                          do_train=True,
                                          num_train_epochs=epochs,
                                          gradient_accumulation_steps=1,
                                          learning_rate=1e-4,
                                          per_gpu_train_batch_size=1,
                                          logging_steps=50,
                                          save_steps=0)
        set_seed(training_args.seed)
        trainer = Trainer(model=model,
                          args=training_args,
                          data_collator=data_collator,
                          train_dataset=train_dataset,
                          prediction_loss_only=True)
        with open(f'./logging/training_stats/training_{tag}.log', 'w') as log:
            sys.stdout = log
            trainer.train()
        sys.stdout = sys.__stdout__
        if not os.path.exists(f'./trained_models/{tag}/'):
            os.makedirs(f'./trained_models/{tag}/')
        # save the model
        model.save_pretrained(f'./trained_models/{tag}/')
        print('Done!')
    except AssertionError:
        print(
            f'The training text with the tag = {tag} does not exist. No model was trained!'
        )
Example #20
    def trainings_run(self, model, tokenizer):
        # setup dataset
        train_dataset = self.dataset(tokenizer)
        training_args = TrainingArguments(
            output_dir="./examples",
            do_train=True,
            learning_rate=0.1,
            max_steps=10,
            no_cuda=True,
            per_device_train_batch_size=2,
        )

        # train
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
        )
        trainer.train()
Example #21
def model_trainer(args, test_dataset):
    # model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels =4)
    model = RobertaForSequenceClassification.from_pretrained(args.model_path,
                                                             num_labels=3,
                                                             return_dict=True)

    #anfs/bigdisc/rmya2/faiss_data/model_verdict_predictor/checkpoint-1500'
    training_args = TrainingArguments(
        output_dir='./results',  # output directory
        per_device_eval_batch_size=32,  # batch size for evaluation
    )

    trainer = Trainer(
        model=model,  # the instantiated 🤗 Transformers model to be trained
        args=training_args,  # training arguments, defined above
        eval_dataset=test_dataset,  # evaluation dataset
        compute_metrics=compute_metrics,
    )
    return trainer, model
Example #22
def test_evaluation_with_keys_to_drop(self):
    config = GPT2Config(vocab_size=100,
                        n_positions=128,
                        n_ctx=128,
                        n_embd=32,
                        n_layer=3,
                        n_head=4)
    tiny_gpt2 = GPT2LMHeadModel(config)
    x = torch.randint(0, 100, (128,))
    eval_dataset = RepeatDataset(x)
    args = TrainingArguments("./test")
    trainer = Trainer(tiny_gpt2, args, eval_dataset=eval_dataset)
    # By default the past_key_values are removed
    result = trainer.predict(eval_dataset)
    self.assertTrue(isinstance(result.predictions, np.ndarray))
    # We can still get them by setting ignore_keys to []
    result = trainer.predict(eval_dataset, ignore_keys=[])
    self.assertTrue(isinstance(result.predictions, tuple))
    self.assertEqual(len(result.predictions), 2)
Example #23
def bert(training, testing_1, testing_2, fine_tune):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    testing_data_1 = np.load(testing_1)
    testing_data_2 = np.load(testing_2)
    testing_data = np.concatenate((testing_data_1, testing_data_2))
    training_data = np.load(training)
    model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True, )
    if fine_tune:
        train_data = []
        eval_data = []
        for i in range(len(training_data)):
            if i % 2 == 0 and i < len(training_data) * 0.8:
                train_data.append(training_data[i])
            elif i % 2 == 0:
                eval_data.append(training_data[i])
        inputs = tokenizer(train_data, padding="max_length", truncation=True)
        training_args = TrainingArguments(output_dir=os.path.join(os.getcwd(), "data"), do_eval=False)
        trainer = Trainer(model=model, args=training_args, train_dataset=inputs, eval_dataset=eval_data)
        trainer.train()
    output = []
    model.eval()
    for i in range(len(testing_data)):
        if i%2 == 0:
            sentence = "[CLS] " + testing_data[i] + " [SEP]"
            tokenized_sentence = tokenizer.tokenize(sentence)
            indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_sentence)
            segments_ids = [1] * len(tokenized_sentence)
            tokens_tensor = torch.tensor([indexed_tokens])
            segments_tensors = torch.tensor([segments_ids])
            with torch.no_grad():
                outputs = model(tokens_tensor, segments_tensors)
                hidden_states = outputs[2]
                token_embeddings = torch.stack(hidden_states, dim=0)
                token_embeddings = torch.squeeze(token_embeddings, dim=1)
                token_embeddings = token_embeddings.permute(1, 0, 2)
                token_vecs_cat = []
                for token in token_embeddings:
                    cat_vec = torch.cat((token[-1], token[-2], token[-3], token[-4]), dim=0)
                    token_vecs_cat.append(cat_vec)
            for i, token_str in enumerate(tokenized_sentence):
                output.append(np.array(token_vecs_cat[i]))
    return output[:len(testing_data_1)], output[len(testing_data_1):]
Example #24
def main(name):
    logging.info("Start of training")

    train_df = pd.read_json("train_processed.json")
    val_df = pd.read_json("val_processed.json")

    unique_tags = set(tag for label in train_df["label"].to_list()
                      for tag in label)
    tag2id = {tag: id for id, tag in enumerate(unique_tags)}
    id2tag = {id: tag for tag, id in tag2id.items()}
    with open(f"tag2id_{name}.json", "w", encoding="utf-8") as f:
        json.dump({"tag2id": tag2id, "id2tag": id2tag}, f)

    model, tokenizer = get_model_and_tokenizer("xlm-roberta-base",
                                               len(unique_tags))

    train_dataset = AddressDataset(train_df, tag2id, tokenizer)
    val_dataset = AddressDataset(val_df, tag2id, tokenizer)

    compute_metrics = ComputeMetrics(id2tag).compute

    training_args = TrainingArguments(output_dir=f'./results_{name}',
                                      save_steps=1000,
                                      num_train_epochs=3,
                                      per_device_train_batch_size=64,
                                      per_device_eval_batch_size=64,
                                      warmup_steps=500,
                                      weight_decay=0.01,
                                      logging_dir=f"./logs_{name}",
                                      logging_steps=10,
                                      evaluation_strategy="steps",
                                      eval_steps=500)

    trainer = Trainer(model=model,
                      args=training_args,
                      train_dataset=train_dataset,
                      eval_dataset=val_dataset,
                      tokenizer=tokenizer,
                      compute_metrics=compute_metrics)

    trainer.train()
    # trainer.evaluate()
    trainer.save_model(f"./model_{name}")
Example #25
def main():
    tokenizer = BertTokenizer.from_pretrained('vocab/bert-base-chinese-vocab.txt')

    dataset = LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path="data/dialogue_lined/multi-sents-further-pretrain/train_test_dialogues.txt",
        block_size=512,
    )

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=0.15
    )
    
    training_args = TrainingArguments(
        output_dir="model/multi-sents-test-further-pretrained-bert",
        do_train=True,
        warmup_steps=int(100 * (len(dataset) / 32) * 0.1),
        #warmup_steps=10000,
        overwrite_output_dir=True,
        num_train_epochs=100,
        #max_steps=100000,
        per_device_train_batch_size=8,
        gradient_accumulation_steps=4,
        save_steps=1000,
        logging_steps=10,
        weight_decay=0.01
    )

    model = BertForMaskedLM.from_pretrained('bert-base-chinese')
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
        prediction_loss_only=True,
    )
    
    trainer.train()
    
    trainer.save_model('model/multi-sents-test-further-pretrained-bert')
    
    return
Example #26
def train_MLM(vocf, outmodel, data_df):
    bs = 8
    # tokenizer = BertWordPieceTokenizer(vocf)  # input vocab.txt
    ttk = BertTokenizer.from_pretrained(vocf)  # input vocab.txt
    with open(vocf) as fvoc:
        vlen = len(fvoc.readlines())
    config = RobertaConfig(vocab_size=vlen, max_position_embeddings=12,
                           num_attention_heads=12, num_hidden_layers=6,
                           type_vocab_size=1, hidden_size=768)
    model = RobertaForMaskedLM(config=config)
    model.num_parameters()

    dataset = tokDataset(data_df, ttk)
    # Data = DataLoader(dataset, batch_size=bs, shuffle=True, drop_last=False,
    #                   num_workers=0, collate_fn=collate_fn)
    # data_collator = DataCollatorForLanguageModeling(
    #     tokenizer=ttk, mlm=True, mlm_probability=0.15
    # )

    data_collator = collate_fn(
        tokenizer=ttk, mlm=True, mlm_probability=0.15
    )
    training_args = TrainingArguments(
        output_dir=outmodel,  # embedding model path
        overwrite_output_dir=True,
        num_train_epochs=2,
        per_device_train_batch_size=bs,
        save_steps=10_000,
        save_total_limit=2,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=data_collator,
        prediction_loss_only=True,
    )
    trainer.train()
    trainer.save_model(outmodel)
    print('LM train done')
Example #27
    def __init__(self, opts, project_path='./'):
        self.project_path = project_path
        self.model_name = opts.model

        self.training_args = TrainingArguments(
            output_dir='./check_points',                    # output directory
            num_train_epochs=opts.epoch,                    # total number of training epochs
            per_device_train_batch_size=opts.train_bs,      # batch size per device during training
            warmup_steps=opts.warmup_steps,                 # number of warmup steps for learning rate scheduler
            weight_decay=opts.weight_decay,                 # strength of weight decay
            logging_dir='./logs',                           # directory for storing logs
            logging_steps=1000,
            learning_rate=opts.lr,
            evaluation_strategy='no',
            save_steps=1500,
        )        
        print_info('load model')
        self.load_model()
        print_info('load data')
        self.load_data()
Example #28
def train_function(train_dataset, eval_dataset=None, **config):
    model_config = AutoConfig.from_pretrained(model_checkpoint)
    model = AutoModelForCausalLM.from_config(model_config)
    training_args = TrainingArguments(
        f"{model_checkpoint}-wikitext2",
        evaluation_strategy="epoch",
        num_train_epochs=config.get("epochs", 3),
        learning_rate=2e-5,
        weight_decay=0.01,
        disable_tqdm=True,
        no_cuda=True,
        save_strategy=config.get("save_strategy", "no"),
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )
    return trainer
Example #29
    def __init__(self, model, train_dataset=None, eval_dataset=None, **kwargs):
        """Inialization method.

        Args:
            model (PreTrainedModel): Pre-trained model.
            train_dataset (Dataset): Training dataset.
            eval_dataset (Dataset): Evaluation dataset.

        """

        logger.debug('Creating runner ...')

        # Defines the arguments
        args = TrainingArguments(output_dir='./results', logging_dir='./logs', **kwargs)

        # Overrides its parent class with inputted arguments
        super(Runner, self).__init__(model, args, train_dataset=train_dataset,
                                     eval_dataset=eval_dataset, compute_metrics=compute_metrics)

        logger.debug('Runner created.')
Example #30
def load_training_arguments(experiment_name=vocab_dir):
	if not os.path.isdir(experiment_name):
		os.mkdir(experiment_name)
	training_args = TrainingArguments(
		output_dir=experiment_name,
		group_by_length=True,
		per_device_train_batch_size=30,
		gradient_accumulation_steps=2,
		evaluation_strategy="steps",
		num_train_epochs=100,
		gradient_checkpointing=True,
		fp16=True,
		save_steps=1000,
		eval_steps=1000,
		logging_steps=50,
		learning_rate=3e-4,
		warmup_steps=500,
		save_total_limit=6,
		push_to_hub=False,
	)
	return training_args