Example #1
def get_train_args(lr=1e-4):
    train_root_path = Path('experiments/transformers/bert') / bert_model_size_type / tokenizer_type
    p = train_root_path / f'bert-{bert_model_size_type}-{tokenizer_type}-{data_source_name}-{vocab_size}-05-64'
    # p = train_root_path / f'bert-{bert_model_size_type}-{tokenizer_type}-{data_source_name}-{vocab_size}-05-128'
    # p = train_root_path / f'bert-{bert_model_size_type}-{tokenizer_type}-{data_source_name}-{vocab_size}-05'
    p.mkdir(parents=True, exist_ok=True)
    return TrainingArguments(
        output_dir=str(p),
        overwrite_output_dir=True,
        num_train_epochs=5,
        per_device_train_batch_size=48,
        gradient_accumulation_steps=5,
        save_total_limit=0,
        save_steps=0,
        learning_rate=lr,
        # fp16=True,
        dataloader_num_workers=8
    )
Example #2
    def test_custom_optimizer(self):
        train_dataset = RegressionDataset()
        args = TrainingArguments("./regression")
        model = RegressionModel()
        optimizer = torch.optim.SGD(model.parameters(), lr=1.0)
        lr_scheduler = torch.optim.lr_scheduler.LambdaLR(
            optimizer, lr_lambda=lambda x: 1.0)
        trainer = Trainer(model,
                          args,
                          train_dataset=train_dataset,
                          optimizers=(optimizer, lr_scheduler))
        trainer.train()

        (a, b) = self.default_trained_model
        self.assertFalse(torch.allclose(trainer.model.a, a))
        self.assertFalse(torch.allclose(trainer.model.b, b))
        self.assertEqual(
            trainer.optimizer.state_dict()["param_groups"][0]["lr"], 1.0)
Example #3
    def train_no_evaluate(self) -> None:
        """Train a BERT-based model, using the training set to train.
        """
        assert self.train_dataset is not None, "train_file was not provided!"

        self.trainer = Trainer(
            model=self.model,
            args=TrainingArguments(
                do_train=True,
                output_dir=self.output_dir,
                overwrite_output_dir=True,
                num_train_epochs=self.num_train_epochs,
            ),
            train_dataset=self.train_dataset,
        )
        self.trainer.train(model_path=self.model_path)
        self.trainer.save_model()
        self.tokenizer.save_pretrained(self.trainer.args.output_dir)
Example #4
def finetune(tag):
    """fine-tune gpt2 on the given caption dataset"""
    global tokenizer
    config = AutoConfig.from_pretrained('gpt2')
    model = AutoModelWithLMHead.from_pretrained('gpt2', config=config)
    block_size = tokenizer.max_len
    # https://github.com/huggingface/transformers/blob/448c467256332e4be8c122a159b482c1ef039b98/src/transformers/data/datasets/language_modeling.py
    try:
        train_dataset = TextDataset(
            tokenizer=tokenizer,
            file_path=f'./text/training_text/{tag}.txt',
            block_size=block_size,
            overwrite_cache=True)
        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                        mlm=False)
        epochs = 8
        training_args = TrainingArguments(output_dir='logging/output',
                                          overwrite_output_dir=True,
                                          do_train=True,
                                          num_train_epochs=epochs,
                                          gradient_accumulation_steps=1,
                                          learning_rate=1e-4,
                                          per_gpu_train_batch_size=1,
                                          logging_steps=50,
                                          save_steps=0)
        set_seed(training_args.seed)
        trainer = Trainer(model=model,
                          args=training_args,
                          data_collator=data_collator,
                          train_dataset=train_dataset,
                          prediction_loss_only=True)
        # Redirect stdout to the training log; restore it even if training fails.
        with open(f'./logging/training_stats/training_{tag}.log', 'w') as log:
            sys.stdout = log
            try:
                trainer.train()
            finally:
                sys.stdout = sys.__stdout__
        if not os.path.exists(f'./trained_models/{tag}/'):
            os.makedirs(f'./trained_models/{tag}/')
        # save the model
        model.save_pretrained(f'./trained_models/{tag}/')
        print('Done!')
    except AssertionError:
        print(
            f'The training text with the tag = {tag} does not exist. No model was trained!'
        )
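Note: the snippet above targets an older transformers release; AutoModelWithLMHead, tokenizer.max_len and per_gpu_train_batch_size have since been deprecated. Below is a minimal sketch of the same GPT-2 fine-tuning setup against a recent transformers version; the file path and hyperparameters are placeholders, not taken from the original project.

from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          DataCollatorForLanguageModeling, TextDataset,
                          Trainer, TrainingArguments, set_seed)

tokenizer = AutoTokenizer.from_pretrained('gpt2')
model = AutoModelForCausalLM.from_pretrained('gpt2')  # replaces AutoModelWithLMHead
block_size = tokenizer.model_max_length               # replaces tokenizer.max_len

# TextDataset still works but is deprecated; the datasets library is the recommended path.
train_dataset = TextDataset(tokenizer=tokenizer,
                            file_path='./text/training_text/example.txt',  # placeholder path
                            block_size=block_size,
                            overwrite_cache=True)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
training_args = TrainingArguments(output_dir='logging/output',
                                  overwrite_output_dir=True,
                                  num_train_epochs=8,
                                  learning_rate=1e-4,
                                  per_device_train_batch_size=1,  # replaces per_gpu_train_batch_size
                                  logging_steps=50,
                                  save_strategy='no')
set_seed(training_args.seed)
trainer = Trainer(model=model, args=training_args,
                  data_collator=data_collator, train_dataset=train_dataset)
trainer.train()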
Example #5
    def test_parallel_training(self):
        tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name, use_fast=False)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelWithHeads.from_config(self.config())

        model.add_adapter("mrpc1")
        model.add_adapter("mrpc2")
        self.add_head(model, "mrpc1", num_labels=2)
        self.add_head(model, "mrpc2", num_labels=3)
        model.active_adapters = Parallel("mrpc1", "mrpc2")
        model.train_adapter(Parallel("mrpc1", "mrpc2"))
        # model.eval()

        # all weights of the first adapter should be activated
        for k, v in filter_parameters(model, "adapters.mrpc1.").items():
            self.assertTrue(v.requires_grad, k)
        # both adapters are trained in parallel, so the second adapter should be activated as well
        for k, v in filter_parameters(model, "adapters.mrpc2.").items():
            self.assertTrue(v.requires_grad, k)
        # weights of the base model should be frozen (check on some examples)
        for k, v in filter_parameters(model, "encoder.layer.0.attention").items():
            self.assertFalse(v.requires_grad, k)

        state_dict_pre = copy.deepcopy(model.state_dict())

        train_dataset = self.dataset(tokenizer)
        training_args = TrainingArguments(
            output_dir="./examples", do_train=True, learning_rate=0.1, max_steps=10, no_cuda=True
        )

        # evaluate
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
        )
        trainer.train()

        for ((k1, v1), (k2, v2)) in zip(state_dict_pre.items(), model.state_dict().items()):
            if "mrpc" in k1:
                self.assertFalse(torch.equal(v1, v2), k1)
            else:
                self.assertTrue(torch.equal(v1, v2))
Example #6
def model_trainer(args, test_dataset):
    # model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels =4)
    model = RobertaForSequenceClassification.from_pretrained(args.model_path,
                                                             num_labels=3,
                                                             return_dict=True)

    #anfs/bigdisc/rmya2/faiss_data/model_verdict_predictor/checkpoint-1500'
    training_args = TrainingArguments(
        output_dir='./results',  # output directory
        per_device_eval_batch_size=32,  # batch size for evaluation
    )

    trainer = Trainer(
        model=model,  # the instantiated 🤗 Transformers model to be trained
        args=training_args,  # training arguments, defined above
        eval_dataset=test_dataset,  # evaluation dataset
        compute_metrics=compute_metrics,
    )
    return trainer, model
Example #7
    def test_trainer_eval_mrpc(self):
        MODEL_ID = "bert-base-cased-finetuned-mrpc"
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
        model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
        data_args = GlueDataTrainingArguments(
            task_name="mrpc",
            data_dir="./examples/tests_samples/MRPC",
            overwrite_cache=True)
        eval_dataset = GlueDataset(data_args,
                                   tokenizer=tokenizer,
                                   evaluate=True)

        training_args = TrainingArguments(output_dir="./examples",
                                          no_cuda=True)
        trainer = Trainer(model=model,
                          args=training_args,
                          eval_dataset=eval_dataset)
        result = trainer.evaluate()
        self.assertLess(result["loss"], 0.2)
Example #8
def main(name):
    logging.info("Start of training")

    train_df = pd.read_json("train_processed.json")
    val_df = pd.read_json("val_processed.json")

    unique_tags = set(tag for label in train_df["label"].to_list()
                      for tag in label)
    tag2id = {tag: id for id, tag in enumerate(unique_tags)}
    id2tag = {id: tag for tag, id in tag2id.items()}
    with open(f"tag2id_{name}.json", "w", encoding="utf-8") as f:
        json.dump({"tag2id": tag2id, "id2tag": id2tag}, f)

    model, tokenizer = get_model_and_tokenizer("xlm-roberta-base",
                                               len(unique_tags))

    train_dataset = AddressDataset(train_df, tag2id, tokenizer)
    val_dataset = AddressDataset(val_df, tag2id, tokenizer)

    compute_metrics = ComputeMetrics(id2tag).compute

    training_args = TrainingArguments(output_dir=f'./results_{name}',
                                      save_steps=1000,
                                      num_train_epochs=3,
                                      per_device_train_batch_size=64,
                                      per_device_eval_batch_size=64,
                                      warmup_steps=500,
                                      weight_decay=0.01,
                                      logging_dir=f"./logs_{name}",
                                      logging_steps=10,
                                      evaluation_strategy="steps",
                                      eval_steps=500)

    trainer = Trainer(model=model,
                      args=training_args,
                      train_dataset=train_dataset,
                      eval_dataset=val_dataset,
                      tokenizer=tokenizer,
                      compute_metrics=compute_metrics)

    trainer.train()
    # trainer.evaluate()
    trainer.save_model(f"./model_{name}")
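Note: json.dump converts the integer keys of id2tag to strings, so the mapping saved above has to be repaired when it is read back. A small sketch follows; the file name is a placeholder.

import json

with open("tag2id_example.json", encoding="utf-8") as f:  # placeholder file name
    mappings = json.load(f)
tag2id = mappings["tag2id"]
# JSON object keys are always strings; restore the integer keys of id2tag.
id2tag = {int(k): v for k, v in mappings["id2tag"].items()}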
Example #9
def bert(training, testing_1, testing_2, fine_tune):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    testing_data_1 = np.load(testing_1)
    testing_data_2 = np.load(testing_2)
    testing_data = np.concatenate((testing_data_1, testing_data_2))
    training_data = np.load(training)
    model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True, )
    if fine_tune:
        train_data = []
        eval_data = []
        for i in range(len(training_data)):
            if i % 2 == 0 and i < len(training_data)*0.8:
                train_data.append(training_data[i])
            else:
                if i % 2 == 0:
                    eval_data.append(training_data[i])
        inputs = tokenizer(train_data, padding="max_length", truncation=True)
        training_args = TrainingArguments(output_dir=os.getcwd() + "\\data\\", do_eval=False)
        trainer = Trainer(model=model, args=training_args, train_dataset=inputs, eval_dataset=eval_data)
        trainer.train()
    output = []
    model.eval()
    for i in range(len(testing_data)):
        if i%2 == 0:
            sentence = "[CLS] " + testing_data[i] + " [SEP]"
            tokenized_sentence = tokenizer.tokenize(sentence)
            indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_sentence)
            segments_ids = [1] * len(tokenized_sentence)
            tokens_tensor = torch.tensor([indexed_tokens])
            segments_tensors = torch.tensor([segments_ids])
            with torch.no_grad():
                outputs = model(tokens_tensor, segments_tensors)
                hidden_states = outputs[2]
                token_embeddings = torch.stack(hidden_states, dim=0)
                token_embeddings = torch.squeeze(token_embeddings, dim=1)
                token_embeddings = token_embeddings.permute(1, 0, 2)
                token_vecs_cat = []
                for token in token_embeddings:
                    cat_vec = torch.cat((token[-1], token[-2], token[-3], token[-4]), dim=0)
                    token_vecs_cat.append(cat_vec)
            for i, token_str in enumerate(tokenized_sentence):
                output.append(np.array(token_vecs_cat[i]))
    return output[:len(testing_data_1)], output[len(testing_data_1):]
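Note: the embedding loop above concatenates the last four hidden layers per token using the old tuple-style model outputs. A compact sketch of the same technique with the current dict-style outputs; the model name and input sentence are illustrative only.

import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
model.eval()

encoded = tokenizer("a short example sentence", return_tensors="pt")
with torch.no_grad():
    outputs = model(**encoded)

# hidden_states is a tuple of (num_layers + 1) tensors of shape [1, seq_len, hidden_size];
# concatenating the last four layers gives one [seq_len, 4 * hidden_size] vector per token.
token_vecs_cat = torch.cat(outputs.hidden_states[-4:], dim=-1).squeeze(0)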
Example #10
    def test_evaluation_with_keys_to_drop(self):
        config = GPT2Config(vocab_size=100,
                            n_positions=128,
                            n_ctx=128,
                            n_embd=32,
                            n_layer=3,
                            n_head=4)
        tiny_gpt2 = GPT2LMHeadModel(config)
        x = torch.randint(0, 100, (128,))
        eval_dataset = RepeatDataset(x)
        args = TrainingArguments("./test")
        trainer = Trainer(tiny_gpt2, args, eval_dataset=eval_dataset)
        # By default the past_key_values are removed
        result = trainer.predict(eval_dataset)
        self.assertTrue(isinstance(result.predictions, np.ndarray))
        # We can still get them by setting ignore_keys to []
        result = trainer.predict(eval_dataset, ignore_keys=[])
        self.assertTrue(isinstance(result.predictions, tuple))
        self.assertEqual(len(result.predictions), 2)
Example #11
    def trainings_run(self, model, tokenizer):
        # setup dataset
        train_dataset = self.dataset(tokenizer)
        training_args = TrainingArguments(
            output_dir="./examples",
            do_train=True,
            learning_rate=0.1,
            max_steps=10,
            no_cuda=True,
            per_device_train_batch_size=2,
        )

        # evaluate
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
        )
        trainer.train()
Example #12
def main():
    tokenizer = BertTokenizer.from_pretrained('vocab/bert-base-chinese-vocab.txt')

    dataset = LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path="data/dialogue_lined/multi-sents-further-pretrain/train_test_dialogues.txt",
        block_size=512,
    )

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=0.15
    )
    
    training_args = TrainingArguments(
        output_dir="model/multi-sents-test-further-pretrained-bert",
        do_train=True,
        warmup_steps=int(100 * (len(dataset) / 32) * 0.1),
        #warmup_steps=10000,
        overwrite_output_dir=True,
        num_train_epochs=100,
        #max_steps=100000,
        per_device_train_batch_size=8,
        gradient_accumulation_steps=4,
        save_steps=1000,
        logging_steps=10,
        weight_decay=0.01
    )

    model = BertForMaskedLM.from_pretrained('bert-base-chinese')
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
        prediction_loss_only=True,
    )
    
    trainer.train()
    
    trainer.save_model('model/multi-sents-test-further-pretrained-bert')
    
    return
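Note: the warmup_steps expression above sets warmup to roughly 10% of the total number of optimizer steps, assuming a single device with an effective batch of per_device_train_batch_size * gradient_accumulation_steps = 8 * 4 = 32 and 100 epochs. A small helper that makes the arithmetic explicit; the dataset length below is only a worked example.

import math

def warmup_steps_for(dataset_len, epochs=100, effective_batch=32, warmup_ratio=0.1):
    steps_per_epoch = math.ceil(dataset_len / effective_batch)
    return int(epochs * steps_per_epoch * warmup_ratio)

print(warmup_steps_for(3200))  # 3200 lines -> 100 steps/epoch -> 10000 total -> 1000 warmup steps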
Example #13
def train_MLM(vocf, outmodel, data_df):
    bs = 8
    # tokenizer = BertWordPieceTokenizer(vocf)  # input vocab.txt
    ttk = BertTokenizer.from_pretrained(vocf)  # input vocab.txt
    with open(vocf) as fvoc:
        vlen = len(fvoc.readlines())
    config = RobertaConfig(vocab_size=vlen, max_position_embeddings=12,
                           num_attention_heads=12, num_hidden_layers=6,
                           type_vocab_size=1, hidden_size=768)
    model = RobertaForMaskedLM(config=config)
    model.num_parameters()

    dataset = tokDataset(data_df, ttk)
    # Data = DataLoader(dataset, batch_size=bs, shuffle=True, drop_last=False,
    #                   num_workers=0, collate_fn=collate_fn)
    # data_collator = DataCollatorForLanguageModeling(
    #     tokenizer=ttk, mlm=True, mlm_probability=0.15
    # )

    data_collator = collate_fn(
        tokenizer=ttk, mlm=True, mlm_probability=0.15
    )
    training_args = TrainingArguments(
        output_dir=outmodel,  # embedding model path
        overwrite_output_dir=True,
        num_train_epochs=2,
        per_device_train_batch_size=bs,
        save_steps=10_000,
        save_total_limit=2,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=data_collator,
        prediction_loss_only=True
    )
    trainer.train()
    trainer.save_model(outmodel)
    print('LM train done: ')
Example #14
    def __init__(self, opts, project_path='./'):
        self.project_path = project_path
        self.model_name = opts.model

        self.training_args = TrainingArguments(
            output_dir='./check_points',                    # output directory
            num_train_epochs=opts.epoch,                    # total number of training epochs
            per_device_train_batch_size=opts.train_bs,      # batch size per device during training
            warmup_steps=opts.warmup_steps,                 # number of warmup steps for learning rate scheduler
            weight_decay=opts.weight_decay,                 # strength of weight decay
            logging_dir='./logs',                           # directory for storing logs
            logging_steps=1000,
            learning_rate=opts.lr,
            evaluation_strategy='no',
            save_steps=1500,
        )        
        print_info('load model')
        self.load_model()
        print_info('load data')
        self.load_data()
Example #15
    def __init__(self, model, train_dataset=None, eval_dataset=None, **kwargs):
        """Inialization method.

        Args:
            model (PreTrainedModel): Pre-trained model.
            train_dataset (Dataset): Training dataset.
            eval_dataset (Dataset): Evaluation dataset.

        """

        logger.debug('Creating runner ...')

        # Defines the arguments
        args = TrainingArguments(output_dir='./results', logging_dir='./logs', **kwargs)

        # Overrides its parent class with inputted arguments
        super(Runner, self).__init__(model, args, train_dataset=train_dataset,
                                     eval_dataset=eval_dataset, compute_metrics=compute_metrics)

        logger.debug('Runner created.')
Example #16
def train_function(train_dataset, eval_dataset=None, **config):
    model_config = AutoConfig.from_pretrained(model_checkpoint)
    model = AutoModelForCausalLM.from_config(model_config)
    training_args = TrainingArguments(
        f"{model_checkpoint}-wikitext2",
        evaluation_strategy="epoch",
        num_train_epochs=config.get("epochs", 3),
        learning_rate=2e-5,
        weight_decay=0.01,
        disable_tqdm=True,
        no_cuda=True,
        save_strategy=config.get("save_strategy", "no"),
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )
    return trainer
Example #17
def load_training_arguments(experiment_name=vocab_dir):
	if not os.path.isdir(experiment_name):
		os.mkdir(experiment_name)
	training_args = TrainingArguments(
		output_dir=experiment_name,
		group_by_length=True,
		per_device_train_batch_size=30,
		gradient_accumulation_steps=2,
		evaluation_strategy="steps",
		num_train_epochs=100,
		gradient_checkpointing=True,
		fp16=True,
		save_steps=1000,
		eval_steps=1000,
		logging_steps=50,
		learning_rate=3e-4,
		warmup_steps=500,
		save_total_limit=6,
		push_to_hub=False,
	)
	return training_args
Example #18
    def rl_train_step(self, sess, x, rewards, baseline, offpolicy,
                      decay_weight, learn_rate):
        """
        These tensors are defined in `__init__`.
        choice_a = ratio * (self.rewards - accumlated_pred * self.decay_weight - self.baseline)
        choice_b = clipped_ratio * (self.rewards - accumlated_pred * self.decay_weight - self.baseline)
        self.g_loss = - tf.reduce_mean(tf.minimum(choice_a, choice_b))
        g_opt = self.optimizer(self.learning_rate)
        self.g_grad, _ = tf.clip_by_global_norm(tf.gradients(self.g_loss, self.g_params), self.grad_clip)
        self.g_updates = g_opt.apply_gradients(zip(self.g_grad, self.g_params))

        Take one step of optimization. Can be done via:
        https://keras.io/guides/writing_a_training_loop_from_scratch/
        """
        # outputs = sess.run(
        #     [self.g_updates, self.g_loss],
        #     feed_dict={
        #         self.x: x,
        #         self.rewards: rewards,
        #         self.baseline: baseline,
        #         self.off_policy_prob: offpolicy,
        #         self.decay_weight: decay_weight,
        #         self.learning_rate: learn_rate,
        #     },
        # )
        # return outputs

        # TODO: Fix this. Just a general outline
        training_args = TrainingArguments(
            output_dir=self.output_dir,
            overwrite_output_dir=True,
            num_train_epochs=1,
            per_device_train_batch_size=self.batch_size,
            learning_rate=learn_rate,
            weight_decay=decay_weight)
        trainer = Trainer(model=self.network,
                          args=training_args,
                          train_dataset=x,
                          eval_dataset=baseline)
        trainer.train()
        trainer.save_model()
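Note: Trainer is a poor fit for the clipped policy-gradient objective described in the docstring, which is why the block above is only a rough outline. A self-contained PyTorch sketch of one update step implementing that loss; the toy policy, tensor names and clip_eps value are assumptions, not taken from the original model.

import torch

def clipped_pg_step(policy, optimizer, log_probs, old_log_probs, rewards,
                    accumulated_pred, baseline, decay_weight,
                    grad_clip=5.0, clip_eps=0.2):
    """One clipped policy-gradient update mirroring the loss in the docstring above."""
    advantage = rewards - accumulated_pred * decay_weight - baseline
    ratio = torch.exp(log_probs - old_log_probs.detach())
    choice_a = ratio * advantage
    choice_b = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * advantage
    loss = -torch.min(choice_a, choice_b).mean()

    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(policy.parameters(), grad_clip)
    optimizer.step()
    return loss.item()

# Illustrative usage with a toy linear policy over 4 features and 2 actions.
policy = torch.nn.Linear(4, 2)
optimizer = torch.optim.Adam(policy.parameters(), lr=1e-3)
obs = torch.randn(8, 4)
dist = torch.distributions.Categorical(logits=policy(obs))
actions = dist.sample()
log_probs = dist.log_prob(actions)
loss = clipped_pg_step(policy, optimizer, log_probs, log_probs.detach(),
                       rewards=torch.randn(8), accumulated_pred=torch.zeros(8),
                       baseline=torch.zeros(8), decay_weight=0.5)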
Example #19
    def __init__(self, model_name_or_path, labels, fine_tuned_model=None):
        self.model_args = ModelArguments(
            model_name_or_path='bert-base-multilingual-cased')
        self.data_args = DataTrainingArguments(data_dir="ner/datadir/",
                                               labels=labels)
        self.training_args = TrainingArguments(output_dir="testing-model",
                                               num_train_epochs=3,
                                               per_gpu_eval_batch_size=32,
                                               save_steps=750,
                                               seed=1)
        self.labels = get_labels(self.data_args.labels)
        self.label_map: Dict[int, str] = {
            i: label
            for i, label in enumerate(self.labels)
        }
        num_labels = len(self.labels)

        # Load the pretrained model and tokenizer.
        # The .from_pretrained methods guarantee that only one process downloads the model and vocabulary.

        self.config = AutoConfig.from_pretrained(
            self.model_args.config_name if self.model_args.config_name else
            self.model_args.model_name_or_path,
            num_labels=num_labels,
            id2label=self.label_map,
            label2id={label: i
                      for i, label in enumerate(self.labels)},
            cache_dir=self.model_args.cache_dir,
        )
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_args.tokenizer_name if self.model_args.tokenizer_name
            else self.model_args.model_name_or_path,
            cache_dir=self.model_args.cache_dir,
            use_fast=self.model_args.use_fast,
        )
        self.model = AutoModelForTokenClassification.from_pretrained(
            fine_tuned_model,
            from_tf=bool(".ckpt" in self.model_args.model_name_or_path),
            config=self.config,
            cache_dir=self.model_args.cache_dir,
        )
Example #20
def main(train_json_path, val_json_path, model_name_or_dir, output_dir,
         logging_dir, logging_steps, batch_size, gradient_accumulation_steps,
         learning_rate, num_train_epochs, warmup_ratio, num_classes):
    tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name_or_dir)
    model = XLMRobertaForTokenClassification.from_pretrained(
        model_name_or_dir, num_labels=num_classes)

    sh_ner_train_dataset, sh_ner_val_dataset = create_sh_ner_dataset(
        train_json_path, val_json_path, tokenizer)

    data_collator = DataCollatorForTokenClassification(tokenizer)

    training_args = TrainingArguments(
        output_dir=output_dir,
        do_train=True,
        do_eval=True,
        do_predict=False,
        evaluation_strategy='epoch',
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        learning_rate=learning_rate,
        num_train_epochs=num_train_epochs,
        warmup_ratio=warmup_ratio,
        logging_dir=logging_dir,
        logging_strategy='steps',
        logging_steps=logging_steps,
        save_strategy='epoch',
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=sh_ner_train_dataset,
        eval_dataset=sh_ner_val_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()
Example #21
def main(args):

    # Import the custom trained tokenizer
    tokenizer = RobertaTokenizerFast.from_pretrained(args.tokenizer)

    # Define the model
    config = RobertaConfig(vocab_size=32000)
    model = RobertaForMaskedLM(config=config)

    # Import the dataset
    dataset = LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path=args.data,
        block_size=128,
    )

    # Initialize the data collator
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer)

    # Set all of the training arguments
    training_args = TrainingArguments(
        output_dir=args.output,
        overwrite_output_dir=True,
        num_train_epochs=10,
        per_gpu_train_batch_size=24,
        save_steps=10_000,
        save_total_limit=10,
    )

    # Train the model
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )

    trainer.train()

    # Save the model
    trainer.save_model("./roBERTaCODE_{}_{}".format(args.language, args.size))
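Note: vocab_size=32000 in the RobertaConfig above has to match the custom tokenizer; deriving it from the tokenizer avoids a silent mismatch. A short sketch; the tokenizer directory is a placeholder.

from transformers import RobertaConfig, RobertaTokenizerFast

tokenizer = RobertaTokenizerFast.from_pretrained('./my-tokenizer')  # placeholder directory
# len(tokenizer) also counts added special tokens, so it is the safer size for the embedding matrix.
config = RobertaConfig(vocab_size=len(tokenizer))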
Example #22
def retrain_model(model, dataset):

    # Use global config, possibly modified by cmdline args
    global freeze_bert_weights, training_epochs, weight_decay

    # Training Arguments
    # Default parameters are selected from official trainer API's example for fine-tuning:
    # https://huggingface.co/transformers/training.html#trainer (Accessed 16 Dec 2020)
    training_args = TrainingArguments(
        output_dir='generated',  # intermediate outputs (such as training checkpoints) directory
        num_train_epochs=training_epochs,  # total number of training epochs
        per_device_train_batch_size=16,  # batch size per device during training
        per_device_eval_batch_size=64,  # batch size for evaluation
        warmup_steps=50,  # number of warmup steps for learning rate scheduler
        weight_decay=weight_decay,  # strength of weight decay
        logging_dir='logs',  # directory for storing logs
        logging_steps=10,
    )

    # Freeze initial layers before training
    # https://github.com/huggingface/transformers/issues/400#issuecomment-477110548 (Accessed 16 Dec 2020)
    if freeze_bert_weights:
        for param in model.bert.parameters():
            param.requires_grad = False

    trainer = Trainer(model=model,
                      args=training_args,
                      train_dataset=dataset['Train'],
                      eval_dataset=dataset['Validation'])

    # Refine the model on training set
    trainer.train()

    # Optionally, evaluate the refined model on validation set
    trainer.evaluate()

    # TODO: Training curve and evaluation results to be plotted

    # Return updated/refined model
    return trainer.model
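Note: a quick way to confirm that the freeze above took effect is to count trainable versus frozen parameters. A self-contained sketch on a tiny BERT classifier; the config sizes are illustrative only.

from transformers import BertConfig, BertForSequenceClassification

config = BertConfig(hidden_size=32, num_hidden_layers=2, num_attention_heads=2,
                    intermediate_size=64, num_labels=2)
model = BertForSequenceClassification(config)

# Freeze the BERT encoder; only the classification head stays trainable.
for param in model.bert.parameters():
    param.requires_grad = False

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
frozen = sum(p.numel() for p in model.parameters() if not p.requires_grad)
print(f"trainable={trainable:,}  frozen={frozen:,}")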
Example #23
def model_trainer(args, test_dataset):
    # model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels =4)
    model = RobertaForTokenClassification.from_pretrained(args.model_path,
                                                          num_labels=3,
                                                          return_dict=True)

    #/anfs/bigdisc/rmya2/faiss_data/results_table_to_cell2/checkpoint-1400/'
    training_args = TrainingArguments(
        per_device_train_batch_size=16,  # batch size per device during training
        per_device_eval_batch_size=16,  # batch size for evaluation
        # warmup_steps=0,                # number of warmup steps for learning rate scheduler
        logging_dir='./logs',
        output_dir='./model_output')

    trainer = Trainer(
        model=model,  # the instantiated 🤗 Transformers model to be trained
        args=training_args,  # training arguments, defined above
        eval_dataset=test_dataset,  # evaluation dataset
        compute_metrics=compute_metrics,
    )
    return trainer, model
Example #24
    def test_data_is_not_parallelized_when_model_is_parallel(self):
        model = RegressionModel()
        # Make the Trainer believe it's a parallelized model
        model.is_parallelizable = True
        model.model_parallel = True
        args = TrainingArguments("./regression",
                                 per_device_train_batch_size=16,
                                 per_device_eval_batch_size=16)
        trainer = Trainer(model,
                          args,
                          train_dataset=RegressionDataset(),
                          eval_dataset=RegressionDataset())
        # Check the Trainer was fooled
        self.assertTrue(trainer.is_model_parallel)
        self.assertEqual(trainer.args.n_gpu, 1)

        # The batch size of the training and evaluation dataloaders should be 16, not 16 * n_gpu
        self.assertEqual(trainer.get_train_dataloader().batch_size, 16)
        self.assertEqual(len(trainer.get_train_dataloader()), 64 // 16)
        self.assertEqual(trainer.get_eval_dataloader().batch_size, 16)
        self.assertEqual(len(trainer.get_eval_dataloader()), 64 // 16)
Example #25
    def __init__(self, dataset_path, working_folder, maximum_input_length,
                 maximum_output_length, epochs=1, batch=1, save_step=-1,
                 logging_step=1000, model_name='t5-base'):
        self.fields = [('input_text', pa.string()), ('target_text', pa.string()),
                       ('prefix', pa.string())]
        self.dataset_path = dataset_path
        self.working_folder = working_folder
        self.model_name = model_name
        self.maximum_input_length = maximum_input_length
        self.maximum_output_length = maximum_output_length
        self.epochs = epochs
        self.create_dataset()
        self.load_model()
        self.data_collator = T2TDataCollator()
        self.training_args = TrainingArguments(
            output_dir=self.working_folder,
            overwrite_output_dir=True,
            do_train=True,
            do_eval=False,
            num_train_epochs=self.epochs,
            per_device_train_batch_size=batch,
            logging_steps=logging_step,
            save_steps=save_step,
        )
Example #26
    def setUp(self):
        super().setUp()

        args = TrainingArguments(".")
        self.n_epochs = args.num_train_epochs
        self.batch_size = args.train_batch_size

        self.dist_env_1_gpu = dict(
            MASTER_ADDR="localhost", MASTER_PORT="10999", RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
        )

        self.ds_config_file = {}
        self.ds_config_file[ZERO2] = f"{self.test_file_dir_str}/ds_config_zero2.json"
        self.ds_config_file[ZERO3] = f"{self.test_file_dir_str}/ds_config_zero3.json"

        # use self.get_config_dict(stage) to use these to ensure the original is not modified
        self.ds_config_dict = {}
        with io.open(self.ds_config_file[ZERO2], "r", encoding="utf-8") as f:
            self.ds_config_dict[ZERO2] = json.load(f)
        with io.open(self.ds_config_file[ZERO3], "r", encoding="utf-8") as f:
            self.ds_config_dict[ZERO3] = json.load(f)
Example #27
def run(args, lcsts):
    # load train and validation data
    # TODO: using test data to see stuffs working first
    train_dataset, val_dataset = setup_dataset(
        train_data_files=lcsts.train_merged_csv,
        val_data_files=lcsts.val_merged_csv,
        tokenizer=tokenizer)
    # setup model
    model = setup_model(args.model_name, args.num_freeze_decoder_layers,
                        tokenizer)

    # set training arguments - these params are not really tuned,
    # feel free to change
    training_args = TrainingArguments(
        output_dir="./ckpt/",
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        # predict_from_generate=True,
        #num_train_epochs=10,
        evaluate_during_training=True,
        do_train=True,
        do_eval=True,
        logging_steps=20,
        save_steps=100,
        eval_steps=1000,
        overwrite_output_dir=True,
        warmup_steps=40,
        save_total_limit=10,
    )

    # instantiate trainer
    trainer = CustomizeTrainer(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )
    # start training
    trainer.train()
Example #28
def get_regression_trainer(a=0,
                           b=0,
                           double_output=False,
                           train_len=64,
                           eval_len=64,
                           **kwargs):
    train_dataset = RegressionDataset(length=train_len)
    eval_dataset = RegressionDataset(length=eval_len)
    model = RegressionModel(a, b, double_output)
    compute_metrics = kwargs.pop("compute_metrics", None)
    data_collator = kwargs.pop("data_collator", None)
    optimizers = kwargs.pop("optimizers", (None, None))
    args = TrainingArguments("./regression", **kwargs)
    return Trainer(
        model,
        args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )
Example #29
    def test_auto_set_save_adapters(self):
        model = BertForSequenceClassification(
            BertConfig(
                hidden_size=32,
                num_hidden_layers=4,
                num_attention_heads=4,
                intermediate_size=37,
            )
        )
        model.add_adapter("adapter1")
        model.add_adapter("adapter2")
        model.add_adapter_fusion(Fuse("adapter1", "adapter2"))
        model.train_adapter_fusion(Fuse("adapter1", "adapter2"))

        training_args = TrainingArguments(
            output_dir="./examples",
        )
        trainer = AdapterTrainer(
            model=model,
            args=training_args,
        )
        self.assertTrue(trainer.train_adapter_fusion)
Example #30
    def setup_train_args(self):
        batch_size = 16
        step_eval = math.ceil(len(self.train_dataset) / 2 / batch_size)
        outdir_name = f'./results2_{self.model_name}'
        logdir_name = f'./logs2_{self.model_name}'
        if os.path.exists(outdir_name):
            shutil.rmtree(outdir_name)
        if os.path.exists(logdir_name):
            shutil.rmtree(logdir_name)
        return TrainingArguments(
            output_dir=outdir_name,
            save_total_limit=1,
            # save_steps=0,
            num_train_epochs=self.num_epochs,
            # evaluate_during_training=True,
            eval_steps=step_eval,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=64,
            warmup_steps=250,
            weight_decay=0.01,
            logging_dir=logdir_name,
        )