Example No. 1
def get_dataloaders(model, tokenizer, batch_size, train_path, eval_path):
    block_size = 1024
    train_dataset = TextDataset(tokenizer=tokenizer,
                                file_path=train_path,
                                block_size=block_size)
    test_dataset = TextDataset(tokenizer=tokenizer,
                               file_path=eval_path,
                               block_size=block_size)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=False,
                                                    mlm_probability=0.15)
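    # note: with mlm=False the collator copies input_ids into labels (causal LM), so mlm_probability is ignored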
    trainloader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        collate_fn=data_collator,
        drop_last=False,
        num_workers=0,
    )
    testloader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        collate_fn=data_collator,
        drop_last=False,
        num_workers=0,
    )
    return trainloader, testloader
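For context, a minimal call sketch for the function above; the GPT-2 checkpoint, file names, and batch size are illustrative assumptions (the model argument is not actually used inside get_dataloaders), and the text files are assumed to be long enough to fill at least one 1024-token block:

from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('gpt2')
model = AutoModelForCausalLM.from_pretrained('gpt2')
trainloader, testloader = get_dataloaders(model, tokenizer, batch_size=4,
                                          train_path='train.txt',
                                          eval_path='eval.txt')
batch = next(iter(trainloader))
# both tensors have shape (batch_size, block_size)
print(batch['input_ids'].shape, batch['labels'].shape)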
Example No. 2
def pretrain_and_evaluate(args, model, tokenizer, eval_only, model_path):
    if tokenizer.model_max_length > 1e8:
        val_dataset = TextDataset(tokenizer=tokenizer,
                                  file_path=args.val_datapath,
                                  block_size=512)
        logger.info(
            f'[WARNING] tokenizer.model_max_length > 10^8 ({tokenizer.model_max_length}); setting block_size to 512 instead.'
        )
    else:
        val_dataset = TextDataset(
            tokenizer=tokenizer,
            file_path=args.val_datapath,
            block_size=tokenizer.model_max_length
        )  #  The `max_len` attribute has been deprecated

    if eval_only:
        train_dataset = val_dataset
    else:
        logger.info(
            f'Loading and tokenizing training data is usually slow: {args.train_datapath}'
        )
        if tokenizer.model_max_length > 1e8:
            train_dataset = TextDataset(tokenizer=tokenizer,
                                        file_path=args.train_datapath,
                                        block_size=512)
            logger.info(
                f'[WARNING] tokenizer.model_max_length > 10^8 ({tokenizer.model_max_length}); setting block_size to 512 instead.'
            )
        else:
            train_dataset = TextDataset(tokenizer=tokenizer,
                                        file_path=args.train_datapath,
                                        block_size=tokenizer.model_max_length)

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=True,
                                                    mlm_probability=0.15)

    trainer = Trainer(
        model=model,
        args=args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        prediction_loss_only=True,
    )

    eval_loss = trainer.evaluate()
    #pdb.set_trace()
    eval_loss = eval_loss['eval_loss']
    logger.info(f'Initial eval bpc: {eval_loss/math.log(2)}')

    if not eval_only:
        trainer.train(model_path=model_path)
        trainer.save_model()

        eval_loss = trainer.evaluate()
        eval_loss = eval_loss['eval_loss']
        logger.info(f'Eval bpc after pretraining: {eval_loss/math.log(2)}')
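As a side note on the logging above: Trainer.evaluate() returns a mean cross-entropy in nats per token, so dividing by math.log(2) converts it to bits (the script labels this bpc), and perplexity is the exponential of the same quantity. A tiny illustrative conversion:

import math

eval_loss = 2.8                      # illustrative loss in nats per token
bits = eval_loss / math.log(2)       # ~4.04 bits per token
perplexity = math.exp(eval_loss)     # ~16.4
print(f'bits/token: {bits:.2f}, perplexity: {perplexity:.1f}')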
Example No. 3
def pretrain_and_evaluate(training_args, dataset_args, model, tokenizer, eval_only):
    """
    # adapted from https://colab.research.google.com/drive/1-JIJlao4dI-Ilww_NnTc0rxtp-ymgDgM?usp=sharing#scrollTo=N8J-TLhBuaOf
    :param training_args: HF training args object
    :param dataset_args: object storing dataset config, requires train_datapath and val_datapath to be defined
    :param model: transformers.PreTrainedModel
    :param tokenizer: PreTrainedTokenizerBase
    :param eval_only: boolean, True only performs evaluation
    :return:
    """

    val_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=dataset_args.val_datapath,
        block_size=tokenizer.model_max_length,
    )
    if eval_only:
        train_dataset = val_dataset
    else:
        logging.info(
            f"Loading and tokenizing training data is usually slow: {dataset_args.train_datapath}"
        )
        train_dataset = TextDataset(
            tokenizer=tokenizer,
            file_path=dataset_args.train_datapath,
            block_size=tokenizer.model_max_length,
        )

    # https://github.com/huggingface/transformers/blob/master/src/transformers/data/data_collator.py
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=0.15
    )

    # https://huggingface.co/transformers/_modules/transformers/trainer.html
    trainer = Trainer_(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    metrics = trainer.evaluate()
    # eval_loss = metrics["eval_loss"]
    # logging.info(f"Initial eval bpc: {eval_loss / math.log(2)}")
    logging.info(f"Initial metrics: {metrics}")

    if not eval_only:
        # to change if we want to continue training existing models
        # same path as from_checkpoint argument from the builder
        trainer.train(model_path=None)

        trainer.save_model()

        metrics = trainer.evaluate()
        eval_loss = metrics["eval_loss"]
        logging.info(f"Eval bpc after pretraining: {eval_loss / math.log(2)}")
Example No. 4
    def load_dataset(self, block_size=128):

        self.train_dataset = TextDataset(tokenizer=self.tokenizer,
                                         file_path=self.train,
                                         block_size=block_size)
        self.dev_dataset = TextDataset(tokenizer=self.tokenizer,
                                       file_path=self.dev,
                                       block_size=block_size)
        self.data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer, mlm=False)
Example No. 5
def load_dataset(train_path, test_path, tokenizer):
    train_dataset = TextDataset(
        tokenizer=tokenizer, file_path=train_path, block_size=128
    )

    test_dataset = TextDataset(tokenizer=tokenizer, file_path=test_path, block_size=128)

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )
    return train_dataset, test_dataset, data_collator
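The three returned objects are typically wired straight into a Trainer for causal-LM fine-tuning; a minimal sketch, assuming a model, a tokenizer, and the two text files already exist (the paths and training settings are illustrative):

from transformers import Trainer, TrainingArguments

train_dataset, test_dataset, data_collator = load_dataset('train.txt',
                                                           'test.txt',
                                                           tokenizer)
trainer = Trainer(model=model,
                  args=TrainingArguments(output_dir='./clm_out',
                                         num_train_epochs=1),
                  data_collator=data_collator,
                  train_dataset=train_dataset,
                  eval_dataset=test_dataset)
trainer.train()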
Example No. 6
    def _dataset(file_path, ref_path=None):
        if args.line_by_line:
            if ref_path is not None:
                if not args.whole_word_mask or not args.mlm:
                    raise ValueError(
                        "You need to set world whole masking and mlm to True for Chinese Whole Word Mask"
                    )
                return LineByLineWithRefDataset(
                    tokenizer=tokenizer,
                    file_path=file_path,
                    block_size=args.block_size,
                    ref_path=ref_path,
                )

            return LineByLineTextDataset(tokenizer=tokenizer,
                                         file_path=file_path,
                                         block_size=args.block_size)
        else:
            return TextDataset(
                tokenizer=tokenizer,
                file_path=file_path,
                block_size=args.block_size,
                overwrite_cache=args.overwrite_cache,
                cache_dir=cache_dir,
            )
Example No. 7
def get_dataset(args: DataTrainingArguments,
                tokenizer: PreTrainedTokenizer,
                max_len,
                evaluate=False):

    file_path = args.eval_data_file if evaluate else args.train_data_file

    if args.line_by_line:
        return LineByLineTextDataset(tokenizer=tokenizer,
                                     file_path=file_path,
                                     block_size=args.block_size)
    elif args.text_dataset:
        return TextDataset(tokenizer=tokenizer,
                           file_path=file_path,
                           block_size=args.block_size,
                           overwrite_cache=args.overwrite_cache)
    else:
        """
        When use common tab separated text dataset, use nlp.data.TSVDataset.
        If you want to use other type of dataset, refer to other class of nlp.data,
        or set DataTrainingArguments.line_by_line or DataTrainingArguments.text_dataset True.
        """
        dataset = nlp.data.TSVDataset(file_path,
                                      field_indices=[1],
                                      num_discard_samples=1)

        return Get_dataset(dataset, 0, tokenizer, max_len, True, False)
Example No. 8
    def test_plm(self):
        tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
        data_collator = DataCollatorForPermutationLanguageModeling(tokenizer)
        # ^ permutation lm

        dataset = LineByLineTextDataset(tokenizer,
                                        file_path=PATH_SAMPLE_TEXT,
                                        block_size=512)
        examples = [dataset[i] for i in range(len(dataset))]
        batch = data_collator(examples)
        self.assertIsInstance(batch, dict)
        self.assertEqual(batch["input_ids"].shape, torch.Size((31, 112)))
        self.assertEqual(batch["perm_mask"].shape, torch.Size((31, 112, 112)))
        self.assertEqual(batch["target_mapping"].shape,
                         torch.Size((31, 112, 112)))
        self.assertEqual(batch["labels"].shape, torch.Size((31, 112)))

        dataset = TextDataset(tokenizer,
                              file_path=PATH_SAMPLE_TEXT,
                              block_size=512,
                              overwrite_cache=True)
        examples = [dataset[i] for i in range(len(dataset))]
        batch = data_collator(examples)
        self.assertIsInstance(batch, dict)
        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 512)))
        self.assertEqual(batch["perm_mask"].shape, torch.Size((2, 512, 512)))
        self.assertEqual(batch["target_mapping"].shape,
                         torch.Size((2, 512, 512)))
        self.assertEqual(batch["labels"].shape, torch.Size((2, 512)))

        example = [torch.randint(5, [5])]
        with self.assertRaises(ValueError):
            # Expect error due to odd sequence length
            data_collator(example)
Example No. 9
def get_dataset(filepath,
                tokenizer,
                block_size,
                line_by_line=False,
                overwrite_cache=False):
    '''
    Load a dataset from the specified filepath.

    :param filepath:
        The filepath of the dataset.
    :param tokenizer:
        The tokenizer to parse the dataset with.
    :param block_size:
        The length of a single input sequence (block).
    :param line_by_line:
        Indicates whether distinct lines of text in the dataset are to be handled as
        separate sequences (i.e. whether to add the BOS and EOS tokens to each line).
        Defaults to False.
    :param overwrite_cache:
        Overwrite the cached training and evaluation sets. Defaults to False.
    :returns:
        A :class:`torch.utils.data.Dataset` object.

    '''

    if line_by_line:
        return LineByLineTextDataset(tokenizer=tokenizer,
                                     file_path=filepath,
                                     block_size=block_size)
    else:
        return TextDataset(tokenizer=tokenizer,
                           file_path=filepath,
                           block_size=block_size,
                           overwrite_cache=overwrite_cache)
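To make the line_by_line switch concrete: LineByLineTextDataset yields one example per non-empty line (truncated to block_size tokens), whereas TextDataset concatenates the whole file and slices it into fixed block_size chunks. An illustrative comparison, assuming a loaded tokenizer and a plain-text corpus.txt:

per_line = get_dataset('corpus.txt', tokenizer, block_size=128, line_by_line=True)
chunked = get_dataset('corpus.txt', tokenizer, block_size=128)
print(len(per_line), len(chunked))  # number of lines vs. number of 128-token blocks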
Example No. 10
def main():
    print("PREPROCESSING DATA")
    preprocess()
    print("LOADING TOKENIZER")
    tokenizer = get_tokenizer()
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=False)
    print("LOADING MODEL", cfg('model'))
    model = get_model(tokenizer)

    print("LOADING DATA")
    if cfg('encoding') == 'LBL':
        train_dataset = LBLDataset(tokenizer=tokenizer,
                                   file_path=filename('train'))
    elif cfg('encoding') == 'blocked':
        train_dataset = BlockedDataset(tokenizer=tokenizer,
                                       file_path=filename('train'))
    elif cfg('encoding') == 'text':
        train_dataset = TextDataset(tokenizer=tokenizer,
                                    file_path=filename('train'),
                                    block_size=cfg('max_block'))
    elif cfg('encoding').startswith('inter'):
        if cfg('encoding').endswith('LBL'):
            loader = LBLDataset
        elif cfg('encoding').endswith('blocked'):
            loader = BlockedDataset
        else:
            raise ValueError("Unknown interleaved encoding")

        d1 = loader(tokenizer=tokenizer, file_path=filename('train'))
        d2 = loader(tokenizer=tokenizer, file_path=filename('dirty'))
        train_dataset = CombinedDataset(d1, d2)
    else:
        raise ValueError("Unkown encoding")

    trainer = get_trainer(train_dataset, data_collator, model)

    def validator(x, y):
        global BEST_metric
        model.save_pretrained(session)
        metric, pred = validate(model, tokenizer, x, y)
        if np.mean(metric) > BEST_metric:
            print("NEW BEST (saving)")
            BEST_metric = np.mean(metric)

        # save predictions and model
        save(session + "metric.txt", str(metric) + "\n")
        save(session + "pred.txt", str(pred) + "\n\n")
        return metric, pred

    trainer.validator = validator
    trainer.val_dataset = get_validation_data()

    # saving configuration
    print("SAVING...")
    session = get_session_path()
    print(session)
    save(session + "conf.txt", repr(cfg()))

    print("STARTING TRAINING...")
    trainer.train()
Example No. 11
def load_dataset(train_path, test_path, tokenizer):
    """
    Loads training and validation data from text files into a TextDataset.
    """
    train_dataset = TextDataset(tokenizer=tokenizer,
                                file_path=train_path,
                                block_size=128)

    test_dataset = TextDataset(tokenizer=tokenizer,
                               file_path=test_path,
                               block_size=128)

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )
    return train_dataset, test_dataset, data_collator
Example No. 12
def get_dataset(args: DataTrainingArguments, tokenizer: PreTrainedTokenizer, evaluate=False):
    file_path = args.eval_data_file if evaluate else args.train_data_file
    if args.line_by_line:
        return LineByLineTextDataset(tokenizer=tokenizer, file_path=file_path, block_size=args.block_size)
    else:
        return TextDataset(
            tokenizer=tokenizer, file_path=file_path, block_size=args.block_size, overwrite_cache=args.overwrite_cache
        )
Example No. 13
    def pretrain_and_evaluate(args,
                              model,
                              tokenizer,
                              eval_only,
                              model_path=None,
                              block_size=''):
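        # tokenizer.max_len is deprecated in newer transformers releases; model_max_length is the replacement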
        val_dataset = TextDataset(tokenizer=tokenizer,
                                  file_path=args.val_datapath,
                                  block_size=tokenizer.max_len)
        if eval_only:
            train_dataset = val_dataset
        else:
            logger.info(
                f'Loading and tokenizing training data is usually slow: {args.train_datapath}'
            )
            train_dataset = TextDataset(tokenizer=tokenizer,
                                        file_path=args.train_datapath,
                                        block_size=tokenizer.max_len)

        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                        mlm=True,
                                                        mlm_probability=0.15)
        trainer = Trainer(
            model=model,
            args=args,
            data_collator=data_collator,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            prediction_loss_only=True,
        )

        eval_loss = trainer.evaluate()
        eval_loss = eval_loss['eval_loss']
        logger.info(f'Initial eval bpc: {eval_loss/math.log(2)}')
        logger.info(f'Initial perplexity: {math.exp(eval_loss)}')

        if not eval_only:
            trainer.train(model_path=model_path)
            trainer.save_model()

            eval_loss = trainer.evaluate()
            eval_loss = eval_loss['eval_loss']
            logger.info(f'Eval bpc after pretraining: {eval_loss/math.log(2)}')
            logger.info(
                f'Eval perplexity after pretraining: {math.exp(eval_loss)}')
Example No. 14
def get_dataset(file_path,
                tokenizer: PreTrainedTokenizer,
                block_size: int = None) -> TextDataset:
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size if block_size is not None else tokenizer.max_len,
        overwrite_cache=True,
    )
Example No. 15
    def _load(self):
        """Load Tokenizer, Dataset, Model and Jit input example
        """
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_checkpoint)
        self.data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=False,
        )
        self.gpt2_dataset = dict(train=TextDataset(tokenizer=self.tokenizer,
                                                   file_path=self.dataset,
                                                   block_size=128),
                                 validation=TextDataset(
                                     tokenizer=self.tokenizer,
                                     file_path=self.dataset,
                                     block_size=128))

        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_checkpoint)
Example No. 16
def get_dataset(args: DataTrainingArguments, tokenizer: PreTrainedTokenizer, evaluate=False, local_rank=-1):
    file_path = args.eval_data_file if evaluate else args.train_data_file
    if args.line_by_line:
        return LineByLineTextDataset(
            tokenizer=tokenizer, file_path=file_path, block_size=args.block_size, local_rank=local_rank
        )
    else:
        return TextDataset(
            tokenizer=tokenizer, file_path=file_path, block_size=args.block_size, local_rank=local_rank,
        )
Example No. 17
    def _dataset(file_path):
        if args.line_by_line:
            return LineByLineTextDataset(tokenizer=tokenizer, file_path=file_path, block_size=args.block_size)
        else:
            return TextDataset(
                tokenizer=tokenizer,
                file_path=file_path,
                block_size=args.block_size,
                overwrite_cache=args.overwrite_cache,
            )
Example No. 18
def pretrain_and_evaluate(training_args, model, tokenizer, eval_only,
                          model_path_out):
    logger.info(f'Loading and tokenizing data is usually slow: {VAL_FPATH}')

    val_dataset = TextDataset(tokenizer=tokenizer,
                              file_path=VAL_FPATH,
                              block_size=tokenizer.max_len)

    if eval_only:
        train_dataset = val_dataset
    else:
        logger.info(
            f'Loading and tokenizing training data is usually slow: {TRAIN_FPATH}'
        )
        train_dataset = TextDataset(tokenizer=tokenizer,
                                    file_path=TRAIN_FPATH,
                                    block_size=tokenizer.max_len)

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=True,
                                                    mlm_probability=0.15)

    logger.warning(f'Model Params set to {training_args}')

    trainer = Trainer(model=model,
                      args=training_args,
                      data_collator=data_collator,
                      train_dataset=train_dataset,
                      eval_dataset=val_dataset,
                      prediction_loss_only=True)

    eval_loss = trainer.evaluate()
    eval_loss = eval_loss['eval_loss']
    logger.info(f'Initial eval bpc: {eval_loss/math.log(2)}')

    if not eval_only:
        trainer.train(model_path=model_path_out)
        trainer.save_model()

        eval_loss = trainer.evaluate()
        eval_loss = eval_loss['eval_loss']
        logger.info(f'Eval bpc after pretraining: {eval_loss/math.log(2)}')
Example No. 19
def _load_dataset():

    train_dataset = TextDataset(tokenizer=tokenizer,
                                file_path=cfg.DATA,
                                block_size=128)

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )
    return train_dataset, data_collator
Example No. 20
def get_dataset(args: DataTrainingArguments, tokenizer: PreTrainedTokenizer, model_args: ModelArguments, evaluate=False):
    file_path = args.eval_data_file if evaluate else args.train_data_file
    if args.line_by_line:
        if args.mlm_sample_times > 1:
            return FullyLineByLineTextDataset(tokenizer=tokenizer, file_path=file_path, block_size=args.block_size, cache_dir=model_args.cache_dir)
        else:
            return LineByLineTextDataset(tokenizer=tokenizer, file_path=file_path, block_size=args.block_size, cache_dir=model_args.cache_dir)
    else:
        return TextDataset(
            tokenizer=tokenizer, file_path=file_path, block_size=args.block_size, overwrite_cache=args.overwrite_cache
        )
Example No. 21
def load_dataset(path, tokenizer, seq_len):

    print('loading dataset')
    dataset = TextDataset(tokenizer=tokenizer,
                          file_path=path,
                          block_size=seq_len)

    print('dataset loaded')

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )
    print('data collator ready')

    return dataset, data_collator
Example No. 22
def get_dataset(
    args: DataTrainingArguments,
    tokenizer: PreTrainedTokenizer,
    evaluate: bool = False,
    cache_dir: Optional[str] = None,
):
    file_path = args.eval_data_file if evaluate else args.train_data_file
    if args.line_by_line:
        return LineByLineTextDataset(tokenizer=tokenizer, file_path=file_path, block_size=args.block_size)
    else:
        return TextDataset(
            tokenizer=tokenizer,
            file_path=file_path,
            block_size=args.block_size,
            overwrite_cache=args.overwrite_cache,
            cache_dir=cache_dir,
        )
Example No. 23
def get_dataset(
    args: DataTrainingArguments,
    tokenizer: PreTrainedTokenizer,
    evaluate: bool = False,
    cache_dir: Optional[str] = None,
):
    file_path = args.eval_data_file if evaluate else args.train_data_file
    if args.line_by_line:
        print("entering get_dataset function with line_by_line")
        import glob
        if evaluate:
            files = glob.glob(file_path + 'shard_*')
        else:
            files = glob.glob(file_path + 'shard_*')
        from datasets import load_dataset
        dataset = load_dataset('text', data_files=files)

        # Passing the whole dataset file (11 GB) directly to load_dataset caused RAM issues, so the file is read in shards.

        #Tokenization
        def encode(examples):
            #return tokenizer(examples['text'], truncation=True, padding='max_length')
            return tokenizer(examples['text'],
                             add_special_tokens=True,
                             truncation=True,
                             max_length=args.block_size)

        dataset = dataset.map(encode, batched=True)
        #print("flag1: ", dataset)
        #dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])
        newdataset = dataset['train']
        #print("flag2: ", newdataset)
        input_ids = newdataset['input_ids']
        #print("flag3: ", input_ids)
        dataset.set_format(type='torch', columns=['input_ids'])
        #print("flag4: ", dataset)
        return input_ids
        #return LineByLineTextDataset(tokenizer=tokenizer, file_path=file_path, block_size=args.block_size)
    else:
        return TextDataset(
            tokenizer=tokenizer,
            file_path=file_path,
            block_size=args.block_size,
            overwrite_cache=args.overwrite_cache,
            cache_dir=cache_dir,
        )
Example No. 24
    def test_lm_tokenizer_without_padding(self):
        tokenizer = AutoTokenizer.from_pretrained("gpt2")
        data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
        # ^ causal lm

        dataset = LineByLineTextDataset(tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=512)
        examples = [dataset[i] for i in range(len(dataset))]
        with self.assertRaises(ValueError):
            # Expect error due to padding token missing on gpt2:
            data_collator.collate_batch(examples)

        dataset = TextDataset(tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=512, overwrite_cache=True)
        examples = [dataset[i] for i in range(len(dataset))]
        batch = data_collator.collate_batch(examples)
        self.assertIsInstance(batch, dict)
        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 512)))
        self.assertEqual(batch["labels"].shape, torch.Size((2, 512)))
Example No. 25
    def test_lm_tokenizer_with_padding(self):
        tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")
        data_collator = DataCollatorForLanguageModeling(tokenizer)
        # ^ masked lm

        dataset = LineByLineTextDataset(tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=512)
        examples = [dataset[i] for i in range(len(dataset))]
        batch = data_collator.collate_batch(examples)
        self.assertIsInstance(batch, dict)
        self.assertEqual(batch["input_ids"].shape, torch.Size((31, 107)))
        self.assertEqual(batch["masked_lm_labels"].shape, torch.Size((31, 107)))

        dataset = TextDataset(tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=512, overwrite_cache=True)
        examples = [dataset[i] for i in range(len(dataset))]
        batch = data_collator.collate_batch(examples)
        self.assertIsInstance(batch, dict)
        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 512)))
        self.assertEqual(batch["masked_lm_labels"].shape, torch.Size((2, 512)))
Example No. 26
    def _get_dataset(
        self,
        file_path: str,
        line_by_line: bool,
        block_size: int,
        overwrite_cache: bool,
    ) -> Dataset:
        if line_by_line:
            return LineByLineTextDataset(
                tokenizer=self.tokenizer, file_path=file_path, block_size=block_size
            )
        else:
            return TextDataset(
                tokenizer=self.tokenizer,
                file_path=file_path,
                block_size=block_size,
                overwrite_cache=overwrite_cache,
            )
Example No. 27
def finetune(tag):
    """fine-tune gpt2 on the given caption dataset"""
    global tokenizer
    config = AutoConfig.from_pretrained('gpt2')
    model = AutoModelWithLMHead.from_pretrained('gpt2', config=config)
    block_size = tokenizer.max_len
    # https://github.com/huggingface/transformers/blob/448c467256332e4be8c122a159b482c1ef039b98/src/transformers/data/datasets/language_modeling.py
    try:
        train_dataset = TextDataset(
            tokenizer=tokenizer,
            file_path=f'./text/training_text/{tag}.txt',
            block_size=block_size,
            overwrite_cache=True)
        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                        mlm=False)
        epochs = 8
        training_args = TrainingArguments(output_dir='logging/output',
                                          overwrite_output_dir=True,
                                          do_train=True,
                                          num_train_epochs=epochs,
                                          gradient_accumulation_steps=1,
                                          learning_rate=1e-4,
                                          per_gpu_train_batch_size=1,
                                          logging_steps=50,
                                          save_steps=0)
        set_seed(training_args.seed)
        trainer = Trainer(model=model,
                          args=training_args,
                          data_collator=data_collator,
                          train_dataset=train_dataset,
                          prediction_loss_only=True)
        with open(f'./logging/training_stats/training_{tag}.log', 'w') as log:
            sys.stdout = log
            trainer.train()
        sys.stdout = sys.__stdout__
        if not os.path.exists(f'./trained_models/{tag}/'):
            os.makedirs(f'./trained_models/{tag}/')
        # save the model
        model.save_pretrained(f'./trained_models/{tag}/')
        print('Done!')
    except AssertionError:
        print(
            f'The training text with the tag = {tag} does not exist. No model was trained!'
        )
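Once a tag has been fine-tuned and saved, the model can be reloaded for generation; a minimal sketch where the 'sunset' tag and the sampling settings are purely illustrative, reusing the global tokenizer from the example above:

from transformers import AutoModelWithLMHead

model = AutoModelWithLMHead.from_pretrained('./trained_models/sunset/')
inputs = tokenizer('a photo of', return_tensors='pt')
outputs = model.generate(inputs['input_ids'],
                         max_length=30,
                         do_sample=True,
                         top_p=0.9,
                         pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))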
Example No. 28
def get_dataset(args: DataTrainingArguments,
                model_name_or_path,
                tokenizer: PreTrainedTokenizer,
                evaluate=False):
    file_path = args.eval_data_file if evaluate else args.train_data_file
    if args.line_by_line:
        if 'dialogpt' in model_name_or_path.lower():
            return LineByLinePersonaChatDataset(tokenizer=tokenizer,
                                                file_path=file_path,
                                                block_size=args.block_size)
        else:
            return LineByLineTextDataset(tokenizer=tokenizer,
                                         file_path=file_path,
                                         block_size=args.block_size)
    else:
        return TextDataset(tokenizer=tokenizer,
                           file_path=file_path,
                           block_size=args.block_size,
                           overwrite_cache=args.overwrite_cache)
Example No. 29
def get_dataset(args: DataTrainingArguments,
                tokenizer: PreTrainedTokenizer,
                inline_meta: str = None,
                local_rank=-1):
    file_path = args.eval_data_file
    if args.webtext:
        return WebTextPretokenizedDataset(tokenizer=tokenizer,
                                          file_path=file_path,
                                          block_size=args.block_size,
                                          inline_meta=inline_meta,
                                          local_rank=local_rank)
    elif args.line_by_line:
        return LineByLineTextDataset(tokenizer=tokenizer,
                                     file_path=file_path,
                                     block_size=args.block_size,
                                     local_rank=local_rank)
    else:
        return TextDataset(
            tokenizer=tokenizer,
            file_path=file_path,
            block_size=args.block_size,
            local_rank=local_rank,
        )
Example No. 30
    def __init__(self, file_path, tokenizer):
        self.ds = TextDataset(file_path=file_path,
                              tokenizer=tokenizer,
                              block_size=64)