Example No. 1
def get_dataloaders(model, tokenizer, batch_size, train_path, eval_path):
    block_size = 1024
    train_dataset = TextDataset(tokenizer=tokenizer,
                                file_path=train_path,
                                block_size=block_size)
    test_dataset = TextDataset(tokenizer=tokenizer,
                               file_path=eval_path,
                               block_size=block_size)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=False,
                                                    mlm_probability=0.15)
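    # note: with mlm=False the collator copies input_ids into labels (causal LM), so mlm_probability is ignored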
    trainloader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        collate_fn=data_collator,
        drop_last=False,
        num_workers=0,
    )
    testloader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        collate_fn=data_collator,
        drop_last=False,
        num_workers=0,
    )
    return trainloader, testloader
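For context, a minimal call sketch for the function above; the GPT-2 checkpoint, file names, and batch size are illustrative assumptions (the model argument is not actually used inside get_dataloaders), and the text files are assumed to be long enough to fill at least one 1024-token block:

from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('gpt2')
model = AutoModelForCausalLM.from_pretrained('gpt2')
trainloader, testloader = get_dataloaders(model, tokenizer, batch_size=4,
                                          train_path='train.txt',
                                          eval_path='eval.txt')
batch = next(iter(trainloader))
# both tensors have shape (batch_size, block_size)
print(batch['input_ids'].shape, batch['labels'].shape)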
Example No. 2
def pretrain_and_evaluate(args, model, tokenizer, eval_only, model_path):
    if tokenizer.model_max_length > 1e8:
        val_dataset = TextDataset(tokenizer=tokenizer,
                                  file_path=args.val_datapath,
                                  block_size=512)
        logger.info(
            f'[WARNING] tokenizer.model_max_length > 10^8 ({tokenizer.model_max_length}); setting block_size to 512 instead.'
        )
    else:
        val_dataset = TextDataset(
            tokenizer=tokenizer,
            file_path=args.val_datapath,
            block_size=tokenizer.model_max_length
        )  #  The `max_len` attribute has been deprecated

    if eval_only:
        train_dataset = val_dataset
    else:
        logger.info(
            f'Loading and tokenizing training data is usually slow: {args.train_datapath}'
        )
        if tokenizer.model_max_length > 1e8:
            train_dataset = TextDataset(tokenizer=tokenizer,
                                        file_path=args.train_datapath,
                                        block_size=512)
            logger.info(
                f'[WARNING] tokenizer.model_max_length > 10^8 ({tokenizer.model_max_length}); setting block_size to 512 instead.'
            )
        else:
            train_dataset = TextDataset(tokenizer=tokenizer,
                                        file_path=args.train_datapath,
                                        block_size=tokenizer.model_max_length)

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=True,
                                                    mlm_probability=0.15)

    trainer = Trainer(
        model=model,
        args=args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        prediction_loss_only=True,
    )

    eval_loss = trainer.evaluate()
    #pdb.set_trace()
    eval_loss = eval_loss['eval_loss']
    logger.info(f'Initial eval bpc: {eval_loss/math.log(2)}')

    if not eval_only:
        trainer.train(model_path=model_path)
        trainer.save_model()

        eval_loss = trainer.evaluate()
        eval_loss = eval_loss['eval_loss']
        logger.info(f'Eval bpc after pretraining: {eval_loss/math.log(2)}')
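As a side note on the logging above: Trainer.evaluate() returns a mean cross-entropy in nats per token, so dividing by math.log(2) converts it to bits (the script labels this bpc), and perplexity is the exponential of the same quantity. A tiny illustrative conversion:

import math

eval_loss = 2.8                      # illustrative loss in nats per token
bits = eval_loss / math.log(2)       # ~4.04 bits per token
perplexity = math.exp(eval_loss)     # ~16.4
print(f'bits/token: {bits:.2f}, perplexity: {perplexity:.1f}')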
Example No. 3
def pretrain_and_evaluate(training_args, dataset_args, model, tokenizer, eval_only):
    """
    # adapted from https://colab.research.google.com/drive/1-JIJlao4dI-Ilww_NnTc0rxtp-ymgDgM?usp=sharing#scrollTo=N8J-TLhBuaOf
    :param training_args: HF training args object
    :param dataset_args: object storing dataset config, requires train_datapath and val_datapath to be defined
    :param model: transformers.PreTrainedModel
    :param tokenizer: PreTrainedTokenizerBase
    :param eval_only: boolean, True only performs evaluation
    :return:
    """

    val_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=dataset_args.val_datapath,
        block_size=tokenizer.model_max_length,
    )
    if eval_only:
        train_dataset = val_dataset
    else:
        logging.info(
            f"Loading and tokenizing training data is usually slow: {dataset_args.train_datapath}"
        )
        train_dataset = TextDataset(
            tokenizer=tokenizer,
            file_path=dataset_args.train_datapath,
            block_size=tokenizer.model_max_length,
        )

    # https://github.com/huggingface/transformers/blob/master/src/transformers/data/data_collator.py
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=0.15
    )

    # https://huggingface.co/transformers/_modules/transformers/trainer.html
    trainer = Trainer_(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    metrics = trainer.evaluate()
    # eval_loss = metrics["eval_loss"]
    # logging.info(f"Initial eval bpc: {eval_loss / math.log(2)}")
    logging.info(f"Initial metrics: {metrics}")

    if not eval_only:
        # to change if we want to continue training existing models
        # same path as from_checkpoint argument from the builder
        trainer.train(model_path=None)

        trainer.save_model()

        metrics = trainer.evaluate()
        eval_loss = metrics["eval_loss"]
        logging.info(f"Eval bpc after pretraining: {eval_loss / math.log(2)}")
Example No. 4
    def load_dataset(self, block_size=128):

        self.train_dataset = TextDataset(tokenizer=self.tokenizer,
                                         file_path=self.train,
                                         block_size=block_size)
        self.dev_dataset = TextDataset(tokenizer=self.tokenizer,
                                       file_path=self.dev,
                                       block_size=block_size)
        self.data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer, mlm=False)
Example No. 5
def load_dataset(train_path, test_path, tokenizer):
    train_dataset = TextDataset(
        tokenizer=tokenizer, file_path=train_path, block_size=128
    )

    test_dataset = TextDataset(tokenizer=tokenizer, file_path=test_path, block_size=128)

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )
    return train_dataset, test_dataset, data_collator
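The three returned objects are typically wired straight into a Trainer for causal-LM fine-tuning; a minimal sketch, assuming a model, a tokenizer, and the two text files already exist (the paths and training settings are illustrative):

from transformers import Trainer, TrainingArguments

train_dataset, test_dataset, data_collator = load_dataset('train.txt',
                                                           'test.txt',
                                                           tokenizer)
trainer = Trainer(model=model,
                  args=TrainingArguments(output_dir='./clm_out',
                                         num_train_epochs=1),
                  data_collator=data_collator,
                  train_dataset=train_dataset,
                  eval_dataset=test_dataset)
trainer.train()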
Example No. 6
    def _dataset(file_path, ref_path=None):
        if args.line_by_line:
            if ref_path is not None:
                if not args.whole_word_mask or not args.mlm:
                    raise ValueError(
                        "You need to set world whole masking and mlm to True for Chinese Whole Word Mask"
                    )
                return LineByLineWithRefDataset(
                    tokenizer=tokenizer,
                    file_path=file_path,
                    block_size=args.block_size,
                    ref_path=ref_path,
                )

            return LineByLineTextDataset(tokenizer=tokenizer,
                                         file_path=file_path,
                                         block_size=args.block_size)
        else:
            return TextDataset(
                tokenizer=tokenizer,
                file_path=file_path,
                block_size=args.block_size,
                overwrite_cache=args.overwrite_cache,
                cache_dir=cache_dir,
            )
Example No. 7
def get_dataset(args: DataTrainingArguments,
                tokenizer: PreTrainedTokenizer,
                max_len,
                evaluate=False):

    file_path = args.eval_data_file if evaluate else args.train_data_file

    if args.line_by_line:
        return LineByLineTextDataset(tokenizer=tokenizer,
                                     file_path=file_path,
                                     block_size=args.block_size)
    elif args.text_dataset:
        return TextDataset(tokenizer=tokenizer,
                           file_path=file_path,
                           block_size=args.block_size,
                           overwrite_cache=args.overwrite_cache)
    else:
        """
        When use common tab separated text dataset, use nlp.data.TSVDataset.
        If you want to use other type of dataset, refer to other class of nlp.data,
        or set DataTrainingArguments.line_by_line or DataTrainingArguments.text_dataset True.
        """
        dataset = nlp.data.TSVDataset(file_path,
                                      field_indices=[1],
                                      num_discard_samples=1)

        return Get_dataset(dataset, 0, tokenizer, max_len, True, False)
Example No. 8
    def test_plm(self):
        tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
        data_collator = DataCollatorForPermutationLanguageModeling(tokenizer)
        # ^ permutation lm

        dataset = LineByLineTextDataset(tokenizer,
                                        file_path=PATH_SAMPLE_TEXT,
                                        block_size=512)
        examples = [dataset[i] for i in range(len(dataset))]
        batch = data_collator(examples)
        self.assertIsInstance(batch, dict)
        self.assertEqual(batch["input_ids"].shape, torch.Size((31, 112)))
        self.assertEqual(batch["perm_mask"].shape, torch.Size((31, 112, 112)))
        self.assertEqual(batch["target_mapping"].shape,
                         torch.Size((31, 112, 112)))
        self.assertEqual(batch["labels"].shape, torch.Size((31, 112)))

        dataset = TextDataset(tokenizer,
                              file_path=PATH_SAMPLE_TEXT,
                              block_size=512,
                              overwrite_cache=True)
        examples = [dataset[i] for i in range(len(dataset))]
        batch = data_collator(examples)
        self.assertIsInstance(batch, dict)
        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 512)))
        self.assertEqual(batch["perm_mask"].shape, torch.Size((2, 512, 512)))
        self.assertEqual(batch["target_mapping"].shape,
                         torch.Size((2, 512, 512)))
        self.assertEqual(batch["labels"].shape, torch.Size((2, 512)))

        example = [torch.randint(5, [5])]
        with self.assertRaises(ValueError):
            # Expect error due to odd sequence length
            data_collator(example)
Example No. 9
def get_dataset(filepath,
                tokenizer,
                block_size,
                line_by_line=False,
                overwrite_cache=False):
    '''
    Load a dataset from the specified filepath.

    :param filepath:
        The filepath of the dataset.
    :param tokenizer:
        The tokenizer to parse the dataset with.
    :param block_size:
        The length of a single input sequence (block).
    :param line_by_line:
        Indicates whether distinct lines of text in the dataset are to be handled as
        separate sequences (i.e. whether to add the BOS and EOS tokens to each line).
        Defaults to False.
    :param overwrite_cache:
        Overwrite the cached training and evaluation sets. Defaults to False.
    :returns:
        A :class:`torch.utils.data.Dataset` object.

    '''

    if line_by_line:
        return LineByLineTextDataset(tokenizer=tokenizer,
                                     file_path=filepath,
                                     block_size=block_size)
    else:
        return TextDataset(tokenizer=tokenizer,
                           file_path=filepath,
                           block_size=block_size,
                           overwrite_cache=overwrite_cache)
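To make the line_by_line switch concrete: LineByLineTextDataset yields one example per non-empty line (truncated to block_size tokens), whereas TextDataset concatenates the whole file and slices it into fixed block_size chunks. An illustrative comparison, assuming a loaded tokenizer and a plain-text corpus.txt:

per_line = get_dataset('corpus.txt', tokenizer, block_size=128, line_by_line=True)
chunked = get_dataset('corpus.txt', tokenizer, block_size=128)
print(len(per_line), len(chunked))  # number of lines vs. number of 128-token blocks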
Example No. 10
def main():
    print("PREPROCESSING DATA")
    preprocess()
    print("LOADING TOKENIZER")
    tokenizer = get_tokenizer()
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=False)
    print("LOADING MODEL", cfg('model'))
    model = get_model(tokenizer)

    print("LOADING DATA")
    if cfg('encoding') == 'LBL':
        train_dataset = LBLDataset(tokenizer=tokenizer,
                                   file_path=filename('train'))
    elif cfg('encoding') == 'blocked':
        train_dataset = BlockedDataset(tokenizer=tokenizer,
                                       file_path=filename('train'))
    elif cfg('encoding') == 'text':
        train_dataset = TextDataset(tokenizer=tokenizer,
                                    file_path=filename('train'),
                                    block_size=cfg('max_block'))
    elif cfg('encoding').startswith('inter'):
        if cfg('encoding').endswith('LBL'):
            loader = LBLDataset
        elif cfg('encoding').endswith('blocked'):
            loader = BlockedDataset
        else:
            raise ValueError("Unknown interleaved encoding")

        d1 = loader(tokenizer=tokenizer, file_path=filename('train'))
        d2 = loader(tokenizer=tokenizer, file_path=filename('dirty'))
        train_dataset = CombinedDataset(d1, d2)
    else:
        raise ValueError("Unkown encoding")

    trainer = get_trainer(train_dataset, data_collator, model)

    def validator(x, y):
        global BEST_metric
        model.save_pretrained(session)
        metric, pred = validate(model, tokenizer, x, y)
        if np.mean(metric) > BEST_metric:
            print("NEW BEST (saving)")
            BEST_metric = np.mean(metric)

        # save predictions and model
        save(session + "metric.txt", str(metric) + "\n")
        save(session + "pred.txt", str(pred) + "\n\n")
        return metric, pred

    trainer.validator = validator
    trainer.val_dataset = get_validation_data()

    # saving configuration
    print("SAVING...")
    session = get_session_path()
    print(session)
    save(session + "conf.txt", repr(cfg()))

    print("STARTING TRAINING...")
    trainer.train()
Example No. 11
def load_dataset(train_path, test_path, tokenizer):
    """
    Loads training and validation data from text files into a TextDataset.
    """
    train_dataset = TextDataset(tokenizer=tokenizer,
                                file_path=train_path,
                                block_size=128)

    test_dataset = TextDataset(tokenizer=tokenizer,
                               file_path=test_path,
                               block_size=128)

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )
    return train_dataset, test_dataset, data_collator
Example No. 12
def get_dataset(args: DataTrainingArguments, tokenizer: PreTrainedTokenizer, evaluate=False):
    file_path = args.eval_data_file if evaluate else args.train_data_file
    if args.line_by_line:
        return LineByLineTextDataset(tokenizer=tokenizer, file_path=file_path, block_size=args.block_size)
    else:
        return TextDataset(
            tokenizer=tokenizer, file_path=file_path, block_size=args.block_size, overwrite_cache=args.overwrite_cache
        )
Example No. 13
    def pretrain_and_evaluate(args,
                              model,
                              tokenizer,
                              eval_only,
                              model_path=None,
                              block_size=''):
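        # tokenizer.max_len is deprecated in newer transformers releases; model_max_length is the replacement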
        val_dataset = TextDataset(tokenizer=tokenizer,
                                  file_path=args.val_datapath,
                                  block_size=tokenizer.max_len)
        if eval_only:
            train_dataset = val_dataset
        else:
            logger.info(
                f'Loading and tokenizing training data is usually slow: {args.train_datapath}'
            )
            train_dataset = TextDataset(tokenizer=tokenizer,
                                        file_path=args.train_datapath,
                                        block_size=tokenizer.max_len)

        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                        mlm=True,
                                                        mlm_probability=0.15)
        trainer = Trainer(
            model=model,
            args=args,
            data_collator=data_collator,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            prediction_loss_only=True,
        )

        eval_loss = trainer.evaluate()
        eval_loss = eval_loss['eval_loss']
        logger.info(f'Initial eval bpc: {eval_loss/math.log(2)}')
        logger.info(f'Initial perplexity: {math.exp(eval_loss)}')

        if not eval_only:
            trainer.train(model_path=model_path)
            trainer.save_model()

            eval_loss = trainer.evaluate()
            eval_loss = eval_loss['eval_loss']
            logger.info(f'Eval bpc after pretraining: {eval_loss/math.log(2)}')
            logger.info(
                f'Eval perplexity after pretraining: {math.exp(eval_loss)}')
Example No. 14
def get_dataset(file_path,
                tokenizer: PreTrainedTokenizer,
                block_size: int = None) -> TextDataset:
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size if block_size is not None else tokenizer.max_len,
        overwrite_cache=True,
    )
Example No. 15
    def _load(self):
        """Load Tokenizer, Dataset, Model and Jit input example
        """
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_checkpoint)
        self.data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=False,
        )
        self.gpt2_dataset = dict(train=TextDataset(tokenizer=self.tokenizer,
                                                   file_path=self.dataset,
                                                   block_size=128),
                                 validation=TextDataset(
                                     tokenizer=self.tokenizer,
                                     file_path=self.dataset,
                                     block_size=128))

        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_checkpoint)
Example No. 16
def get_dataset(args: DataTrainingArguments, tokenizer: PreTrainedTokenizer, evaluate=False, local_rank=-1):
    file_path = args.eval_data_file if evaluate else args.train_data_file
    if args.line_by_line:
        return LineByLineTextDataset(
            tokenizer=tokenizer, file_path=file_path, block_size=args.block_size, local_rank=local_rank
        )
    else:
        return TextDataset(
            tokenizer=tokenizer, file_path=file_path, block_size=args.block_size, local_rank=local_rank,
        )
Example No. 17
    def _dataset(file_path):
        if args.line_by_line:
            return LineByLineTextDataset(tokenizer=tokenizer, file_path=file_path, block_size=args.block_size)
        else:
            return TextDataset(
                tokenizer=tokenizer,
                file_path=file_path,
                block_size=args.block_size,
                overwrite_cache=args.overwrite_cache,
            )
Example No. 18
def pretrain_and_evaluate(training_args, model, tokenizer, eval_only,
                          model_path_out):
    logger.info(f'Loading and tokenizing data is usually slow: {VAL_FPATH}')

    val_dataset = TextDataset(tokenizer=tokenizer,
                              file_path=VAL_FPATH,
                              block_size=tokenizer.max_len)

    if eval_only:
        train_dataset = val_dataset
    else:
        logger.info(
            f'Loading and tokenizing training data is usually slow: {TRAIN_FPATH}'
        )
        train_dataset = TextDataset(tokenizer=tokenizer,
                                    file_path=TRAIN_FPATH,
                                    block_size=tokenizer.max_len)

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=True,
                                                    mlm_probability=0.15)

    logger.warning(f'Model Params set to {training_args}')

    trainer = Trainer(model=model,
                      args=training_args,
                      data_collator=data_collator,
                      train_dataset=train_dataset,
                      eval_dataset=val_dataset,
                      prediction_loss_only=True)

    eval_loss = trainer.evaluate()
    eval_loss = eval_loss['eval_loss']
    logger.info(f'Initial eval bpc: {eval_loss/math.log(2)}')

    if not eval_only:
        trainer.train(model_path=model_path_out)
        trainer.save_model()

        eval_loss = trainer.evaluate()
        eval_loss = eval_loss['eval_loss']
        logger.info(f'Eval bpc after pretraining: {eval_loss/math.log(2)}')
Example No. 19
def _load_dataset():

    train_dataset = TextDataset(tokenizer=tokenizer,
                                file_path=cfg.DATA,
                                block_size=128)

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )
    return train_dataset, data_collator
Example No. 20
def get_dataset(args: DataTrainingArguments, tokenizer: PreTrainedTokenizer, model_args: ModelArguments, evaluate=False):
    file_path = args.eval_data_file if evaluate else args.train_data_file
    if args.line_by_line:
        if args.mlm_sample_times > 1:
            return FullyLineByLineTextDataset(tokenizer=tokenizer, file_path=file_path, block_size=args.block_size, cache_dir=model_args.cache_dir)
        else:
            return LineByLineTextDataset(tokenizer=tokenizer, file_path=file_path, block_size=args.block_size, cache_dir=model_args.cache_dir)
    else:
        return TextDataset(
            tokenizer=tokenizer, file_path=file_path, block_size=args.block_size, overwrite_cache=args.overwrite_cache
        )
Example No. 21
def load_dataset(path, tokenizer, seq_len):

    print('loading dataset')
    dataset = TextDataset(tokenizer=tokenizer,
                          file_path=path,
                          block_size=seq_len)

    print('dataset loaded')

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )
    print('data collator ready')

    return dataset, data_collator
Example No. 22
def get_dataset(
    args: DataTrainingArguments,
    tokenizer: PreTrainedTokenizer,
    evaluate: bool = False,
    cache_dir: Optional[str] = None,
):
    file_path = args.eval_data_file if evaluate else args.train_data_file
    if args.line_by_line:
        return LineByLineTextDataset(tokenizer=tokenizer, file_path=file_path, block_size=args.block_size)
    else:
        return TextDataset(
            tokenizer=tokenizer,
            file_path=file_path,
            block_size=args.block_size,
            overwrite_cache=args.overwrite_cache,
            cache_dir=cache_dir,
        )
Example No. 23
def get_dataset(
    args: DataTrainingArguments,
    tokenizer: PreTrainedTokenizer,
    evaluate: bool = False,
    cache_dir: Optional[str] = None,
):
    file_path = args.eval_data_file if evaluate else args.train_data_file
    if args.line_by_line:
        print("entering get_dataset function with line_by_line")
        import glob
        if evaluate:
            files = glob.glob(file_path + 'shard_*')
        else:
            files = glob.glob(file_path + 'shard_*')
        from datasets import load_dataset
        dataset = load_dataset('text', data_files=files)

        # Passing the whole dataset file (11 GB) directly to load_dataset caused RAM issues, so the file is read in shards.

        #Tokenization
        def encode(examples):
            #return tokenizer(examples['text'], truncation=True, padding='max_length')
            return tokenizer(examples['text'],
                             add_special_tokens=True,
                             truncation=True,
                             max_length=args.block_size)

        dataset = dataset.map(encode, batched=True)
        #print("flag1: ", dataset)
        #dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])
        newdataset = dataset['train']
        #print("flag2: ", newdataset)
        input_ids = newdataset['input_ids']
        #print("flag3: ", input_ids)
        dataset.set_format(type='torch', columns=['input_ids'])
        #print("flag4: ", dataset)
        return input_ids
        #return LineByLineTextDataset(tokenizer=tokenizer, file_path=file_path, block_size=args.block_size)
    else:
        return TextDataset(
            tokenizer=tokenizer,
            file_path=file_path,
            block_size=args.block_size,
            overwrite_cache=args.overwrite_cache,
            cache_dir=cache_dir,
        )
Example No. 24
    def test_lm_tokenizer_without_padding(self):
        tokenizer = AutoTokenizer.from_pretrained("gpt2")
        data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
        # ^ causal lm

        dataset = LineByLineTextDataset(tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=512)
        examples = [dataset[i] for i in range(len(dataset))]
        with self.assertRaises(ValueError):
            # Expect error due to padding token missing on gpt2:
            data_collator.collate_batch(examples)

        dataset = TextDataset(tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=512, overwrite_cache=True)
        examples = [dataset[i] for i in range(len(dataset))]
        batch = data_collator.collate_batch(examples)
        self.assertIsInstance(batch, dict)
        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 512)))
        self.assertEqual(batch["labels"].shape, torch.Size((2, 512)))
Example No. 25
    def test_lm_tokenizer_with_padding(self):
        tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")
        data_collator = DataCollatorForLanguageModeling(tokenizer)
        # ^ masked lm

        dataset = LineByLineTextDataset(tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=512)
        examples = [dataset[i] for i in range(len(dataset))]
        batch = data_collator.collate_batch(examples)
        self.assertIsInstance(batch, dict)
        self.assertEqual(batch["input_ids"].shape, torch.Size((31, 107)))
        self.assertEqual(batch["masked_lm_labels"].shape, torch.Size((31, 107)))

        dataset = TextDataset(tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=512, overwrite_cache=True)
        examples = [dataset[i] for i in range(len(dataset))]
        batch = data_collator.collate_batch(examples)
        self.assertIsInstance(batch, dict)
        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 512)))
        self.assertEqual(batch["masked_lm_labels"].shape, torch.Size((2, 512)))
Example No. 26
    def _get_dataset(
        self,
        file_path: str,
        line_by_line: bool,
        block_size: int,
        overwrite_cache: bool,
    ) -> Dataset:
        if line_by_line:
            return LineByLineTextDataset(
                tokenizer=self.tokenizer, file_path=file_path, block_size=block_size
            )
        else:
            return TextDataset(
                tokenizer=self.tokenizer,
                file_path=file_path,
                block_size=block_size,
                overwrite_cache=overwrite_cache,
            )
Example No. 27
def finetune(tag):
    """fine-tune gpt2 on the given caption dataset"""
    global tokenizer
    config = AutoConfig.from_pretrained('gpt2')
    model = AutoModelWithLMHead.from_pretrained('gpt2', config=config)
    block_size = tokenizer.max_len
    # https://github.com/huggingface/transformers/blob/448c467256332e4be8c122a159b482c1ef039b98/src/transformers/data/datasets/language_modeling.py
    try:
        train_dataset = TextDataset(
            tokenizer=tokenizer,
            file_path=f'./text/training_text/{tag}.txt',
            block_size=block_size,
            overwrite_cache=True)
        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                        mlm=False)
        epochs = 8
        training_args = TrainingArguments(output_dir='logging/output',
                                          overwrite_output_dir=True,
                                          do_train=True,
                                          num_train_epochs=epochs,
                                          gradient_accumulation_steps=1,
                                          learning_rate=1e-4,
                                          per_gpu_train_batch_size=1,
                                          logging_steps=50,
                                          save_steps=0)
        set_seed(training_args.seed)
        trainer = Trainer(model=model,
                          args=training_args,
                          data_collator=data_collator,
                          train_dataset=train_dataset,
                          prediction_loss_only=True)
        with open(f'./logging/training_stats/training_{tag}.log', 'w') as log:
            sys.stdout = log
            trainer.train()
        sys.stdout = sys.__stdout__
        if not os.path.exists(f'./trained_models/{tag}/'):
            os.makedirs(f'./trained_models/{tag}/')
        # save the model
        model.save_pretrained(f'./trained_models/{tag}/')
        print('Done!')
    except AssertionError:
        print(
            f'The training text with the tag = {tag} does not exist. No model was trained!'
        )
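Once a tag has been fine-tuned and saved, the model can be reloaded for generation; a minimal sketch where the 'sunset' tag and the sampling settings are purely illustrative, reusing the global tokenizer from the example above:

from transformers import AutoModelWithLMHead

model = AutoModelWithLMHead.from_pretrained('./trained_models/sunset/')
inputs = tokenizer('a photo of', return_tensors='pt')
outputs = model.generate(inputs['input_ids'],
                         max_length=30,
                         do_sample=True,
                         top_p=0.9,
                         pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))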
Example No. 28
def get_dataset(args: DataTrainingArguments,
                model_name_or_path,
                tokenizer: PreTrainedTokenizer,
                evaluate=False):
    file_path = args.eval_data_file if evaluate else args.train_data_file
    if args.line_by_line:
        if 'dialogpt' in model_name_or_path.lower():
            return LineByLinePersonaChatDataset(tokenizer=tokenizer,
                                                file_path=file_path,
                                                block_size=args.block_size)
        else:
            return LineByLineTextDataset(tokenizer=tokenizer,
                                         file_path=file_path,
                                         block_size=args.block_size)
    else:
        return TextDataset(tokenizer=tokenizer,
                           file_path=file_path,
                           block_size=args.block_size,
                           overwrite_cache=args.overwrite_cache)
Example No. 29
def get_dataset(args: DataTrainingArguments,
                tokenizer: PreTrainedTokenizer,
                inline_meta: str = None,
                local_rank=-1):
    file_path = args.eval_data_file
    if args.webtext:
        return WebTextPretokenizedDataset(tokenizer=tokenizer,
                                          file_path=file_path,
                                          block_size=args.block_size,
                                          inline_meta=inline_meta,
                                          local_rank=local_rank)
    elif args.line_by_line:
        return LineByLineTextDataset(tokenizer=tokenizer,
                                     file_path=file_path,
                                     block_size=args.block_size,
                                     local_rank=local_rank)
    else:
        return TextDataset(
            tokenizer=tokenizer,
            file_path=file_path,
            block_size=args.block_size,
            local_rank=local_rank,
        )
Example No. 30
    def __init__(self, file_path, tokenizer):
        self.ds = TextDataset(file_path=file_path,
                              tokenizer=tokenizer,
                              block_size=64)