Example #1
    def __init__(
        self,
        model_name: str,
        mlm: bool = True,
        mlm_probability: float = 0.15,
        field_name: str = "source",
        namespace: str = "tokens",
    ):
        self._field_name = field_name
        self._namespace = namespace
        from allennlp.common import cached_transformers

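        # cached_transformers keeps a per-name cache, so the tokenizer is only loaded once.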
        tokenizer = cached_transformers.get_tokenizer(model_name)
        self._collator = DataCollatorForLanguageModeling(
            tokenizer, mlm, mlm_probability)
Example #2
    def get_loaders(
        self,
        stage: str,
        epoch: int = None,
    ) -> "OrderedDict[str, DataLoader]":
        """
        Returns the loaders for a given stage.

        Args:
            stage: stage name
            epoch: epoch index

        Returns:
            Ordered dict of loaders for the stage
        """
        data_params = dict(self.stages_config[stage]["data_params"])
        model_name = data_params["model_name"]
        tokenizer = AutoTokenizer.from_pretrained(model_name)
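        # With its defaults (mlm=True, mlm_probability=0.15) the collator masks tokens dynamically per batch.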
        collate_fn = DataCollatorForLanguageModeling(tokenizer)
        loaders_params = {
            "train": {
                "collate_fn": collate_fn
            },
            "valid": {
                "collate_fn": collate_fn
            },
        }
        loaders = utils.get_loaders_from_params(
            get_datasets_fn=self.get_datasets,
            initial_seed=self.initial_seed,
            stage=stage,
            loaders_params=loaders_params,
            **data_params,
        )

        return loaders
Example #3
class LanguageModelingDataCollator(DataCollator):
    """
    Registered as a `DataCollator` with name `LanguageModelingDataCollator`.
    Used for language modeling.
    """
    def __init__(
        self,
        model_name: str,
        mlm: bool = True,
        mlm_probability: float = 0.15,
        field_name: str = "source",
        namespace: str = "tokens",
    ):
        self._field_name = field_name
        self._namespace = namespace
        from allennlp.common import cached_transformers

        tokenizer = cached_transformers.get_tokenizer(model_name)
        self._collator = DataCollatorForLanguageModeling(
            tokenizer, mlm, mlm_probability)

    def __call__(self, instances: List[Instance]) -> TensorDict:
        tensor_dicts = allennlp_collate(instances)
        tensor_dicts = self.process_tokens(tensor_dicts)
        return tensor_dicts

    def process_tokens(self, tensor_dicts: TensorDict) -> TensorDict:
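        # Run HF's MLM masking over the collated token ids and keep the generated labels next to them.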
        inputs = tensor_dicts[self._field_name][self._namespace]["token_ids"]
        inputs, labels = self._collator.mask_tokens(inputs)
        tensor_dicts[self._field_name][self._namespace]["token_ids"] = inputs
        tensor_dicts[self._field_name][self._namespace]["labels"] = labels
        return tensor_dicts
Example #4
    def _train_model(self, model, tokenizer, train_dataset, val_dataset, **train_kwargs):
        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True)
        train_args = self._get_train_args(**train_kwargs)
        trainer = transformers.Trainer(
            model=model,
            args=train_args,
            data_collator=data_collator,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
        )
        trainer.train()
Example #5
    def __init__(
        self,
        model_name: str,
        mlm: bool = True,
        mlm_probability: float = 0.15,
        field_name: str = "source",
        namespace: str = "tokens",
    ):
        self._field_name = field_name
        self._namespace = namespace
        from allennlp.common import cached_transformers

        tokenizer = cached_transformers.get_tokenizer(model_name)
        self._collator = DataCollatorForLanguageModeling(tokenizer, mlm, mlm_probability)
        if hasattr(self._collator, "mask_tokens"):
            # For compatibility with transformers < 4.10
            self._mask_tokens = self._collator.mask_tokens
        else:
            self._mask_tokens = self._collator.torch_mask_tokens
Example #6
def test_is_running():
    """Test if perplexity is running normal"""
    tok = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    model = AutoModelWithLMHead.from_pretrained("distilbert-base-uncased")
    dataset = LanguageModelingDataset(texts, tok)
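    # .collate_batch is the API of older transformers releases; newer versions make the collator itself callable as collate_fn.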
    collate_fn = DataCollatorForLanguageModeling(tok).collate_batch
    dataloader = torch.utils.data.DataLoader(dataset, collate_fn=collate_fn)
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)

    runner = HuggingFaceRunner()
    runner.train(
        model=model,
        optimizer=optimizer,
        loaders={"train": dataloader},
        callbacks={
            "optimizer": dl.OptimizerCallback(),
            "perplexity": PerplexityMetricCallback(),
        },
        check=True,
    )
Example #7
    def __init__(self,
                 dataset: Dataset,
                 device: torch.device,
                 num_labels: int = 2,
                 averaging: str = 'binary',
                 pad_token_id: int = None,
                 mlm: bool = False,
                 multi_gpu: bool = False,
                 sequence_modeling: bool = False,
                 ensemble_edu: bool = False,
                 ensemble_sent: bool = False):
        self.dataset = dataset
        if isinstance(dataset, Subset):
            self.all_labels = list(dataset.dataset.getLabels(dataset.indices))
        else:
            self.all_labels = dataset.getLabels()
        if sequence_modeling:
            collator = collate_sequence_batch_transformer
        else:
            collator = collate_batch_transformer

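        # For masked language modeling, let the HF collator handle padding and dynamic masking.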
        if mlm:
            collate_fn = DataCollatorForLanguageModeling(dataset.tokenizer)
        elif pad_token_id is None:
            collate_fn = partial(collator, dataset.tokenizer.pad_token_id)
        else:
            collate_fn = partial(collator, pad_token_id)

        self.dataloader = DataLoader(dataset,
                                     batch_size=32,
                                     collate_fn=collate_fn)
        self.device = device
        self.averaging = averaging
        self.num_labels = num_labels
        self.mlm = mlm
        self.pad_token_id = pad_token_id
        self.multi_gpu = multi_gpu
        self.sequence_modeling = sequence_modeling
        self.ensemble_edu = ensemble_edu
        self.ensemble_sent = ensemble_sent
Example #8
def test_runner():
    """Test that runner executes"""
    train_df = pd.read_csv("data/train.csv")
    valid_df = pd.read_csv("data/valid.csv")
    teacher_config = AutoConfig.from_pretrained("bert-base-uncased",
                                                output_hidden_states=True,
                                                output_logits=True)
    teacher = BertForMaskedLM.from_pretrained("bert-base-uncased",
                                              config=teacher_config)

    student_config = AutoConfig.from_pretrained(
        "distilbert-base-uncased",
        output_hidden_states=True,
        output_logits=True,
    )
    student = DistilBertForMaskedLM.from_pretrained("distilbert-base-uncased",
                                                    config=student_config)

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    train_dataset = LanguageModelingDataset(train_df["text"], tokenizer)
    valid_dataset = LanguageModelingDataset(valid_df["text"], tokenizer)

    collate_fn = DataCollatorForLanguageModeling(tokenizer)
    train_dataloader = DataLoader(train_dataset,
                                  collate_fn=collate_fn,
                                  batch_size=2)
    valid_dataloader = DataLoader(valid_dataset,
                                  collate_fn=collate_fn,
                                  batch_size=2)
    loaders = {"train": train_dataloader, "valid": valid_dataloader}

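    # Distillation losses (masked-LM, MSE, cosine, KL-divergence) are aggregated as a weighted sum.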
    callbacks = {
        "masked_lm_loss": MaskedLanguageModelCallback(),
        "mse_loss": MSELossCallback(),
        "cosine_loss": CosineLossCallback(),
        "kl_div_loss": KLDivLossCallback(),
        "loss": MetricAggregationCallback(
            prefix="loss",
            mode="weighted_sum",
            metrics={
                "cosine_loss": 1.0,
                "masked_lm_loss": 1.0,
                "kl_div_loss": 1.0,
                "mse_loss": 1.0,
            },
        ),
        "optimizer": dl.OptimizerCallback(),
        "perplexity": PerplexityMetricCallbackDistillation(),
    }

    model = torch.nn.ModuleDict({"teacher": teacher, "student": student})
    runner = DistilMLMRunner()
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
    runner.train(
        model=model,
        optimizer=optimizer,
        loaders=loaders,
        verbose=True,
        check=True,
        callbacks=callbacks,
    )
    assert True
Example #9
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    # Example arguments:
    #   --output_dir=output --model_type=gpt2 --model_name_or_path=gpt2 --do_train
    #   --train_data_file=/wiki.train.raw --do_eval --eval_data_file=/wiki.test.raw

    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name,
                                            cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path,
                                            cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it, "
            "and load it from here, using --tokenizer_name")


    # Download/load the pretrained weights.
    if model_args.model_name_or_path:
        model = AutoModelWithLMHead.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelWithLMHead.from_config(config)

    model.resize_token_embeddings(len(tokenizer))  # Existing token embeddings are reused; newly added ones are freshly initialized.

    if config.model_type in ["bert", "roberta", "distilbert", "camembert"] and not data_args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the "
            "--mlm flag (masked language modeling).")

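    # Note: tokenizer.max_len is the attribute used by older transformers releases; newer versions expose model_max_length instead.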
    if data_args.block_size <= 0:
        data_args.block_size = tokenizer.max_len
        # Our input block size will be the max possible for the model
    else:
        data_args.block_size = min(data_args.block_size, tokenizer.max_len)

    # Get datasets

    train_dataset = get_dataset(
        data_args, tokenizer=tokenizer) if training_args.do_train else None
    eval_dataset = get_dataset(
        data_args, tokenizer=tokenizer,
        evaluate=True) if training_args.do_eval else None
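    # XLNet is pretrained with permutation language modeling, so it uses the dedicated PLM collator.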
    if config.model_type == "xlnet":
        data_collator = DataCollatorForPermutationLanguageModeling(
            tokenizer=tokenizer,
            plm_probability=data_args.plm_probability,
            max_span_length=data_args.max_span_length,
        )
    else:
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=data_args.mlm,
            mlm_probability=data_args.mlm_probability)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        prediction_loss_only=True,
    )

    # Training
    if training_args.do_train:
        model_path = (model_args.model_name_or_path
                      if model_args.model_name_or_path is not None
                      and os.path.isdir(model_args.model_name_or_path) else
                      None)
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        result = {"perplexity": perplexity}

        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results_lm.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

        results.update(result)

    return results
Example #10
def get_data_collator():
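    # mlm defaults to True, so this collator performs standard dynamic masked-LM masking (15% of tokens).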
    return DataCollatorForLanguageModeling(tokenizer=tokenizer)
Example #11
from distiller import Distiller

# Load the dataset
df = pd.read_csv("./data/SST-2/train.tsv", encoding='utf-8', sep='\t')
# The full dataframe has 67,349 rows; since we are running on CPU we only take the first 3,000
train_df = df.iloc[:3000]

# Get the teacher's tokenizer and prepare the data
teacher_model_name = "bert-base-uncased"
student_model_name = "distilbert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(teacher_model_name)

dataset = LanguageModelingDataset(train_df["sentence"],
                                  teacher_model_name,
                                  sort=False)
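# The collator masks tokens on the fly and returns the masked inputs together with MLM labels.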
collate_fn = DataCollatorForLanguageModeling(tokenizer)
dataloader = DataLoader(dataset, collate_fn=collate_fn, batch_size=32)

# Get the teacher and student models
teacher = BertForMaskedLM.from_pretrained(teacher_model_name)
student = DistilBertForMaskedLM.from_pretrained(student_model_name)

# Parameters needed for training
params = {
    "n_epoch": 3,
    "temperature": 2.0,
    "alpha_ce": 0.5,
    "alpha_mlm": 2.0,
    "alpha_cos": 1.0,
    "alpha_mse": 1.0,
    "gradient_accumulation_steps": 50,