Exemplo n.º 1
0
def get_modules(params_dict):
    """Build seeded HuggingFace self-attention modules keyed by model family.

    ``params_dict`` is deep-copied before any key is renamed, so the caller's
    dictionary is left untouched.  Its ``"dropout"`` entry is renamed to the
    keyword each config class is called with; for DistilBERT the
    ``"num_attention_heads"`` and ``"hidden_size"`` entries are renamed too.

    Returns:
        dict with the keys ``"bert"``, ``"roberta"``, ``"electra"`` and
        ``"distilbert"``, each mapped to a newly constructed module.
    """
    modules = {}

    # bert, roberta and electra are configured with identical keyword names.
    shared_kwargs = copy.deepcopy(params_dict)
    shared_kwargs["attention_probs_dropout_prob"] = shared_kwargs.pop("dropout")

    shared_families = (
        ("bert", BertSelfAttention, BertConfig),
        ("roberta", RobertaSelfAttention, RobertaConfig),
        ("electra", ElectraSelfAttention, ElectraConfig),
    )
    for family, attention_cls, config_cls in shared_families:
        # Re-seed before every construction so each module starts from the
        # same RNG state.
        torch.manual_seed(1234)
        modules[family] = attention_cls(config_cls(**shared_kwargs))

    # DistilBERT's config is called with differently named keywords.
    torch.manual_seed(1234)
    distil_kwargs = copy.deepcopy(params_dict)
    distil_renames = (
        ("n_heads", "num_attention_heads"),
        ("dim", "hidden_size"),
        ("attention_dropout", "dropout"),
    )
    for distil_key, generic_key in distil_renames:
        distil_kwargs[distil_key] = distil_kwargs.pop(generic_key)
    modules["distilbert"] = MultiHeadSelfAttention(
        DistilBertConfig(**distil_kwargs))

    return modules
Exemplo n.º 2
0
def test_replace_embeddings():
    """
    Check replacing a TFBertEmbeddings layer with IpuTFBertEmbeddings.

    TFBertEmbeddings takes extra arguments in its constructor but does not
    override get_config(), so the replacement's constructor arguments and a
    weight-copy function are supplied explicitly.
    :return: None
    """
    batch_size, sequence_length = 2, 128
    config = BertConfig()

    def copy_weights_tf_bert_embeddings(layer, new_layer):
        # Build the replacement layer, then copy the original layer's weights.
        input_shape = (batch_size, sequence_length, new_layer.hidden_size)
        new_layer.build(input_shape)
        new_layer.set_weights(layer.get_weights())

    set_random_seeds()
    replacement_spec = {
        "new_class": IpuTFBertEmbeddings,
        "new_params": {"config": config, "serialization_factor": 2},
        "copy_weights": True,
        "copy_weights_func": copy_weights_tf_bert_embeddings,
    }
    to_replace_dict = {TFBertEmbeddings: replacement_spec}
    model_func_args = dict(len_seq=sequence_length, config=config)

    # Configure the IPU system before entering the strategy scope.
    ipu_config = ipu.config.IPUConfig()
    ipu_config.device_connection.type = (
        ipu.config.DeviceConnectionType.ON_DEMAND)
    ipu_config.configure_ipu_system()

    with ipu.ipu_strategy.IPUStrategy().scope():
        check_replace_layers(to_replace_dict,
                             func_model_tf_bert_embeddings,
                             model_func_args,
                             batch_size)
Exemplo n.º 3
0
    def __init__(self,
                 args,
                 tokenizer: BertTokenizer,
                 object_features_variant=False,
                 positional_embed_variant=False,
                 latent_transformer=False):
        """Assemble the image projection, BERT embeddings, encoder and decoder.

        Args:
            args: Run configuration; stored as ``self.args`` and handed to
                ``ImageTransformerEncoder`` when that variant is enabled.
            tokenizer: Tokenizer stored as ``self.tokenizer``.
            object_features_variant: When truthy, an
                ``ImageTransformerEncoder`` is created as
                ``self.image_transformer``.  When falsy that attribute is not
                set by this constructor.
            positional_embed_variant: Its truthiness is stored as the boolean
                ``self.positional_embed``.
            latent_transformer: Stored unchanged as ``self.latent_transformer``.
        """
        super().__init__()

        self.args = args
        self.tokenizer = tokenizer

        # Linear map from 512 to 768 features followed by batch normalisation.
        self.image_projection = nn.Sequential(
            nn.Linear(512, 768), nn.BatchNorm1d(768, momentum=0.01))

        # Every BERT component below is built from the same checkpoint name.
        pretrained_name = 'bert-base-uncased'

        config = BertConfig.from_pretrained(pretrained_name)
        self.embeddings = BertEmbeddings(config)

        self.text_encoder = BertModel.from_pretrained(pretrained_name,
                                                      return_dict=True)
        self.decoder = BertLMHeadModel.from_pretrained(
            pretrained_name,
            is_decoder=True,
            use_cache=True,
            add_cross_attention=True)

        if object_features_variant:
            self.image_transformer = ImageTransformerEncoder(args)

        self.positional_embed = bool(positional_embed_variant)

        self.latent_transformer = latent_transformer
Exemplo n.º 4
0
    def __init__(self, add_pooling_layer=True):
        """Build the model from a default-constructed ``BertConfig``.

        Args:
            add_pooling_layer: When truthy a ``BertPooler`` is attached as
                ``self.pooler``; otherwise ``self.pooler`` is ``None``.
        """
        default_config = BertConfig()
        super().__init__(default_config)
        self.config = default_config

        # Submodules are created in the order embeddings, encoder, pooler.
        self.embeddings = BertEmbeddings(default_config)
        self.encoder = BertEncoder(default_config)
        if add_pooling_layer:
            self.pooler = BertPooler(default_config)
        else:
            self.pooler = None

        self.init_weights()
Exemplo n.º 5
0
def get_attention_modules():
    """Yield ``(name, module)`` pairs of seeded attention blocks in eval mode.

    The keyword arguments come from a deep copy of ``ATTENTION_PARAMS_DICT``
    with its ``"attention_dropout"`` and ``"hidden_dropout"`` entries renamed
    to the keywords the config classes are called with.  Pairs are produced
    for ``"bert"``, ``"roberta"`` and ``"electra"``, in that order.
    """
    hf_kwargs = copy.deepcopy(ATTENTION_PARAMS_DICT)
    renames = (
        ("attention_probs_dropout_prob", "attention_dropout"),
        ("hidden_dropout_prob", "hidden_dropout"),
    )
    for hf_key, generic_key in renames:
        hf_kwargs[hf_key] = hf_kwargs.pop(generic_key)

    families = (
        ("bert", BertAttention, BertConfig),
        ("roberta", RobertaAttention, RobertaConfig),
        ("electra", ElectraAttention, ElectraConfig),
    )
    for family, attention_cls, config_cls in families:
        # Re-seed right before each construction so every module starts from
        # the same RNG state.
        torch.manual_seed(1234)
        yield family, attention_cls(config_cls(**hf_kwargs)).eval()
Exemplo n.º 6
0
def get_layer_modules():
    """Yield ``(name, module)`` pairs of seeded transformer layers in eval mode.

    The keyword arguments come from a deep copy of ``LAYER_PARAMS_DICT`` with
    its ``"attention_dropout"``, ``"hidden_dropout"`` and ``"activation"``
    entries renamed to the keywords the config classes are called with.
    Pairs are produced for ``"bert"``, ``"roberta"`` and ``"electra"``, in
    that order.
    """
    hf_kwargs = copy.deepcopy(LAYER_PARAMS_DICT)
    renames = (
        ("attention_probs_dropout_prob", "attention_dropout"),
        ("hidden_dropout_prob", "hidden_dropout"),
        ("hidden_act", "activation"),
    )
    for hf_key, generic_key in renames:
        hf_kwargs[hf_key] = hf_kwargs.pop(generic_key)

    families = (
        ("bert", BertLayer, BertConfig),
        ("roberta", RobertaLayer, RobertaConfig),
        ("electra", ElectraLayer, ElectraConfig),
    )
    for family, layer_cls, config_cls in families:
        # Re-seed right before each construction so every layer starts from
        # the same RNG state.
        torch.manual_seed(1234)
        yield family, layer_cls(config_cls(**hf_kwargs)).eval()
Exemplo n.º 7
0
def get_modules():
    """Yield seeded ``(name, module)`` pairs for BERT and ALBERT embeddings.

    Each family gets its own deep copy of ``PARAMS_DICT``.  For BERT the
    ``"dropout"`` and ``"embedding_size"`` entries are renamed; for ALBERT
    only ``"dropout"`` is renamed and ``"embedding_size"`` is passed through
    under its original name.
    """
    bert_kwargs = copy.deepcopy(PARAMS_DICT)
    bert_renames = (
        ("hidden_dropout_prob", "dropout"),
        ("hidden_size", "embedding_size"),
    )
    for hf_key, generic_key in bert_renames:
        bert_kwargs[hf_key] = bert_kwargs.pop(generic_key)

    # Re-seed before each construction so both modules start from the same
    # RNG state.
    torch.manual_seed(1234)
    yield "bert", BertEmbeddings(BertConfig(**bert_kwargs))

    albert_kwargs = copy.deepcopy(PARAMS_DICT)
    albert_kwargs["hidden_dropout_prob"] = albert_kwargs.pop("dropout")

    torch.manual_seed(1234)
    yield "albert", AlbertEmbeddings(AlbertConfig(**albert_kwargs))
Exemplo n.º 8
0
def test_replace_lm_prediction_head():
    """
    Check replacing a TFBertLMPredictionHead layer with
    IpuTFBertLMPredictionHead.
    :return: None
    """
    batch_size, sequence_length = 2, 128
    config = BertConfig()

    # Configure the IPU system before entering the strategy scope.
    ipu_config = ipu.config.IPUConfig()
    ipu_config.device_connection.type = (
        ipu.config.DeviceConnectionType.ON_DEMAND)
    ipu_config.configure_ipu_system()

    with ipu.ipu_strategy.IPUStrategy().scope():

        model = get_bert_lm_prediction_head_model(config,
                                                  TFBertLMPredictionHead)

        def copy_weights_tf_bert_lm_prediction_head(layer, new_layer):
            # Build the replacement layer, then copy the original's weights.
            input_shape = (batch_size, sequence_length,
                           new_layer.hidden_size)
            new_layer.build(input_shape)
            new_layer.set_weights(layer.get_weights())

        set_random_seeds()
        replacement_spec = {
            "new_class": IpuTFBertLMPredictionHead,
            "new_params": {
                "config": config,
                # Passed as a callable that reads model.embedding when called.
                "input_embeddings": lambda: model.embedding,
                "serialization_factor": 2,
            },
            "copy_weights": True,
            "copy_weights_func": copy_weights_tf_bert_lm_prediction_head,
        }
        to_replace_dict = {TFBertLMPredictionHead: replacement_spec}
        model_func_args = dict(len_seq=sequence_length,
                               config=config,
                               model=model)
        check_replace_layers(to_replace_dict,
                             func_model_tf_bert_lm_prediction_head,
                             model_func_args,
                             batch_size)
Exemplo n.º 9
0
def get_modules(params_dict):
    """Build seeded HuggingFace encoder stacks keyed by model family.

    ``params_dict`` is deep-copied before its ``"attention_dropout"`` and
    ``"hidden_dropout"`` entries are renamed to the keywords the config
    classes are called with, so the caller's dictionary is left untouched.

    Returns:
        dict with the keys ``"bert"``, ``"roberta"`` and ``"electra"``, each
        mapped to a newly constructed encoder.
    """
    modules = {}

    hf_kwargs = copy.deepcopy(params_dict)
    renames = (
        ("attention_probs_dropout_prob", "attention_dropout"),
        ("hidden_dropout_prob", "hidden_dropout"),
    )
    for hf_key, generic_key in renames:
        hf_kwargs[hf_key] = hf_kwargs.pop(generic_key)

    families = (
        ("bert", BertEncoder, BertConfig),
        ("roberta", RobertaEncoder, RobertaConfig),
        ("electra", ElectraEncoder, ElectraConfig),
    )
    for family, encoder_cls, config_cls in families:
        # Re-seed before every construction so each encoder starts from the
        # same RNG state.
        torch.manual_seed(1234)
        modules[family] = encoder_cls(config_cls(**hf_kwargs))

    return modules
Exemplo n.º 10
0
    def __init__(
        self,
        vocab: Vocabulary,
        bert_model: Union[str, Dict[str, Any], BertModel],
        embedding_dropout: float = 0.0,
        initializer: InitializerApplicator = InitializerApplicator(),
        label_smoothing: float = None,
        ignore_span_metric: bool = False,
        srl_eval_path: str = DEFAULT_SRL_EVAL_PATH,
        **kwargs,
    ) -> None:
        """Set up the BERT backbone, tag projection and span metric.

        Args:
            vocab: Forwarded to the parent constructor; its ``"labels"``
                namespace size becomes ``self.num_classes``.
            bert_model: A name/path string loaded with
                ``BertModel.from_pretrained``, a config dictionary used to
                build an untrained ``BertModel`` (a ``UserWarning`` is
                issued), or any other object, which is stored as is.
            embedding_dropout: Probability given to the ``Dropout`` stored as
                ``self.embedding_dropout``.
            initializer: Called with ``self`` as the last construction step.
            label_smoothing: Stored as ``self._label_smoothing``.
            ignore_span_metric: Stored as ``self.ignore_span_metric``.
            srl_eval_path: When not ``None``, a ``SrlEvalScorer`` is created
                from it; otherwise ``self.span_metric`` is ``None``.
            **kwargs: Forwarded to the parent constructor.
        """
        super().__init__(vocab, **kwargs)

        if isinstance(bert_model, str):
            backbone = BertModel.from_pretrained(bert_model)
        elif isinstance(bert_model, dict):
            warnings.warn(
                "Initializing BertModel without pretrained weights. This is fine if you're loading "
                "from an AllenNLP archive, but not if you're training.",
                UserWarning,
            )
            backbone = BertModel(BertConfig.from_dict(bert_model))
        else:
            backbone = bert_model
        self.bert_model = backbone

        self.num_classes = self.vocab.get_vocab_size("labels")

        # For the span based evaluation, we don't want to consider labels
        # for verb, because the verb index is provided to the model.
        self.span_metric = (
            SrlEvalScorer(srl_eval_path, ignore_classes=["V"])
            if srl_eval_path is not None else None)

        hidden_size = self.bert_model.config.hidden_size
        self.tag_projection_layer = Linear(hidden_size, self.num_classes)

        self.embedding_dropout = Dropout(p=embedding_dropout)
        self._label_smoothing = label_smoothing
        self.ignore_span_metric = ignore_span_metric
        initializer(self)
Exemplo n.º 11
0
def get_modules(params_dict):
    """Build seeded BERT and ALBERT embedding modules keyed by model family.

    Each family gets its own deep copy of ``params_dict``, so the caller's
    dictionary is left untouched.  For BERT the ``"dropout"`` and
    ``"embedding_size"`` entries are renamed; for ALBERT only ``"dropout"``
    is renamed and ``"embedding_size"`` is passed through under its original
    name.

    Returns:
        dict with the keys ``"bert"`` and ``"albert"``, each mapped to a newly
        constructed embeddings module.
    """
    modules = {}

    bert_kwargs = copy.deepcopy(params_dict)
    bert_renames = (
        ("hidden_dropout_prob", "dropout"),
        ("hidden_size", "embedding_size"),
    )
    for hf_key, generic_key in bert_renames:
        bert_kwargs[hf_key] = bert_kwargs.pop(generic_key)

    # Re-seed before each construction so both modules start from the same
    # RNG state.
    torch.manual_seed(1234)
    modules["bert"] = BertEmbeddings(BertConfig(**bert_kwargs))

    albert_kwargs = copy.deepcopy(params_dict)
    albert_kwargs["hidden_dropout_prob"] = albert_kwargs.pop("dropout")

    torch.manual_seed(1234)
    modules["albert"] = AlbertEmbeddings(AlbertConfig(**albert_kwargs))

    return modules
Exemplo n.º 12
0
def get_layer_modules(params_dict):
    """Build seeded HuggingFace transformer layers keyed by model family.

    ``params_dict`` is deep-copied before its ``"attention_dropout"`` and
    ``"hidden_dropout"`` entries are renamed to the keywords the config
    classes are called with, so the caller's dictionary is left untouched.

    Returns:
        dict with the keys ``"bert"``, ``"roberta"`` and ``"electra"``, each
        mapped to a newly constructed layer.
    """
    modules = {}

    hf_kwargs = copy.deepcopy(params_dict)
    renames = (
        ("attention_probs_dropout_prob", "attention_dropout"),
        ("hidden_dropout_prob", "hidden_dropout"),
    )
    for hf_key, generic_key in renames:
        hf_kwargs[hf_key] = hf_kwargs.pop(generic_key)

    # bert, roberta and electra layers are built from the same keyword set.
    families = (
        ("bert", BertLayer, BertConfig),
        ("roberta", RobertaLayer, RobertaConfig),
        ("electra", ElectraLayer, ElectraConfig),
    )
    for family, layer_cls, config_cls in families:
        # Re-seed before every construction so each layer starts from the
        # same RNG state.
        torch.manual_seed(1234)
        modules[family] = layer_cls(config_cls(**hf_kwargs))

    return modules
Exemplo n.º 13
0
    def __init__(
        self,
        lr: float = 1.,  # see also lr scheduler
        noam_opt_warmup_steps: int = 4000,
        scheduler: str = "noam",
        scheduler_patience: int = 10,
        noam_step_factor: float = 1.,
        noam_scaler: float = 1.,
        emb_norm_reg=0.001,
        tokenizer: Tokenizer = None,
        **kwargs,
    ):
        """Create a ``BertForMaskedLM`` from a hard-coded BERT configuration.

        Only ``tokenizer`` is read by name in this constructor body; it is
        stored as ``self.tokenizer``.  The remaining parameters (``lr``, the
        ``noam_*`` values, ``scheduler``, ``scheduler_patience``,
        ``emb_norm_reg`` and ``**kwargs``) are not referenced directly here.
        Presumably they are captured by ``save_hyperparameters`` -- TODO
        confirm against ``all_hyperparameters_list``, which is defined outside
        this method.
        """

        # NOTE(review): ``save_hyperparameters`` below may inspect this
        # frame to find the constructor arguments, so the form of this
        # ``super(...)`` call and the parameter names could matter -- confirm
        # against the installed Lightning version before changing either.
        super(BertLightningModule, self).__init__()

        # Fixed model configuration; none of the constructor arguments
        # influence these values.
        devbert_config = BertConfig.from_dict({
            "attention_probs_dropout_prob": 0.05,
            "hidden_act": "gelu",
            "hidden_dropout_prob": 0.05,
            "hidden_size": 256,
            "initializer_range": 0.02,
            "intermediate_size": 1024,
            "layer_norm_eps": 1e-12,
            "max_position_embeddings": 512,
            "model_type": "bert",
            "num_attention_heads": 8,
            "num_hidden_layers": 12,
            "pad_token_id": 0,
            "type_vocab_size": 2,  # todo increase type vocab size
            "vocab_size": 30000,
        })
        self.devbert_config = devbert_config

        # The names to record come from ``self.all_hyperparameters_list``.
        self.save_hyperparameters(*self.all_hyperparameters_list)

        self.bertmodel = BertForMaskedLM(config=devbert_config)

        self.tokenizer: Tokenizer = tokenizer

        return
Exemplo n.º 14
0
def main():
    """Train and/or evaluate a single-layer BERT masked language model.

    Arguments are parsed into ``ModelArguments``, ``DataTrainingArguments``
    and ``TrainingArguments``.  They come from the command line, or from one
    JSON file when the only argument is a path ending in ``.json``.

    The dataset is loaded by name, or from the given train/validation files
    (``.fasta`` files go through ``load_dataset_fasta_protbert``).  Every
    split is tokenized from its ``'sequences'`` column and wrapped in a
    ``FastaDataset``.  The model is trained and saved when ``do_train`` is
    set, and evaluated when ``do_eval`` is set.

    Returns:
        dict: ``{"perplexity": <float>}`` when evaluation ran, otherwise an
        empty dict.

    Raises:
        ValueError: If the output directory already holds files and may not
            be overwritten, if neither a dataset name nor any data file was
            given, or if neither a tokenizer name nor a model path was given.
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = (
            parser.parse_args_into_dataclasses())

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome.")

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {training_args.local_rank != -1}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: either a named dataset, or local training and
    # evaluation files whose type is derived from the file extension.
    if data_args.dataset_name is not None:
        # Loading a dataset by name.
        datasets = load_dataset(data_args.dataset_name,
                                data_args.dataset_config_name)
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        if not data_files:
            raise ValueError(
                "Need either a dataset name or a training/validation file.")

        # The file type comes from the training file when there is one and
        # from the validation file otherwise, so an evaluation-only run does
        # not fail on a missing training file.
        reference_file = (data_args.train_file
                          if data_args.train_file is not None else
                          data_args.validation_file)
        extension = reference_file.split(".")[-1]
        if extension == "fasta":
            datasets = load_dataset_fasta_protbert(data_files,
                                                   data_args.max_seq_length)
        else:
            if extension == "txt":
                extension = "text"
            datasets = load_dataset(extension, data_files=data_files)

    # Load the tokenizer: an explicit tokenizer name wins over the model path.
    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name,
            cache_dir=model_args.cache_dir,
            use_fast=model_args.use_fast_tokenizer)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path,
            cache_dir=model_args.cache_dir,
            use_fast=model_args.use_fast_tokenizer,
            max_length=data_args.max_seq_length)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    # The config is always built here rather than loaded from a checkpoint.
    # Only the vocabulary size, the padding id and the layer count are set;
    # every other hyperparameter keeps the BertConfig default.
    config = BertConfig(
        vocab_size=tokenizer.vocab_size,
        num_hidden_layers=1,
        pad_token_id=tokenizer.pad_token_id,
    )
    logger.warning("You are instantiating a new config instance from scratch.")

    if model_args.model_name_or_path:
        # NOTE(review): the single-layer config above is also passed when
        # loading a checkpoint -- confirm that this is intended.
        model = AutoModelForMaskedLM.from_pretrained(
            model_args.model_name_or_path,
            from_tf=".ckpt" in model_args.model_name_or_path,
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForMaskedLM.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets: tokenize each split and wrap the encodings
    # in a FastaDataset.
    # NOTE(review): every split is read through its 'sequences' column,
    # including splits not loaded from FASTA files -- confirm those provide
    # that column.
    tokenized_datasets = {}
    for dataset_key, dataset in datasets.items():
        encodings = tokenizer(
            dataset['sequences'],
            truncation=True,
            padding='max_length',  # TODO get from args passed in
            max_length=data_args.max_seq_length,
            return_special_tokens_mask=True,
            return_token_type_ids=False,
            return_attention_mask=False)
        tokenized_datasets[dataset_key] = FastaDataset(encodings)

    # Data collator
    # This one will take care of randomly masking the tokens.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=True,
        mlm_probability=data_args.mlm_probability)

    print(model)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"]
        if training_args.do_train else None,
        eval_dataset=tokenized_datasets["validation"]
        if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        # A model path is only passed on when it names a local directory.
        model_path = (model_args.model_name_or_path if
                      (model_args.model_name_or_path is not None
                       and os.path.isdir(model_args.model_name_or_path)) else
                      None)
        trainer.train(model_path=model_path)
        trainer.save_model()  # Saves the tokenizer too for easy upload

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        # Perplexity is the exponential of the evaluation loss.
        perplexity = math.exp(eval_output["eval_loss"])
        results["perplexity"] = perplexity

        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results_mlm.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w", encoding="utf-8") as writer:
                logger.info("***** Eval results *****")
                for key, value in results.items():
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

    return results