def get_modules(params_dict): modules = {} params = copy.deepcopy(params_dict) params["attention_probs_dropout_prob"] = params.pop("dropout") # bert, roberta, electra self attentions have the same code. torch.manual_seed(1234) hf_module = BertSelfAttention(BertConfig(**params)) modules["bert"] = hf_module torch.manual_seed(1234) hf_module = RobertaSelfAttention(RobertaConfig(**params)) modules["roberta"] = hf_module torch.manual_seed(1234) hf_module = ElectraSelfAttention(ElectraConfig(**params)) modules["electra"] = hf_module torch.manual_seed(1234) distilparams = copy.deepcopy(params_dict) distilparams["n_heads"] = distilparams.pop("num_attention_heads") distilparams["dim"] = distilparams.pop("hidden_size") distilparams["attention_dropout"] = distilparams.pop("dropout") hf_module = MultiHeadSelfAttention(DistilBertConfig(**distilparams)) modules["distilbert"] = hf_module return modules
def test_replace_embeddings(): """ Replace a model with a TFBertEmbeddings layer, which although it has extra arguments in init, it doesn't have overwritten the get_config() method. :return: None """ batch_size = 2 sequence_length = 128 config = BertConfig() def copy_weights_tf_bert_embeddings(layer, new_layer): new_layer.build((batch_size, sequence_length, new_layer.hidden_size)) new_layer.set_weights(layer.get_weights()) set_random_seeds() to_replace_dict = { TFBertEmbeddings: { "new_class": IpuTFBertEmbeddings, "new_params": { "config": config, "serialization_factor": 2 }, "copy_weights": True, "copy_weights_func": copy_weights_tf_bert_embeddings } } model_func_args = {"len_seq": sequence_length, "config": config} cfg = ipu.config.IPUConfig() cfg.device_connection.type = ipu.config.DeviceConnectionType.ON_DEMAND cfg.configure_ipu_system() strategy = ipu.ipu_strategy.IPUStrategy() with strategy.scope(): check_replace_layers(to_replace_dict, func_model_tf_bert_embeddings, model_func_args, batch_size)
def __init__(self, args, tokenizer: BertTokenizer, object_features_variant=False,
             positional_embed_variant=False, latent_transformer=False):
    super().__init__()
    self.args = args
    self.tokenizer = tokenizer

    # Project 512-dim image features into the 768-dim BERT hidden space.
    self.image_projection = nn.Sequential(
        nn.Linear(512, 768),
        nn.BatchNorm1d(768, momentum=0.01))

    config = BertConfig.from_pretrained('bert-base-uncased')
    self.embeddings = BertEmbeddings(config)
    self.text_encoder = BertModel.from_pretrained("bert-base-uncased", return_dict=True)
    self.decoder = BertLMHeadModel.from_pretrained(
        'bert-base-uncased',
        is_decoder=True,
        use_cache=True,
        add_cross_attention=True)

    if object_features_variant:
        self.image_transformer = ImageTransformerEncoder(args)

    self.positional_embed = bool(positional_embed_variant)
    self.latent_transformer = latent_transformer
def __init__(self, add_pooling_layer=True):
    config = BertConfig()
    super().__init__(config)
    self.config = config

    self.embeddings = BertEmbeddings(config)
    self.encoder = BertEncoder(config)
    self.pooler = BertPooler(config) if add_pooling_layer else None

    self.init_weights()
def get_attention_modules():
    params = copy.deepcopy(ATTENTION_PARAMS_DICT)
    params["attention_probs_dropout_prob"] = params.pop("attention_dropout")
    params["hidden_dropout_prob"] = params.pop("hidden_dropout")

    torch.manual_seed(1234)
    yield "bert", BertAttention(BertConfig(**params)).eval()

    torch.manual_seed(1234)
    yield "roberta", RobertaAttention(RobertaConfig(**params)).eval()

    torch.manual_seed(1234)
    yield "electra", ElectraAttention(ElectraConfig(**params)).eval()
def get_layer_modules():
    params = copy.deepcopy(LAYER_PARAMS_DICT)
    params["attention_probs_dropout_prob"] = params.pop("attention_dropout")
    params["hidden_dropout_prob"] = params.pop("hidden_dropout")
    params["hidden_act"] = params.pop("activation")

    torch.manual_seed(1234)
    yield "bert", BertLayer(BertConfig(**params)).eval()

    torch.manual_seed(1234)
    yield "roberta", RobertaLayer(RobertaConfig(**params)).eval()

    torch.manual_seed(1234)
    yield "electra", ElectraLayer(ElectraConfig(**params)).eval()
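# A sketch of how the (name, module) pairs yielded above could be consumed (an assumption,
# not taken from the original source): pytest.mark.parametrize turns each Hugging Face layer
# variant into its own test case. LAYER_PARAMS_DICT is assumed to be defined in the original
# module, and test_layer_matches_reference is a hypothetical test name.
import pytest
import torch


@pytest.mark.parametrize("name,hf_module", get_layer_modules())
def test_layer_matches_reference(name, hf_module):
    # BertOutput.dense maps intermediate_size -> hidden_size, so out_features is the hidden size.
    hidden_size = hf_module.output.dense.out_features
    hidden_states = torch.rand(2, 8, hidden_size)
    with torch.no_grad():
        reference_out = hf_module(hidden_states)[0]
    assert reference_out.shape == hidden_states.shape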
def get_modules():
    params = copy.deepcopy(PARAMS_DICT)
    params["hidden_dropout_prob"] = params.pop("dropout")
    params["hidden_size"] = params.pop("embedding_size")

    # bert and albert embeddings are built from the same parameter dict.
    torch.manual_seed(1234)
    yield "bert", BertEmbeddings(BertConfig(**params))

    albertparams = copy.deepcopy(PARAMS_DICT)
    albertparams["hidden_dropout_prob"] = albertparams.pop("dropout")

    torch.manual_seed(1234)
    yield "albert", AlbertEmbeddings(AlbertConfig(**albertparams))
def test_replace_lm_prediction_head(): """ Replace a model with a TFBertLMPredictionHead layer. :return: None """ batch_size = 2 sequence_length = 128 config = BertConfig() cfg = ipu.config.IPUConfig() cfg.device_connection.type = ipu.config.DeviceConnectionType.ON_DEMAND cfg.configure_ipu_system() strategy = ipu.ipu_strategy.IPUStrategy() with strategy.scope(): model = get_bert_lm_prediction_head_model(config, TFBertLMPredictionHead) def copy_weights_tf_bert_lm_prediction_head(layer, new_layer): new_layer.build( (batch_size, sequence_length, new_layer.hidden_size)) new_layer.set_weights(layer.get_weights()) set_random_seeds() to_replace_dict = { TFBertLMPredictionHead: { "new_class": IpuTFBertLMPredictionHead, "new_params": { "config": config, "input_embeddings": lambda: model.embedding, "serialization_factor": 2, }, "copy_weights": True, "copy_weights_func": copy_weights_tf_bert_lm_prediction_head }, } model_func_args = { "len_seq": sequence_length, "config": config, "model": model } check_replace_layers(to_replace_dict, func_model_tf_bert_lm_prediction_head, model_func_args, batch_size)
def get_modules(params_dict): modules = {} params = copy.deepcopy(params_dict) params["attention_probs_dropout_prob"] = params.pop("attention_dropout") params["hidden_dropout_prob"] = params.pop("hidden_dropout") torch.manual_seed(1234) hf_module = BertEncoder(BertConfig(**params)) modules["bert"] = hf_module torch.manual_seed(1234) hf_module = RobertaEncoder(RobertaConfig(**params)) modules["roberta"] = hf_module torch.manual_seed(1234) hf_module = ElectraEncoder(ElectraConfig(**params)) modules["electra"] = hf_module return modules
def __init__(
    self,
    vocab: Vocabulary,
    bert_model: Union[str, Dict[str, Any], BertModel],
    embedding_dropout: float = 0.0,
    initializer: InitializerApplicator = InitializerApplicator(),
    label_smoothing: float = None,
    ignore_span_metric: bool = False,
    srl_eval_path: str = DEFAULT_SRL_EVAL_PATH,
    **kwargs,
) -> None:
    super().__init__(vocab, **kwargs)

    if isinstance(bert_model, str):
        self.bert_model = BertModel.from_pretrained(bert_model)
    elif isinstance(bert_model, dict):
        warnings.warn(
            "Initializing BertModel without pretrained weights. This is fine if you're loading "
            "from an AllenNLP archive, but not if you're training.",
            UserWarning,
        )
        bert_config = BertConfig.from_dict(bert_model)
        self.bert_model = BertModel(bert_config)
    else:
        self.bert_model = bert_model

    self.num_classes = self.vocab.get_vocab_size("labels")
    if srl_eval_path is not None:
        # For the span-based evaluation, we don't want to consider labels
        # for the verb, because the verb index is provided to the model.
        self.span_metric = SrlEvalScorer(srl_eval_path, ignore_classes=["V"])
    else:
        self.span_metric = None

    self.tag_projection_layer = Linear(self.bert_model.config.hidden_size, self.num_classes)
    self.embedding_dropout = Dropout(p=embedding_dropout)
    self._label_smoothing = label_smoothing
    self.ignore_span_metric = ignore_span_metric
    initializer(self)
def get_modules(params_dict): modules = {} params = copy.deepcopy(params_dict) params["hidden_dropout_prob"] = params.pop("dropout") params["hidden_size"] = params.pop("embedding_size") # bert, roberta, electra self attentions have the same code. torch.manual_seed(1234) hf_module = BertEmbeddings(BertConfig(**params)) modules["bert"] = hf_module albertparams = copy.deepcopy(params_dict) albertparams["hidden_dropout_prob"] = albertparams.pop("dropout") torch.manual_seed(1234) hf_module = AlbertEmbeddings(AlbertConfig(**albertparams)) modules["albert"] = hf_module return modules
def get_layer_modules(params_dict):
    modules = {}
    params = copy.deepcopy(params_dict)
    params["attention_probs_dropout_prob"] = params.pop("attention_dropout")
    params["hidden_dropout_prob"] = params.pop("hidden_dropout")

    # bert, roberta, electra and layoutlm layers have the same code.
    torch.manual_seed(1234)
    hf_module = BertLayer(BertConfig(**params))
    modules["bert"] = hf_module

    torch.manual_seed(1234)
    hf_module = RobertaLayer(RobertaConfig(**params))
    modules["roberta"] = hf_module

    torch.manual_seed(1234)
    hf_module = ElectraLayer(ElectraConfig(**params))
    modules["electra"] = hf_module

    return modules
def __init__(
    self,
    lr: float = 1.,  # see also lr scheduler
    noam_opt_warmup_steps: int = 4000,
    scheduler: str = "noam",
    scheduler_patience: int = 10,
    noam_step_factor: float = 1.,
    noam_scaler: float = 1.,
    emb_norm_reg=0.001,
    tokenizer: Tokenizer = None,
    **kwargs,
):
    super(BertLightningModule, self).__init__()

    devbert_config = BertConfig.from_dict({
        "attention_probs_dropout_prob": 0.05,
        "hidden_act": "gelu",
        "hidden_dropout_prob": 0.05,
        "hidden_size": 256,
        "initializer_range": 0.02,
        "intermediate_size": 1024,
        "layer_norm_eps": 1e-12,
        "max_position_embeddings": 512,
        "model_type": "bert",
        "num_attention_heads": 8,
        "num_hidden_layers": 12,
        "pad_token_id": 0,
        "type_vocab_size": 2,  # todo: increase type vocab size
        "vocab_size": 30000,
    })
    self.devbert_config = devbert_config

    self.save_hyperparameters(*self.all_hyperparameters_list)
    self.bertmodel = BertForMaskedLM(config=devbert_config)
    self.tokenizer: Tokenizer = tokenizer
def main():
    FASTA_DATASET = False

    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir)
            and training_args.do_train and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome.")

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log a small summary on each process:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, "
        f"n_gpu: {training_args.n_gpu}, distributed training: {bool(training_args.local_rank != -1)}, "
        f"16-bits training: {training_args.fp16}")

    # Set the verbosity of the Transformers logger to info (on the main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation
    # files (see below) or just provide the name of one of the public datasets available on the
    # hub at https://huggingface.co/datasets/ (the dataset will be downloaded automatically from
    # the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column.
    # You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process
    # can concurrently download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name)
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        extension = data_args.train_file.split(".")[-1]

        if extension == "fasta":
            FASTA_DATASET = True
            datasets = load_dataset_fasta_protbert(data_files, data_args.max_seq_length)
        else:
            if extension == "txt":
                extension = "text"
            datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset (from files, python dict,
    # pandas DataFrame, etc) at https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name,
            cache_dir=model_args.cache_dir,
            use_fast=model_args.use_fast_tokenizer)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path,
            cache_dir=model_args.cache_dir,
            use_fast=model_args.use_fast_tokenizer,
            max_length=data_args.max_seq_length)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
            "You can do it from another script, save it, and load it from here, using --tokenizer_name.")

    # config = CONFIG_MAPPING[model_args.model_type]()
    config = BertConfig(
        vocab_size=tokenizer.vocab_size,
        # hidden_size=768,
        num_hidden_layers=1,
        # intermediate_size=3072,
        # hidden_act='gelu',
        # num_attention_heads=12,
        # hidden_dropout_prob=0.1,
        # attention_probs_dropout_prob=0.1,
        # max_position_embeddings=512,
        # type_vocab_size=2,
        # initializer_range=0.02,
        # layer_norm_eps=1e-12,
        pad_token_id=tokenizer.pad_token_id,
        # gradient_checkpointing=False
    )
    logger.warning("You are instantiating a new config instance from scratch.")

    if model_args.model_name_or_path:
        model = AutoModelForMaskedLM.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForMaskedLM.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    tokenized_datasets = dict()
    for dataset_key, dataset in datasets.items():
        # Tokenize
        encodings = tokenizer(
            dataset['sequences'],
            truncation=True,
            padding='max_length',  # TODO get from args passed in
            max_length=data_args.max_seq_length,
            return_special_tokens_mask=True,
            return_token_type_ids=False,
            return_attention_mask=False)
        # Formal torch dataset objects
        torch_dataset = FastaDataset(encodings)
        tokenized_datasets[dataset_key] = torch_dataset

    # Data collator: this one will take care of randomly masking the tokens.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=True,
        mlm_probability=data_args.mlm_probability)

    print(model)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"] if training_args.do_train else None,
        eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        model_path = (model_args.model_name_or_path
                      if (model_args.model_name_or_path is not None
                          and os.path.isdir(model_args.model_name_or_path))
                      else None)
        trainer.train(model_path=model_path)
        trainer.save_model()  # Saves the tokenizer too for easy upload

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        results["perplexity"] = perplexity

        output_eval_file = os.path.join(training_args.output_dir, "eval_results_mlm.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in results.items():
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

    return results