def __init__(self, args, config, dataloader, ckpdir):
    self.device = torch.device('cuda') if (
        args.gpu and torch.cuda.is_available()) else torch.device('cpu')
    if torch.cuda.is_available():
        print('[Runner] - CUDA is available!')
    self.model_kept = []
    self.global_step = 1
    self.log = SummaryWriter(ckpdir)

    self.args = args
    self.config = config
    self.dataloader = dataloader
    self.ckpdir = ckpdir

    # optimizer
    self.learning_rate = float(config['optimizer']['learning_rate'])
    self.warmup_proportion = config['optimizer']['warmup_proportion']
    self.gradient_accumulation_steps = config['optimizer']['gradient_accumulation_steps']
    self.gradient_clipping = config['optimizer']['gradient_clipping']

    # Training details
    self.apex = config['runner']['apex']
    self.total_steps = config['runner']['total_steps']
    self.log_step = config['runner']['log_step']
    self.save_step = config['runner']['save_step']
    self.duo_feature = config['runner']['duo_feature']
    self.max_keep = config['runner']['max_keep']

    # Model configs
    self.semantic_config = RobertaConfig(**config['semantic'])
    self.acoustic_config = RobertaConfig(**config['acoustic'])
def __init__(self, config, num=0):
    super(roBerta, self).__init__()
    model_config = RobertaConfig()
    model_config.vocab_size = config.vocab_size
    model_config.hidden_size = config.hidden_size[0]
    model_config.num_attention_heads = 16
    # method used to compute the loss
    self.loss_method = config.loss_method
    self.multi_drop = config.multi_drop

    self.roberta = RobertaModel(model_config)
    if config.requires_grad:
        for param in self.roberta.parameters():
            param.requires_grad = True
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.hidden_size = config.hidden_size[num]

    if self.loss_method in ['binary', 'focal_loss', 'ghmc']:
        self.classifier = nn.Linear(self.hidden_size, 1)
    else:
        self.classifier = nn.Linear(self.hidden_size, self.num_labels)

    self.text_linear = nn.Linear(config.embeding_size, config.hidden_size[0])
    self.vocab_layer = nn.Linear(config.hidden_size[0], config.vocab_size)

    self.classifier.apply(self._init_weights)
    self.roberta.apply(self._init_weights)
    self.text_linear.apply(self._init_weights)
    self.vocab_layer.apply(self._init_weights)
def roberta_build(self, sparse=False, base_model=None, density=1.0, eval=True):
    if base_model is None:
        config = RobertaConfig(
            vocab_size=52_000,
            max_position_embeddings=514,
            num_attention_heads=12,
            num_hidden_layers=6,
            type_vocab_size=1,
        )
        model = RobertaForMaskedLM(config=config).cuda()
    else:
        model = base_model
    if sparse:
        mp = BlockSparseModelPatcher()
        mp.add_pattern(r"roberta\.encoder\.layer\.[0-9]+.intermediate\.dense",
                       {"density": density})
        mp.add_pattern(r"roberta\.encoder\.layer\.[0-9]+.output\.dense",
                       {"density": density})
        mp.patch_model(model)
    if eval:
        model.eval()
    return model, model.num_parameters()
def __init__(self, data_dir: Path, tokenizer: PreTrainedTokenizer, dataset: Dataset, local_rank=-1):
    assert data_dir, "data_dir input needed"
    self.model_dir = f"{data_dir}/results"
    self.dataset = dataset
    self.config = RobertaConfig(
        vocab_size=52_000,
        max_position_embeddings=514,
        num_attention_heads=12,
        num_hidden_layers=6,
        type_vocab_size=1,
    )
    self.training_args = TrainingArguments(
        run_name=data_dir.name,
        local_rank=local_rank,
        learning_rate=0.00005,  # default 0.00005
        output_dir=f"{self.model_dir}",
        overwrite_output_dir=False,
        num_train_epochs=1,
        per_device_train_batch_size=48,  # Nvidia K80 99%
        seed=42,
        save_steps=10_000,
        save_total_limit=1,
    )
    self.data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
def __init__(self, cfg, device):
    super().__init__()
    tokenizer = RobertaTokenizerFast.from_pretrained('./bird_bpe_vocab', max_len=256)
    _config = RobertaConfig(
        vocab_size=tokenizer._tokenizer.get_vocab_size(),
        hidden_size=512,
        num_hidden_layers=4,
        num_attention_heads=8,
        max_position_embeddings=256,
        pad_token_id=1,
        eos_token_id=0,
        bos_token_id=2,
        output_attentions=False,
        output_hidden_states=False
    )
    _model = RobertaForMaskedLM(_config)
    _model.load_state_dict(torch.load('bert_small/checkpoint-1100/pytorch_model.bin'))
    _model.eval()

    self.tokenizer = tokenizer
    self._model = _model
    self.device = device
    self.pad_token = 0
    self.batch_size = cfg.batch_size
    self.proj = None
    if cfg.proj_lang:
        self.proj = nn.Sequential(*[EqualisedLinearLayer(512, cfg.latent_dim,
                                                         weight_scaling=cfg.weight_scaling),
                                    nn.Tanh()])
def create_roberta_model(tokens_train, attn_mask_train, num_classes):
    config = RobertaConfig(vocab_size=50021,
                           hidden_size=1024,
                           num_hidden_layers=16,
                           num_attention_heads=16,
                           intermediate_size=2048,
                           attention_probs_dropout_prob=0.3,
                           hidden_dropout_prob=0.3)
    bert = TFRobertaModel(config)

    # dense1 = Dense(500, activation='relu')
    dense2 = Dense(368, activation='relu')
    dense3 = Dense(num_classes, activation='softmax')
    dropout = Dropout(0.3)

    tokens = Input(shape=(tokens_train.shape[1],), dtype=tf.int32)
    attn_mask = Input(shape=(attn_mask_train.shape[1],), dtype=tf.int32)

    pooled_output = bert(tokens, attn_mask).pooler_output
    med = dropout(dense2(pooled_output))
    # feed the dense+dropout branch into the classifier
    # (originally dense3(pooled_output), which bypassed dense2 and the dropout)
    final = dense3(med)
    model = Model(inputs=[tokens, attn_mask], outputs=final)
    return model
def prepare_config_and_inputs(self):
    input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

    attention_mask = None
    if self.use_attention_mask:
        attention_mask = random_attention_mask([self.batch_size, self.seq_length])

    token_type_ids = None
    if self.use_token_type_ids:
        token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

    config = RobertaConfig(
        vocab_size=self.vocab_size,
        hidden_size=self.hidden_size,
        num_hidden_layers=self.num_hidden_layers,
        num_attention_heads=self.num_attention_heads,
        intermediate_size=self.intermediate_size,
        hidden_act=self.hidden_act,
        hidden_dropout_prob=self.hidden_dropout_prob,
        attention_probs_dropout_prob=self.attention_probs_dropout_prob,
        max_position_embeddings=self.max_position_embeddings,
        type_vocab_size=self.type_vocab_size,
        is_decoder=False,
        initializer_range=self.initializer_range,
    )

    return config, input_ids, token_type_ids, attention_mask
def prepare_config_and_inputs(self):
    input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

    input_mask = None
    if self.use_input_mask:
        input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

    token_type_ids = None
    if self.use_token_type_ids:
        token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

    sequence_labels = None
    token_labels = None
    choice_labels = None
    if self.use_labels:
        sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
        token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
        choice_labels = ids_tensor([self.batch_size], self.num_choices)

    config = RobertaConfig(
        vocab_size=self.vocab_size,
        hidden_size=self.hidden_size,
        num_hidden_layers=self.num_hidden_layers,
        num_attention_heads=self.num_attention_heads,
        intermediate_size=self.intermediate_size,
        hidden_act=self.hidden_act,
        hidden_dropout_prob=self.hidden_dropout_prob,
        attention_probs_dropout_prob=self.attention_probs_dropout_prob,
        max_position_embeddings=self.max_position_embeddings,
        type_vocab_size=self.type_vocab_size,
        initializer_range=self.initializer_range,
    )

    return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
def __init__(
    self,
    pretrained_model_name=None,
    config_filename=None,
    vocab_size=None,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    hidden_act="gelu",
    max_position_embeddings=512,
):
    super().__init__()

    # Check that only one of pretrained_model_name, config_filename, and
    # vocab_size was passed in
    total = 0
    if pretrained_model_name is not None:
        total += 1
    if config_filename is not None:
        total += 1
    if vocab_size is not None:
        total += 1
    if total != 1:
        raise ValueError(
            "Only one of pretrained_model_name, vocab_size, "
            + "or config_filename should be passed into the "
            + "ROBERTA constructor."
        )

    # TK: The following code checks the same once again.
    if vocab_size is not None:
        config = RobertaConfig(
            vocab_size_or_config_json_file=vocab_size,
            vocab_size=vocab_size,
            hidden_size=hidden_size,
            num_hidden_layers=num_hidden_layers,
            num_attention_heads=num_attention_heads,
            intermediate_size=intermediate_size,
            hidden_act=hidden_act,
            max_position_embeddings=max_position_embeddings,
        )
        model = RobertaModel(config)
    elif pretrained_model_name is not None:
        model = RobertaModel.from_pretrained(pretrained_model_name)
    elif config_filename is not None:
        config = RobertaConfig.from_json_file(config_filename)
        model = RobertaModel(config)
    else:
        raise ValueError(
            "Either pretrained_model_name or vocab_size must"
            + " be passed into the ROBERTA constructor"
        )

    model.to(self._device)

    self.add_module("roberta", model)
    self.config = model.config
    self._hidden_size = model.config.hidden_size
def __init__(self, dropout):
    super(ROBERTAModel, self).__init__()
    self.roberta = RobertaModel.from_pretrained(
        config.PATHS['roberta'], config=RobertaConfig())
    self.fc = nn.Linear(768, 2)
    self.dropout = nn.Dropout(dropout)
def train(no_cache: bool, dataset_path: str, data_config_name: str,
          training_args: TrainingArguments, tokenizer: RobertaTokenizerFast):
    print(f"tokenizer vocab size: {tokenizer.vocab_size}")

    print(f"\nLoading datasets found in {dataset_path}.")
    train_dataset, eval_dataset, test_dataset = load_dataset(
        'EMBO/biolang', data_config_name,
        data_dir=dataset_path,
        split=["train", "validation", "test"],
        # download_mode=GenerateMode.FORCE_REDOWNLOAD if no_cache else GenerateMode.REUSE_DATASET_IF_EXISTS,
        cache_dir=CACHE)

    if data_config_name != "MLM":
        data_collator = DataCollatorForTargetedMasking(
            tokenizer=tokenizer, max_length=config.max_length)
    else:
        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True)

    print(f"\nTraining with {len(train_dataset)} examples.")
    print(f"Evaluating on {len(eval_dataset)} examples.")

    if config.from_pretrained:
        model = RobertaForMaskedLM.from_pretrained(config.from_pretrained)
    else:
        model_config = RobertaConfig(
            vocab_size=config.vocab_size,
            max_position_embeddings=config.max_length + 2,  # max_length + 2 for start/end token
            num_attention_heads=12,
            num_hidden_layers=6,
            type_vocab_size=1,
        )
        model = RobertaForMaskedLM(config=model_config)

    training_args.remove_unused_columns = False  # we need pos_mask and special_tokens_mask in collator

    print("\nTraining arguments:")
    print(training_args)

    trainer = MyTrainer(model=model,
                        args=training_args,
                        data_collator=data_collator,
                        train_dataset=train_dataset,
                        eval_dataset=eval_dataset,
                        compute_metrics=compute_metrics,
                        callbacks=[ShowExample(tokenizer)])

    print(f"CUDA available: {torch.cuda.is_available()}")

    trainer.train()
    trainer.save_model(training_args.output_dir)

    print(f"Testing on {len(test_dataset)}.")
    pred: NamedTuple = trainer.predict(test_dataset, metric_key_prefix='test')
    print(f"{pred.metrics}")
def create_model():
    config = RobertaConfig(
        vocab_size=3437,
        max_position_embeddings=64,
        num_attention_heads=12,
        num_hidden_layers=8,
        type_vocab_size=1,
    )
    return RobertaForMaskedLM(config=config)
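# Minimal usage sketch for the factory above (hedged: this call and the printed
# parameter count are illustrative only, not part of the original example).
model = create_model()
print(f"RobertaForMaskedLM parameters: {model.num_parameters():,}")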
def __init__(self, classifier_config_dir, device, task_type, n_clf_layers=6,
             use_dm=True, use_pm=True, use_rt=True,
             use_bio=False, use_name=False, use_network=False, use_count=False):
    super(ConcatenatedClassifier, self).__init__()

    # load text model
    self.device = device
    self.task_type = task_type
    self.use_text = use_dm | use_pm | use_rt
    self.use_bio = use_bio
    self.use_name = use_name
    self.use_etc = use_network | use_count

    self.text_model = RobertaModel.from_pretrained(
        "vinai/bertweet-base", output_attentions=False, output_hidden_states=False)
    if self.use_name:
        self.charEmbedding = nn.Embedding(
            num_embeddings=302, embedding_dim=300, padding_idx=301)  # 302: 300-top frequent + pad + unk
        self.conv3 = nn.Conv1d(in_channels=300, out_channels=256, kernel_size=3, padding=1)
        self.conv4 = nn.Conv1d(in_channels=300, out_channels=256, kernel_size=4, padding=1)
        self.conv5 = nn.Conv1d(in_channels=300, out_channels=256, kernel_size=5, padding=1)

    # load classifier for combining these features
    config = RobertaConfig()
    config = config.from_json_file(classifier_config_dir)
    config.num_hidden_layers = n_clf_layers
    config.num_attention_heads = n_clf_layers
    config.max_position_embeddings = 7
    if self.use_bio:
        config.max_position_embeddings += 2
    if self.use_name:
        config.max_position_embeddings += 4
    self.concat_model = RobertaModel(config)
    self.classifier = ClassifierLayer(use_count=use_count, use_network=use_network)
def __init__(self):
    super(ReviewModel, self).__init__()
    tokenizer = RobertaTokenizer(
        vocab_file=Constants.VOCAB_FILE,
        merges_file=Constants.MERGES_FILE,
        add_prefix_space=True
    )
    config = RobertaConfig(output_hidden_states=True)
    self.backbone = RobertaModel(config)
    self.backbone.resize_token_embeddings(len(tokenizer))
    self.fc = nn.Linear(in_features=config.hidden_size, out_features=1, bias=True)
def __init__(self, num_classes, model_name) -> None:
    super(bertCRF, self).__init__()
    if model_name == "bert-base-cased-crf":
        self.bert = BertModel(BertConfig())
    if model_name == "roberta-base-crf":
        self.bert = RobertaModel(RobertaConfig())
    self.dropout = nn.Dropout(0.1)
    self.position_wise_ff = nn.Linear(768, num_classes)
    self.crf = CRF(num_classes)
def __init__(self, ckpt_path):
    super().__init__()
    # First reinitialize the model
    ckpt_states = torch.load(ckpt_path, map_location='cpu')
    self.acoustic_config = RobertaConfig(
        **ckpt_states['Settings']['Config']['acoustic'])
    self.semantic_config = RobertaConfig(
        **ckpt_states['Settings']['Config']['semantic'])
    acoustic_model = AcousticModel(self.acoustic_config)
    semantic_model = RobertaModel(self.semantic_config, add_pooling_layer=False)
    # load the model from pretrained states
    self.acoustic_model = self.load_model(acoustic_model,
                                          ckpt_states['acoustic_model'], 'acoustic.')
    self.semantic_model = self.load_model(semantic_model,
                                          ckpt_states['semantic_model'], 'roberta.')
def test_TFRobertaForTokenClassification(self):
    from transformers import RobertaConfig, TFRobertaForTokenClassification
    keras.backend.clear_session()
    # pretrained_weights = 'roberta-base'
    tokenizer_file = 'roberta_roberta-base.pickle'
    tokenizer = self._get_tokenzier(tokenizer_file)
    text, inputs, inputs_onnx = self._prepare_inputs(tokenizer)
    config = RobertaConfig()
    model = TFRobertaForTokenClassification(config)
    predictions = model.predict(inputs)
    onnx_model = keras2onnx.convert_keras(model, model.name)
    self.assertTrue(run_onnx_runtime(onnx_model.graph.name, onnx_model,
                                     inputs_onnx, predictions, self.model_files))
def get_config(self):
    return RobertaConfig(
        vocab_size=self.vocab_size,
        hidden_size=self.hidden_size,
        num_hidden_layers=self.num_hidden_layers,
        num_attention_heads=self.num_attention_heads,
        intermediate_size=self.intermediate_size,
        hidden_act=self.hidden_act,
        hidden_dropout_prob=self.hidden_dropout_prob,
        attention_probs_dropout_prob=self.attention_probs_dropout_prob,
        max_position_embeddings=self.max_position_embeddings,
        type_vocab_size=self.type_vocab_size,
        initializer_range=self.initializer_range,
    )
def get_config(vocab_size):
    if transformer_type == 'roberta':
        return RobertaConfig(
            vocab_size=vocab_size,
            max_position_embeddings=514,
            num_attention_heads=12,
            num_hidden_layers=6,
            type_vocab_size=1,
        )
    return BertConfig(
        vocab_size=vocab_size,
        max_position_embeddings=514,
        num_attention_heads=12,
        num_hidden_layers=6,
        type_vocab_size=1,
    )
def get_roberta_model():
    # Initializing a RoBERTa configuration
    configuration = RobertaConfig()

    # Initializing a model from the pretrained "roberta-base" checkpoint.
    # from_pretrained is a classmethod, so calling it on a freshly built
    # RobertaModel(configuration) instance would simply discard that instance.
    Roberta_Model = RobertaModel.from_pretrained("roberta-base")
    Roberta_Model.to(device)

    # Accessing the model configuration
    configuration = Roberta_Model.config

    # get the RoBERTa tokenizer
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

    return Roberta_Model, tokenizer, configuration
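# Hedged usage sketch (not part of the original example): run a single forward pass
# with the objects returned by get_roberta_model(); `device` refers to the same
# module-level device the example above assumes.
import torch

roberta_model, roberta_tokenizer, roberta_config = get_roberta_model()
encoded = roberta_tokenizer("RoBERTa configuration example", return_tensors="pt").to(device)
with torch.no_grad():
    hidden = roberta_model(**encoded).last_hidden_state
print(hidden.shape)  # (1, sequence_length, roberta_config.hidden_size)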
def main(args):
    data = np.load(args.data, allow_pickle=True)
    tokenizer_path = args.tokenizer
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path,
                                        max_len=512,
                                        mask_token="<mask>",
                                        pad_token="<pad>")
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.convert_tokens_to_ids("</s>")),
        ("<s>", tokenizer.convert_tokens_to_ids("<s>")),
    )
    config = RobertaConfig(
        vocab_size=tokenizer.vocab_size,
        max_position_embeddings=514,
        num_attention_heads=12,
        num_hidden_layers=6,
        type_vocab_size=1,
    )
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=True,
                                                    mlm_probability=0.15)
    dataset = PhoneDatasetMLM(data, tokenizer)

    model = RobertaForMaskedLM(config=config)

    training_args = TrainingArguments(
        output_dir=args.output_dir,
        overwrite_output_dir=True,
        num_train_epochs=1,
        per_device_train_batch_size=64,
        logging_steps=2,
        save_steps=10_000,
        save_total_limit=2,
        prediction_loss_only=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )

    trainer.train()
    trainer.save_model(args.output_dir)
def get_config(args):
    config = {
        "model_type": "roberta",
        "attention_probs_dropout_prob": 0.1,
        "hidden_act": "gelu",
        "hidden_dropout_prob": 0.3,
        "hidden_size": wandb.config.hidden_size,
        "initializer_range": 0.02,
        "num_attention_heads": wandb.config.num_attention_heads,
        "num_hidden_layers": wandb.config.num_hidden_layers,
        "vocab_size": args.vocab_size,
        "intermediate_size": wandb.config.intermediate_size,
        "max_position_embeddings": 1024,
        "cache_dir": args.cache_dir
    }
    return RobertaConfig(**config)
def build(config):
    tokenizer = RobertaTokenizerFast.from_pretrained(
        os.path.join(config.save_directory),
        max_len=config.max_length
    )

    model_config = RobertaConfig(
        vocab_size=config.vocab_size,
        max_position_embeddings=config.max_length,
        num_attention_heads=config.num_attention_heads,
        num_hidden_layers=config.num_hidden_layers,
        type_vocab_size=1
    )

    model = RobertaForMaskedLM(config=model_config)
    print("the number of parameters of model: ", model.num_parameters())

    dataset = LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path=config.files,
        block_size=32
    )

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=True,
        mlm_probability=config.mlm_probability
    )

    training_args = TrainingArguments(
        output_dir=os.path.join(config.save_directory),
        overwrite_output_dir=config.overwrite_output_dir,
        num_train_epochs=config.num_train_epochs,
        per_gpu_train_batch_size=config.per_gpu_train_batch_size,
        save_steps=config.save_steps,
        save_total_limit=config.save_total_limit
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
        prediction_loss_only=config.prediction_loss_only
    )

    return trainer
def __init__(self, args):
    super(Model, self).__init__()
    args.out_size = len(args.dense_features)
    self.dropout = nn.Dropout(args.hidden_dropout_prob)
    self.args = args

    # Build the BERT encoder and load the pretrained weights
    config = RobertaConfig.from_pretrained(args.pretrained_model_path)
    config.output_hidden_states = True
    args.hidden_size = config.hidden_size
    args.num_hidden_layers = config.num_hidden_layers
    self.text_layer = RobertaModel.from_pretrained(
        args.pretrained_model_path, config=config)
    self.text_linear = nn.Linear(
        args.text_dim + args.vocab_dim_v1 * len(args.text_features), args.hidden_size)
    logger.info("Load linear from %s",
                os.path.join(args.pretrained_model_path, "linear.bin"))
    self.text_linear.load_state_dict(
        torch.load(os.path.join(args.pretrained_model_path, "linear.bin")))
    logger.info("Load embeddings from %s",
                os.path.join(args.pretrained_model_path, "embeddings.bin"))
    self.text_embeddings = nn.Embedding.from_pretrained(
        torch.load(os.path.join(args.pretrained_model_path, "embeddings.bin"))['weight'],
        freeze=True)
    args.out_size += args.hidden_size * 2

    # Build the decoder model with random initialization
    config = RobertaConfig()
    config.num_hidden_layers = 4
    config.intermediate_size = 2048
    config.hidden_size = 512
    config.num_attention_heads = 16
    config.vocab_size = 5
    self.text_layer_1 = RobertaModel(config=config)
    self.text_layer_1.apply(self._init_weights)
    self.text_linear_1 = nn.Linear(args.text_dim_1 + args.hidden_size, 512)
    self.text_linear_1.apply(self._init_weights)
    self.norm = nn.BatchNorm1d(args.text_dim_1 + args.hidden_size)
    args.out_size += 1024

    # Build the classifier with random initialization
    self.classifier = ClassificationHead(args)
    self.classifier.apply(self._init_weights)
def train_MLM(vocf, outmodel, data_df):
    bs = 8
    # tokenizer = BertWordPieceTokenizer(vocf)  # input vocab.txt
    ttk = BertTokenizer.from_pretrained(vocf)  # input vocab.txt
    fvoc = open(vocf)
    vlen = len(fvoc.readlines())
    fvoc.close()
    config = RobertaConfig(vocab_size=vlen, max_position_embeddings=12,
                           num_attention_heads=12, num_hidden_layers=6,
                           type_vocab_size=1, hidden_size=768)
    model = RobertaForMaskedLM(config=config)
    model.num_parameters()
    dataset = tokDataset(data_df, ttk)
    # Data = DataLoader(dataset, batch_size=bs, shuffle=True, drop_last=False,
    #                   num_workers=0, collate_fn=collate_fn)
    # data_collator = DataCollatorForLanguageModeling(
    #     tokenizer=ttk, mlm=True, mlm_probability=0.15
    # )
    data_collator = collate_fn(
        tokenizer=ttk, mlm=True, mlm_probability=0.15
    )
    training_args = TrainingArguments(
        output_dir=outmodel,  # embedding model path
        overwrite_output_dir=True,
        num_train_epochs=2,
        per_device_train_batch_size=bs,
        save_steps=10_000,
        save_total_limit=2,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=data_collator,
        prediction_loss_only=True
    )
    trainer.train()
    trainer.save_model(outmodel)
    print('LM train done')
def main(args):
    test_x = np.load(os.path.join(args.test_dir, "test_x.npy"), allow_pickle=True)
    test_y = np.load(os.path.join(args.test_dir, "test_y.npy"), allow_pickle=True)
    num_classes1 = len(np.unique(test_y))
    if args.test2_dir is not None:
        test_x2 = np.load(os.path.join(args.test2_dir, "test_x.npy"), allow_pickle=True)
        test_y2 = np.load(os.path.join(args.test2_dir, "test_y.npy"), allow_pickle=True)
        test_y2 += num_classes1
        test_x = np.concatenate((test_x, test_x2), axis=0)
        test_y = np.concatenate((test_y, test_y2), axis=0)
    num_classes = len(np.unique(test_y))

    tokenizer_path = args.tokenizer
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path,
                                        max_len=512,
                                        mask_token="<mask>",
                                        pad_token="<pad>")
    test_dataset = PhoneRobertaDataset(test_x, test_y, tokenizer)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    config = RobertaConfig(
        vocab_size=tokenizer.vocab_size,
        max_position_embeddings=514,
        num_attention_heads=args.heads,      # default 12
        num_hidden_layers=args.num_layers,   # default 6
        type_vocab_size=1,
        num_labels=num_classes)

    model = RobertaForSequenceClassification(config)
    device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
    model.load_state_dict(torch.load(args.model))

    preds_all, labels_all = evaluate(model, device, test_loader)
    if args.test2_dir is not None:
        print("Evaluate on separate validation using the best model")
        evaluate_separate(preds_all, labels_all, num_classes1)
def main(args):
    # Import the custom trained tokenizer
    tokenizer = RobertaTokenizerFast.from_pretrained(args.tokenizer)

    # Define the model
    config = RobertaConfig(vocab_size=32000)
    model = RobertaForMaskedLM(config=config)

    # Import the dataset
    dataset = LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path=args.data,
        block_size=128,
    )

    # Initialize the data collator
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer)

    # Set all of the training arguments
    training_args = TrainingArguments(
        output_dir=args.output,
        overwrite_output_dir=True,
        num_train_epochs=10,
        per_gpu_train_batch_size=24,
        save_steps=10_000,
        save_total_limit=10,
    )

    # Train the model
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )
    trainer.train()

    # Save the model
    trainer.save_model("./roBERTaCODE_{}_{}".format(args.language, args.size))
def get_plm_resources(plm, vocab_len):
    """load PLM resources such as model, tokenizer and config"""
    if plm == 'bert':
        bert_model = BertModel.from_pretrained('bert-base-uncased')
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        bert_config = BertConfig(vocab_size_or_config_json_file=vocab_len)
    elif plm == 'roberta':
        bert_model = RobertaModel.from_pretrained('roberta-base')
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        bert_config = RobertaConfig(vocab_size_or_config_json_file=vocab_len)
    elif plm == 'xlnet':
        bert_model = XLNetModel.from_pretrained('xlnet-base-cased')
        tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
        bert_config = XLNetConfig(vocab_size_or_config_json_file=vocab_len)
    elif plm == 'distilbert':
        bert_model = DistilBertModel.from_pretrained('distilbert-base-uncased')
        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        bert_config = DistilBertConfig(vocab_size_or_config_json_file=vocab_len)
    return bert_model, tokenizer, bert_config
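# Hedged usage sketch (not in the original code): fetch the RoBERTa resources and
# encode a sentence; the vocab length passed here is only an illustrative value.
plm_model, plm_tokenizer, plm_config = get_plm_resources('roberta', vocab_len=50265)
token_ids = plm_tokenizer.encode("a short example sentence", add_special_tokens=True)
print(len(token_ids), plm_config.vocab_size)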
def train_mod(txt_dir, tokenizer, model_dir):
    config = RobertaConfig(
        vocab_size=3305,
        max_position_embeddings=1024,
        num_attention_heads=12,
        num_hidden_layers=6,
        output_attentions=True,
        type_vocab_size=1,
    )
    dataset = LineByLineTextDataset(tokenizer=tokenizer,
                                    file_path=txt_dir,
                                    block_size=1024)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=True,
                                                    mlm_probability=0.15)
    model = RobertaForMaskedLM(config=config)
    training_args = TrainingArguments(
        output_dir=model_dir,
        overwrite_output_dir=True,
        num_train_epochs=1000,
        per_gpu_train_batch_size=16,
        save_steps=1000,
        save_total_limit=37,
        prediction_loss_only=True,
    )
    trainer = Trainer(model=model,
                      args=training_args,
                      data_collator=data_collator,
                      train_dataset=dataset)
    trainer.train()
    trainer.save_model(model_dir)
        max_position_embeddings=512,
        num_attention_heads=12,
        num_hidden_layers=12,
        # type_vocab_size=2, default is 2
    )
    tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased', do_lower_case=False)
    model = BertForMaskedLM.from_pretrained('./multi-label_LM/multi-label_Bert_e10_b16', config=config)
    # model = BertForMaskedLM.from_pretrained('./multi-label_train.csv_LMmodel', config=config)
    # 12-layer, 768-hidden, 12-heads, 110M parameters.
elif args.LM == 'RoBerta':
    from transformers import RobertaConfig, RobertaTokenizerFast, RobertaForMaskedLM
    config = RobertaConfig(
        vocab_size=50265,
        max_position_embeddings=514,
        num_attention_heads=12,
        num_hidden_layers=12,
        type_vocab_size=1,
    )
    tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', do_lower_case=False)
    model = RobertaForMaskedLM.from_pretrained('./multi-label_LM/multi-label_RoBerta_e10_b16', config=config)
    # 12-layer, 768-hidden, 12-heads, 125M parameters; roberta-base uses the bert-base architecture
elif args.LM == 'XLM':
    from transformers import XLMConfig, XLMTokenizer, XLMWithLMHeadModel
    config = XLMConfig(
        vocab_size=64139,
        emb_dim=1024,
        max_position_embeddings=512,
        n_heads=8,
        n_layers=6,
    )