def benchmark_torch(model_name: str, seq_len: int, batch_size: int, n: int,
                    num_threads: int):
    import torch
    import transformers
    import contexttimer
    import benchmark_helper

    torch.set_num_threads(num_threads)
    torch.set_grad_enabled(False)
    if model_name == "bert":
        cfg = transformers.BertConfig()
        model = transformers.BertModel(cfg)
    elif model_name == "albert":
        cfg = transformers.AlbertConfig()
        model = transformers.AlbertModel(cfg)
    elif model_name == "roberta":
        cfg = transformers.RobertaConfig()
        model = transformers.RobertaModel(cfg)
    else:
        raise RuntimeError(f"benchmark does not support {model_name}")
    model.eval()
    cfg = model.config  # type: transformers.BertConfig

    input_ids = torch.randint(low=0,
                              high=cfg.vocab_size - 1,
                              size=(batch_size, seq_len),
                              dtype=torch.long)
    benchmark_helper.run_model(lambda: model(input_ids), False, n, batch_size,
                               seq_len, "torch", num_threads)
def generate_onnx_model(model_name: str, filename: str, seq_len: int,
                        batch_size: int, backend: str):
    import transformers
    import torch
    import os

    test_device = torch.device('cuda:0') if backend == "GPU" else torch.device(
        'cpu:0')
    torch.set_grad_enabled(False)
    if model_name == "bert":
        cfg = transformers.BertConfig()
        model = transformers.BertModel(cfg)
    elif model_name == "albert":
        cfg = transformers.AlbertConfig()
        model = transformers.AlbertModel(cfg)
    elif model_name == "roberta":
        cfg = transformers.RobertaConfig()
        model = transformers.RobertaModel(cfg)
    else:
        raise RuntimeError(f"benchmark does not support {model_name}")
    model.eval()
    model.to(test_device)
    cfg = model.config  # type: transformers.BertConfig

    input_ids = torch.randint(low=0,
                              high=cfg.vocab_size - 1,
                              size=(batch_size, seq_len),
                              dtype=torch.long,
                              device=test_device)

    with open(filename, 'wb') as outf:
        torch.onnx.export(model=model, args=(input_ids, ), f=outf)
        outf.flush()
    return cfg.vocab_size
def generate_onnx_model(model_name: str,
                        use_gpu: bool,
                        filename: str,
                        seq_len: int,
                        batch_size: int,
                        backend: str,
                        use_dynamic_axes: bool = False):
    import transformers
    import torch
    import os

    test_device = torch.device(
        'cuda:0') if backend == "GPU" and use_gpu else torch.device('cpu:0')
    torch.set_grad_enabled(False)
    if model_name == "bert":
        # use a real model to check the correctness;
        # `checkonnxrest` is expected to be a module-level flag defined elsewhere in the script
        if checkonnxrest:
            model = transformers.BertModel.from_pretrained("bert-base-uncased")
        else:
            cfg = transformers.BertConfig()
            model = transformers.BertModel(cfg)
    elif model_name == "albert":
        cfg = transformers.AlbertConfig()
        model = transformers.AlbertModel(cfg)
    elif model_name == "roberta":
        cfg = transformers.RobertaConfig()
        model = transformers.RobertaModel(cfg)
    else:
        raise RuntimeError(f"benchmark does not support {model_name}")
    model.eval()
    model.to(test_device)
    cfg = model.config  # type: transformers.BertConfig

    input_ids = torch.randint(low=0,
                              high=cfg.vocab_size - 1,
                              size=(batch_size, seq_len),
                              dtype=torch.long,
                              device=test_device)

    with open(filename, 'wb') as outf:
        if not use_dynamic_axes:
            torch.onnx.export(model=model, args=(input_ids, ), f=outf)
        else:
            # If you do not need onnxruntime to support variable batch size and
            # sequence length, you can drop the `dynamic_axes` parameter.
            # For some models you may have to pass `opset_version=12`.
            torch.onnx.export(model=model,
                              args=(input_ids, ),
                              f=outf,
                              input_names=['input'],
                              output_names=['output'],
                              dynamic_axes={
                                  'input': [0, 1],
                                  'output': [0, 1]
                              })
        outf.flush()
    return cfg.vocab_size, cfg
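# A minimal sketch (not part of the original script) of how the exported ONNX
# file might be sanity-checked with onnxruntime. It assumes onnxruntime is
# installed and that the model was exported with use_dynamic_axes=True, so the
# graph input is named 'input'; the provider name is illustrative.
def run_onnx_model_sketch(filename: str, vocab_size: int, batch_size: int,
                          seq_len: int):
    import numpy
    import onnxruntime

    # Load the exported graph on CPU; swap in "CUDAExecutionProvider" for GPU.
    session = onnxruntime.InferenceSession(filename,
                                           providers=["CPUExecutionProvider"])
    # Random token ids matching the exported shape; int64 corresponds to torch.long.
    input_ids = numpy.random.randint(low=0,
                                     high=vocab_size - 1,
                                     size=(batch_size, seq_len),
                                     dtype=numpy.int64)
    # Passing None returns every graph output.
    return session.run(None, {"input": input_ids})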
def benchmark_turbo_transformers(model_name: str, seq_len: int,
                                 batch_size: int, n: int, enable_random: bool,
                                 max_seq_len: int, min_seq_len: int,
                                 num_threads: int, use_gpu: bool,
                                 enable_mem_opt: bool):
    import torch
    import transformers
    import turbo_transformers
    import benchmark_helper

    test_device = torch.device('cuda:0') if use_gpu else torch.device('cpu:0')
    cfg = None
    torch.set_grad_enabled(False)
    if model_name == "bert":
        cfg = transformers.BertConfig()
        model = transformers.BertModel(cfg)
        model.to(test_device)
        model.eval()
        model = turbo_transformers.BertModel.from_torch(model, backend="turbo")
    elif model_name == "albert":
        cfg = transformers.AlbertConfig(hidden_size=768,
                                        num_attention_heads=12,
                                        intermediate_size=3072)
        model = transformers.AlbertModel(cfg)
        model.to(test_device)
        model.eval()
        model = turbo_transformers.AlbertModel.from_torch(model)
    elif model_name == "roberta":
        cfg = transformers.RobertaConfig()
        model = transformers.RobertaModel(cfg)
        model.to(test_device)
        model.eval()
        model = turbo_transformers.RobertaModel.from_torch(model)
    elif model_name == "distilbert":
        cfg = transformers.DistilBertConfig()
        model = transformers.DistilBertModel(cfg)
        model.to(test_device)
        model.eval()
        model = turbo_transformers.DistilBertModel.from_torch(model)
    else:
        raise RuntimeError(f"benchmark does not support {model_name}")

    turbo_transformers.set_num_threads(num_threads)
    if enable_random:
        if enable_mem_opt:
            turbo_transformers.reset_allocator_schema("model-aware")
        benchmark_helper.run_variable_model(model, use_gpu, n, max_seq_len,
                                            min_seq_len, "turbo", num_threads,
                                            cfg, enable_mem_opt, model_name)
        if enable_mem_opt:
            turbo_transformers.reset_allocator_schema("naive")
    else:
        input_ids = torch.randint(low=0,
                                  high=cfg.vocab_size - 1,
                                  size=(batch_size, seq_len),
                                  dtype=torch.long,
                                  device=test_device)
        benchmark_helper.run_model(lambda: model(input_ids), use_gpu, n,
                                   batch_size, seq_len, "turbo", num_threads,
                                   enable_mem_opt, model_name)
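# Hedged usage example (not taken from the repo's driver script): one way the
# benchmark entry point above might be invoked; all parameter values are
# illustrative.
if __name__ == "__main__":
    # Fixed-length BERT benchmark on CPU: 100 iterations, batch 1, sequence 128.
    benchmark_turbo_transformers(model_name="bert",
                                 seq_len=128,
                                 batch_size=1,
                                 n=100,
                                 enable_random=False,
                                 max_seq_len=128,
                                 min_seq_len=5,
                                 num_threads=4,
                                 use_gpu=False,
                                 enable_mem_opt=False)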
def benchmark_turbo_transformers(model_name: str, seq_len: int,
                                 batch_size: int, n: int, enable_random: bool,
                                 max_seq_len: int, min_seq_len: int,
                                 num_threads: int, use_gpu: bool):
    import torch
    import transformers
    import contexttimer
    import turbo_transformers
    import benchmark_helper

    test_device = torch.device('cuda:0') if use_gpu else torch.device('cpu:0')
    if use_gpu:
        print("using GPU")
    else:
        print("using CPU")
    cfg = None
    torch.set_grad_enabled(False)
    if model_name == "bert":
        cfg = transformers.BertConfig()
        model = transformers.BertModel(cfg)
        model.to(test_device)
        model.eval()
        model = turbo_transformers.BertModel.from_torch(model)
    elif model_name == "albert":
        cfg = transformers.AlbertConfig()
        model = transformers.AlbertModel(cfg)
        model.to(test_device)
        model.eval()
        model = turbo_transformers.AlbertModel.from_torch(model)
    elif model_name == "roberta":
        cfg = transformers.RobertaConfig()
        model = transformers.RobertaModel(cfg)
        model.to(test_device)
        model.eval()
        model = turbo_transformers.RobertaModel.from_torch(model)
    else:
        raise RuntimeError(f"benchmark does not support {model_name}")

    turbo_transformers.set_num_threads(num_threads)
    if enable_random:
        benchmark_helper.run_variable_model(model, use_gpu, n, max_seq_len,
                                            min_seq_len, "turbo", num_threads,
                                            cfg)
    else:
        input_ids = torch.randint(low=0,
                                  high=cfg.vocab_size - 1,
                                  size=(batch_size, seq_len),
                                  dtype=torch.long,
                                  device=test_device)
        benchmark_helper.run_model(lambda: model(input_ids), use_gpu, n,
                                   batch_size, seq_len, "turbo", num_threads)
def benchmark_torch_jit(model_name: str, seq_len: int, batch_size: int, n: int,
                        enable_random: bool, max_seq_len: int,
                        min_seq_len: int, num_threads: int, use_gpu: bool,
                        enable_mem_opt: bool):
    import json
    import transformers
    import contexttimer
    import torch.jit

    torch.set_num_threads(num_threads)
    torch.set_grad_enabled(False)
    if model_name == "bert":
        cfg = transformers.BertConfig()
        model = transformers.BertModel(cfg)
    elif model_name == "albert":
        cfg = transformers.AlbertConfig()
        model = transformers.AlbertModel(cfg)
    elif model_name == "roberta":
        cfg = transformers.RobertaConfig()
        model = transformers.RobertaModel(cfg)
    else:
        raise RuntimeError(f"benchmark does not support {model_name}")
    model.eval()

    input_ids = torch.randint(low=0,
                              high=cfg.vocab_size - 1,
                              size=(batch_size, seq_len),
                              dtype=torch.long)
    # trace the model once, then time n forward passes
    model = torch.jit.trace(model, (input_ids, ))

    with torch.jit.optimized_execution(True):
        model(input_ids)  # warm-up run
        with contexttimer.Timer() as t:
            for _ in range(n):
                model(input_ids)

    print(
        json.dumps({
            "QPS": n / t.elapsed,
            "elapsed": t.elapsed,
            "n": n,
            "batch_size": batch_size,
            "seq_len": seq_len,
            "framework": "torch_jit",
            "n_threads": num_threads,
            "model_name": model_name
        }))
def benchmark_turbo_transformers(model_name: str, seq_len: int,
                                 batch_size: int, n: int):
    import torch
    import transformers
    import contexttimer
    import turbo_transformers
    import benchmark_helper

    if not torch.cuda.is_available():
        print("cuda is not available for torch")
        return
    test_device = torch.device('cuda:0')

    if model_name == "bert":
        cfg = transformers.BertConfig()
        model = transformers.BertModel(cfg)
        model.to(test_device)
        model.eval()
        model = turbo_transformers.BertModel.from_torch(model)
    elif model_name == "albert":
        cfg = transformers.AlbertConfig()
        model = transformers.AlbertModel(cfg)
        model.to(test_device)
        model.eval()
        model = turbo_transformers.AlbertModel.from_torch(model)
    elif model_name == "roberta":
        cfg = transformers.RobertaConfig()
        model = transformers.RobertaModel(cfg)
        model.to(test_device)
        model.eval()
        model = turbo_transformers.RobertaModel.from_torch(model)
    else:
        raise RuntimeError(f"benchmark does not support {model_name}")

    cfg = model.config  # type: transformers.BertConfig
    input_ids = torch.randint(low=0,
                              high=cfg.vocab_size - 1,
                              size=(batch_size, seq_len),
                              dtype=torch.long,
                              device=test_device)
    benchmark_helper.run_model(lambda: model(input_ids), True, n, batch_size,
                               seq_len, "turbo")
def benchmark_torch(model_name: str, seq_len: int, batch_size: int, n: int,
                    enable_random: bool, max_seq_len: int, min_seq_len: int,
                    num_threads: int, use_gpu: bool, enable_mem_opt: bool):
    import torch
    import transformers
    import benchmark_helper

    test_device = torch.device('cuda:0') if use_gpu else torch.device('cpu:0')
    torch.set_grad_enabled(False)
    torch.set_num_threads(num_threads)

    cfg = None
    if model_name == "bert":
        cfg = transformers.BertConfig()
        model = transformers.BertModel(cfg)
    elif model_name == "albert":
        cfg = transformers.AlbertConfig()
        model = transformers.AlbertModel(cfg)
    elif model_name == "roberta":
        cfg = transformers.RobertaConfig()
        model = transformers.RobertaModel(cfg)
    elif model_name == "distilbert":
        cfg = transformers.DistilBertConfig()
        model = transformers.DistilBertModel(cfg)
    else:
        raise RuntimeError(f"benchmark does not support {model_name}")
    model.eval()
    model.to(test_device)

    # cfg = model.config  # type: transformers.BertConfig
    if enable_random:
        benchmark_helper.run_variable_model(model, use_gpu, n, max_seq_len,
                                            min_seq_len, "torch", num_threads,
                                            cfg, enable_mem_opt, model_name)
    else:
        input_ids = torch.randint(low=0,
                                  high=cfg.vocab_size - 1,
                                  size=(batch_size, seq_len),
                                  dtype=torch.long,
                                  device=test_device)
        benchmark_helper.run_model(lambda: model(input_ids), use_gpu, n,
                                   batch_size, seq_len, "torch", num_threads,
                                   enable_mem_opt, model_name)
def test(loadtype: LoadType, use_cuda: bool):
    cfg = transformers.AlbertConfig(hidden_size=768,
                                    num_attention_heads=12,
                                    intermediate_size=3072)
    model = transformers.AlbertModel(cfg)
    model.eval()
    torch.set_grad_enabled(False)

    test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')
    cfg = model.config

    # use 4 threads for computing
    turbo_transformers.set_num_threads(4)

    # place the inputs on the same device as the model
    input_ids = torch.tensor(
        ([12166, 10699, 16752, 4454], [5342, 16471, 817, 16022]),
        dtype=torch.long,
        device=test_device)

    model.to(test_device)

    start_time = time.time()
    for _ in range(10):
        torch_res = model(input_ids)
    end_time = time.time()
    print("\ntorch time consumed: {}".format(end_time - start_time))

    # there are three ways to load a pretrained model
    if loadtype is LoadType.PYTORCH:
        # 1. from a PyTorch model that has already loaded pretrained weights
        tt_model = turbo_transformers.AlbertModel.from_torch(model)
    else:
        raise RuntimeError("LoadType is not supported")

    start_time = time.time()
    for _ in range(10):
        res = tt_model(input_ids)  # sequence_output, pooled_output
    end_time = time.time()
    print("\nturbo time consumed: {}".format(end_time - start_time))

    assert (numpy.max(
        numpy.abs(res[0].cpu().numpy() - torch_res[0].cpu().numpy())) < 0.1)
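# Hedged usage example: LoadType is assumed to be an enum defined elsewhere in
# this script (only the PYTORCH branch is handled above), and transformers,
# turbo_transformers, torch, numpy, and time are assumed to be imported at
# module level.
if __name__ == "__main__":
    # Compare PyTorch and TurboTransformers ALBERT outputs on CPU; pass
    # use_cuda=True to run the same correctness check on GPU.
    test(LoadType.PYTORCH, use_cuda=False)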
def train(args):
    logging.basicConfig(level=logging.INFO)
    tokenizer = transformers.AlbertTokenizer.from_pretrained(
        'albert-base-v2', cache_dir=cache_dir)
    albert_for_math_config = transformers.AlbertConfig(
        hidden_size=768,
        num_attention_heads=12,
        intermediate_size=3072,
    )

    if args['--load']:
        model = transformers.AlbertForMaskedLM.from_pretrained(
            args['--load-from'])
        training_args = transformers.TrainingArguments(
            output_dir=args['--save-to'],
            overwrite_output_dir=True,
            num_train_epochs=int(args['--max-epoch']),
            per_gpu_train_batch_size=int(args['--batch-size']),
            per_gpu_eval_batch_size=int(args['--batch-size']),
            logging_steps=int(args['--log-every']),
            save_steps=int(args['--save-every']),
            save_total_limit=10,
            learning_rate=float(args['--lr']),
            seed=int(args['--seed']),
        )
    else:
        model = transformers.AlbertForMaskedLM(albert_for_math_config)
        training_args = transformers.TrainingArguments(
            output_dir=args['--save-to'],
            num_train_epochs=int(args['--max-epoch']),
            per_gpu_train_batch_size=int(args['--batch-size']),
            per_gpu_eval_batch_size=int(args['--batch-size']),
            logging_steps=int(args['--log-every']),
            save_steps=int(args['--save-every']),
            save_total_limit=10,
            learning_rate=float(args['--lr']),
            seed=int(args['--seed']),
        )

    # load datasets
    print('Loading Data...')
    train_data = torch.load(
        './data/train_data_train-easy_algebra__linear_1d.pt')
    dev_data = torch.load('./data/dev_data_train-easy_algebra__linear_1d.pt')
    print('Finished loading data')

    device = torch.device("cuda:0" if args['--cuda'] else "cpu")
    model.to(device)

    trainer = transformers.Trainer(
        model=model,
        args=training_args,
        data_collator=AnswerMaskDataCollator(tokenizer),
        train_dataset=train_data,
        eval_dataset=dev_data,
        prediction_loss_only=True,
    )
    if args['--load']:
        trainer.train(model_path=args['--load-from'])
    else:
        trainer.train()
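# Illustrative only: the script presumably parses `args` with docopt, so the
# keys below mirror the ones read inside train(); the values are placeholders,
# and cache_dir plus AnswerMaskDataCollator are assumed to exist at module
# level.
example_args = {
    '--load': False,
    '--load-from': None,
    '--save-to': './model',
    '--max-epoch': '3',
    '--batch-size': '32',
    '--log-every': '50',
    '--save-every': '500',
    '--lr': '5e-5',
    '--seed': '42',
    '--cuda': True,
}
# train(example_args)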
def train_without_trainer(args):
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    device = torch.device("cuda:0" if args['--cuda'] else "cpu")
    batch_size = int(args['--batch-size'])
    logging_steps = int(args['--log-every'])

    tokenizer = transformers.AlbertTokenizer.from_pretrained(
        'albert-base-v2', cache_dir=cache_dir)
    albert_for_math_config = transformers.AlbertConfig(
        hidden_size=768,
        num_attention_heads=12,
        intermediate_size=3072,
    )

    print('Loading Data...')
    train_data = torch.load(
        './data/train_data_train-easy_algebra__linear_1d.pt')
    dev_data = torch.load('./data/dev_data_train-easy_algebra__linear_1d.pt')
    print('Finished loading data')

    data_collator = AnswerMaskDataCollator(tokenizer)
    train_dataloader = torch.utils.data.DataLoader(
        train_data,
        batch_size=batch_size,
        sampler=torch.utils.data.sampler.RandomSampler(train_data),
        collate_fn=data_collator.collate_batch)

    if args['--load']:
        model = transformers.AlbertForMaskedLM.from_pretrained(
            args['--load-from'])
        optimizer = get_optimizers(model, float(args['--lr']))
        optimizer.load_state_dict(
            torch.load(os.path.join(args['--load-from'], "optimizer.pt"),
                       map_location=device))
        global_step = int(args['--load-from'].split("-")[-1].split("/")[0])
        epochs_trained = global_step // len(train_dataloader)
        steps_trained_in_current_epoch = global_step % len(train_dataloader)
        epoch = epochs_trained
        logger.info(
            "  Continuing training from checkpoint, will skip to saved global_step"
        )
        logger.info("  Continuing training from epoch %d", epochs_trained)
        logger.info("  Continuing training from global step %d", global_step)
        logger.info("  Will skip the first %d steps in the first epoch",
                    steps_trained_in_current_epoch)
    else:
        model = transformers.AlbertForMaskedLM(albert_for_math_config)
        optimizer = get_optimizers(model, float(args['--lr']))
        global_step = 0
        epochs_trained = 0
        steps_trained_in_current_epoch = 0
        epoch = 0

    model.to(device)
    # move any optimizer state loaded from disk onto the training device
    for state in optimizer.state.values():
        for k, v in state.items():
            if isinstance(v, torch.Tensor):
                state[k] = v.to(device)

    max_epoch = int(args['--max-epoch'])
    t_total = len(train_dataloader) * max_epoch
    tr_loss = 0.0
    logging_loss = 0.0
    min_eval_loss = 1e20  # might be too high
    valid_niter = int(args['--valid-niter'])

    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_data))
    logger.info("  Num Epochs = %d", max_epoch)
    logger.info("  train batch size = %d", batch_size)
    logger.info("  Total optimization steps = %d", t_total)

    num_eval_samples = 4096
    checkpoint_prefix = 'checkpoint'

    while epoch < max_epoch:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, inputs in enumerate(epoch_iterator):
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            tr_loss += train_step(model, inputs, device)
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           float(args['--clip-grad']))
            optimizer.step()
            model.zero_grad()
            global_step += 1

            if global_step % logging_steps == 0:
                logs: Dict[str, float] = {}
                logs["loss"] = (tr_loss - logging_loss) / logging_steps
                logs["lr"] = optimizer.defaults['lr']  # possible RuntimeError
                logs["epoch"] = epoch
                logs["step"] = global_step
                logging_loss = tr_loss
                log(logs)

            if global_step % valid_niter == 0:
                eval_loss = 0.0
                description = "Evaluation"
                sampler = torch.utils.data.sampler.SequentialSampler(
                    dev_data[:num_eval_samples])
                eval_dataloader = torch.utils.data.DataLoader(
                    dev_data[:num_eval_samples],
                    sampler=sampler,
                    batch_size=batch_size,
                    collate_fn=data_collator.collate_batch,
                )
                logger.info("***** Running %s *****", description)
                logger.info("  Num Examples = %d", num_eval_samples)
                logger.info("  Batch size = %d", batch_size)
                for inputs in tqdm(eval_dataloader, desc=description):
                    for k, v in inputs.items():
                        inputs[k] = v.to(device)
                    model.eval()
                    with torch.no_grad():
                        outputs = model(**inputs)
                        loss = outputs[0]
                        eval_loss += loss.item()
                print("\nEvaluation loss = %f" %
                      (eval_loss / num_eval_samples))

                if eval_loss / num_eval_samples * batch_size < min_eval_loss:
                    min_eval_loss = eval_loss / num_eval_samples * batch_size
                    # save model and optimizer
                    output_dir = os.path.join(
                        args['--save-to'] + '/validations/',
                        f"{checkpoint_prefix}-{global_step}")
                    os.makedirs(output_dir, exist_ok=True)
                    model.save_pretrained(output_dir)
                    output_dir = os.path.join(args['--save-to'] +
                                              '/validations/')
                    rotate_checkpoints(output_dir)
                    output_dir = os.path.join(
                        args['--save-to'] + '/validations/',
                        f"{checkpoint_prefix}-{global_step}")
                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, "optimizer.pt"))

            if global_step % int(args['--save-every']) == 0:
                output_dir = os.path.join(args['--save-to'],
                                          f"{checkpoint_prefix}-{global_step}")
                os.makedirs(output_dir, exist_ok=True)
                model.save_pretrained(output_dir)
                output_dir = os.path.join(args['--save-to'])
                rotate_checkpoints(output_dir)
                output_dir = os.path.join(args['--save-to'],
                                          f"{checkpoint_prefix}-{global_step}")
                torch.save(optimizer.state_dict(),
                           os.path.join(output_dir, "optimizer.pt"))

        epoch_iterator.close()
        epoch += 1

    logger.info(
        "\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n"
    )
t_dataset = t_dataset.map(d_map, remove_columns=["text"], batched=True)

# create a DataCollator for masked language modeling
t_DataCollator = DataCollatorForLanguageModeling(t_tokenizer,
                                                 mlm=True,
                                                 mlm_probability=0.3)

# create the ALBERT model configuration
albert_config = transformers.AlbertConfig(
    vocab_size=len(t_tokenizer),
    embedding_size=128,
    num_hidden_layers=2,
    num_attention_heads=4,
    hidden_size=256,
    intermediate_size=512,
    pad_token_id=t_tokenizer.pad_token_id,
    bos_token_id=t_tokenizer.bos_token_id,
    eos_token_id=t_tokenizer.eos_token_id,  # use the token id, not the token string
    sep_token_id=t_tokenizer.sep_token_id)

# create the ALBERT masked language model
albert_model = AutoModelForMaskedLM.from_config(albert_config)
# albert_model = AlbertForMaskedLM.from_pretrained("/home/hedan/tools/Github/NLP_Based_Transformer/model/checkpoint-5000")
# albert_model.resize_token_embeddings(len(t_tokenizer))

# configure the training arguments
train_args = transformers.TrainingArguments(output_dir="./model",
                                            do_train=True,
                                            logging_steps=50,