def prepare_config_and_inputs(self):
    input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

    token_type_ids = None
    if self.use_token_type_ids:
        token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

    sequence_labels = None
    token_labels = None
    choice_labels = None
    if self.use_labels:
        sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
        token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
        choice_labels = ids_tensor([self.batch_size], self.num_choices)

    config = OpenAIGPTConfig(
        vocab_size=self.vocab_size,
        n_embd=self.hidden_size,
        n_layer=self.num_hidden_layers,
        n_head=self.num_attention_heads,
        # intermediate_size=self.intermediate_size,
        # hidden_act=self.hidden_act,
        # hidden_dropout_prob=self.hidden_dropout_prob,
        # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
        n_positions=self.max_position_embeddings,
        n_ctx=self.max_position_embeddings,
        # type_vocab_size=self.type_vocab_size,
        # initializer_range=self.initializer_range
    )

    head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)

    return config, input_ids, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels
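This tester method assumes the ids_tensor helper from the transformers test utilities, which is not defined in the snippet. A minimal sketch with the same behaviour (a random integer tensor with values in [0, vocab_size)):

import torch

def ids_tensor(shape, vocab_size):
    # Random token ids of the requested shape, e.g. ids_tensor([2, 7], 99).
    return torch.randint(low=0, high=vocab_size, size=tuple(shape), dtype=torch.long)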
def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_folder_path):
    # Construct model
    if openai_config_file == "":
        config = OpenAIGPTConfig()
    else:
        config = OpenAIGPTConfig.from_json_file(openai_config_file)
    model = OpenAIGPTModel(config)

    # Load weights from numpy
    load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path)

    # Save pytorch-model
    pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME
    pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME
    print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
    torch.save(model.state_dict(), pytorch_weights_dump_path)
    print("Save configuration file to {}".format(pytorch_config_dump_path))
    with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
        f.write(config.to_json_string())
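A sketch of how the converter might be called directly; the paths here are hypothetical placeholders, not values from the original script:

if __name__ == "__main__":
    convert_openai_checkpoint_to_pytorch(
        openai_checkpoint_folder_path="./openai_checkpoint",  # hypothetical path
        openai_config_file="",                                # empty -> default OpenAIGPTConfig()
        pytorch_dump_folder_path="./pytorch_dump",            # hypothetical path
    )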
def test_TFOpenAIGPTModel(self):
    from transformers import OpenAIGPTConfig, TFOpenAIGPTModel
    keras.backend.clear_session()
    # pretrained_weights = 'openai-gpt'
    tokenizer_file = 'openai_openai-gpt.pickle'
    tokenizer = self._get_tokenzier(tokenizer_file)
    text, inputs, inputs_onnx = self._prepare_inputs(tokenizer)
    config = OpenAIGPTConfig()
    model = TFOpenAIGPTModel(config)
    predictions = model.predict(inputs)
    onnx_model = keras2onnx.convert_keras(model, model.name)
    self.assertTrue(run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx, predictions, self.model_files))
def test_TFOpenAIGPTDoubleHeadsModel(self):
    from transformers import OpenAIGPTConfig, TFOpenAIGPTDoubleHeadsModel
    keras.backend.clear_session()
    # pretrained_weights = 'openai-gpt'
    tokenizer_file = 'openai_openai-gpt.pickle'
    tokenizer = self._get_tokenzier(tokenizer_file)
    # tf.gather(hidden_states, cls_index, batch_dims=len(hidden_shape) - 2), batch_dims = 1 in this case
    text, inputs, inputs_onnx = self._prepare_inputs(tokenizer, batch_size=1)
    config = OpenAIGPTConfig()
    model = TFOpenAIGPTDoubleHeadsModel(config)
    predictions = model.predict(inputs)
    onnx_model = keras2onnx.convert_keras(model, model.name)
    self.assertTrue(
        run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx, predictions,
                         self.model_files, rtol=1.e-2, atol=1.e-4))
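Both tests lean on the harness helpers _get_tokenzier and _prepare_inputs (the misspelling is the helper's actual name in that test suite). Outside the harness, the inputs fed to model.predict can be assumed to be a dict of fixed-shape integer arrays, roughly:

import numpy as np

batch_size, seq_len = 2, 7  # illustrative sizes
# Hypothetical stand-in for _prepare_inputs: random ids below the default
# openai-gpt vocabulary size (40478).
inputs = {"input_ids": np.random.randint(0, 40478, size=(batch_size, seq_len), dtype=np.int32)}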
def __init__(self, config):
    super(ZHENG, self).__init__()
    self.config = config

    # from_pretrained is a classmethod; call it on the class instead of on a
    # throwaway instance as the original did.
    gpt_config = OpenAIGPTConfig.from_pretrained('openai-gpt')
    setattr(gpt_config, 'users', config.users)
    setattr(gpt_config, 'user_size', config.user_size)

    if config.pretrained:
        transformer = TransformerModule.from_pretrained('openai-gpt', config=gpt_config)
    else:
        transformer = TransformerModule(gpt_config)

    # for DataParallel
    model_to_resize = transformer.module if hasattr(transformer, "module") else transformer
    model_to_resize.resize_token_embeddings(config.vocab_size)
    self.transformer = model_to_resize

    self.linear = nn.Linear(config.embedding_size, config.vocab_size, bias=False)
    # tie weights
    self.linear.weight = self.transformer.tokens_embed.weight
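The last line ties the output projection to the input token embedding so both share one parameter matrix, halving those parameters and coupling their gradients. A standalone illustration with hypothetical sizes:

import torch.nn as nn

emb = nn.Embedding(100, 16)        # vocab_size=100, embedding_size=16
proj = nn.Linear(16, 100, bias=False)
proj.weight = emb.weight           # tie: both now hold the same Parameter
assert proj.weight is emb.weight   # an update to one is visible in the other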
import torch
import torch.nn as nn
# import a config from transformers
from transformers import Trainer, TrainingArguments
from transformers import TextDataset
# OpenAI GPT for text generation
from transformers import OpenAIGPTConfig, OpenAIGPTTokenizer, OpenAIGPTLMHeadModel
from transformers import DataCollatorForLanguageModeling
from process_data import *

# initialize a model from config
config = OpenAIGPTConfig(vocab_size=100000, n_positions=512, n_layer=6)
model = OpenAIGPTLMHeadModel(config)  # was a `False` placeholder in the original

# the pretrained tokenizer
tname = "Jojo_Tokenizer"
tokenizer = OpenAIGPTTokenizer.from_pretrained(tname)

# initialize a data collator (causal LM, so mlm=False)
# https://github.com/huggingface/transformers/blob/1af58c07064d8f4580909527a8f18de226b226ee/src/transformers/data/data_collator.py#L68
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)  # was a `False` placeholder

# initialize dataset - process_data
# https://github.com/huggingface/transformers/blob/1af58c07064d8f4580909527a8f18de226b226ee/src/transformers/data/datasets/language_modeling.py#L16
dataset = TextDataset(tokenizer=tokenizer, file_path="train.txt", block_size=512)  # was a `False` placeholder; "train.txt" is a hypothetical path

output = "output"
# initialize training arguments
training_args = TrainingArguments(
    output_dir="./" + output,
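The snippet breaks off inside the TrainingArguments call. A plausible completion of the wiring, assuming the script follows the standard Trainer recipe (all argument values below are illustrative, not the original code):

training_args = TrainingArguments(
    output_dir="./" + output,
    overwrite_output_dir=True,
    num_train_epochs=3,                # assumed value
    per_device_train_batch_size=8,     # assumed value
)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)
trainer.train()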
        Input = Input.long().to(self.device)
        Output = self.Trans(Input, attention_mask=attn_mask)
        logits = Output[0]
        labels = Input
        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()
        flatten_shift_loss_mask = loss_mask[..., :-1].contiguous().view(-1)
        ids = nonzero(flatten_shift_loss_mask).view(-1)
        fin_logits = shift_logits.view(-1, shift_logits.size(-1))[ids]
        fin_labels = shift_labels.view(-1)[ids]
        return fin_logits, fin_labels

    def decode(self, Input, Label, max_length):
        """Generate a continuation of the given input ids with the model's generate (search) method."""
        attn_mask = tensor(Label.clone().detach() == 1.0, dtype=uint8, device=self.device)
        Input = Input.long().to(self.device)
        Output = self.Trans.generate(Input, attention_mask=attn_mask, max_length=max_length)
        return Output

if __name__ == "__main__":
    Trans_Config = OpenAIGPTConfig(vocab_size=3002, n_layer=12)
    Trans_Model = OpenAIGPTLMHeadModel(Trans_Config)
    Token_Dir = r"G:\Work Related\Nlc2cmd\Tokenizer_Train\GPTToken/"
    Trans_Tok = GPT2TokenizerFast.from_pretrained(Token_Dir)
    Omni = OmniBash(Trans_Model, "cpu")
    Dataset = OmnibashDataset(r"G:\Work Related\Nlc2cmd\Data\Template.json", Trans_Tok, "train", 100)
    TrainLoader = DataLoader(Dataset, batch_size=10)
    Sample = next(iter(TrainLoader))
    # The original call omitted the required max_length argument; 100 is an assumed value.
    X = Omni.decode(Sample[0][0].unsqueeze(0), Sample[1][0].unsqueeze(0), max_length=100)
    print(X)
    Out = Trans_Tok.convert_ids_to_tokens(X[0])
    print(Out)
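The shift-by-one in the loss preparation is the standard causal-LM alignment: the logits at position t are the model's prediction for token t+1, so logits drop the last position and labels drop the first. A tiny worked example of the pairing:

import torch

logits = torch.randn(1, 4, 10)        # batch=1, seq_len=4, vocab=10
labels = torch.tensor([[5, 2, 7, 1]])
shift_logits = logits[..., :-1, :]    # predictions made at positions 0, 1, 2
shift_labels = labels[..., 1:]        # their targets: tensor([[2, 7, 1]])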
from transformers import XLNetLMHeadModel, XLNetTokenizer
from transformers import TransfoXLLMHeadModel, TransfoXLTokenizer
from transformers import CTRLLMHeadModel, CTRLTokenizer
from transformers import XLMWithLMHeadModel, XLMTokenizer

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)

MAX_LENGTH = int(10000)  # Hardcoded max length to avoid infinite loop

ALL_MODELS = sum((conf for conf in (GPT2Config.get_config_dict("gpt2"),
                                    OpenAIGPTConfig.get_config_dict("openai-gpt"))), ())

MODEL_CLASSES = {
    'gpt2': (GPT2LMHeadModel, GPT2Tokenizer),
    'ctrl': (CTRLLMHeadModel, CTRLTokenizer),
    'openai-gpt': (OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
    'xlnet': (XLNetLMHeadModel, XLNetTokenizer),
    'transfo-xl': (TransfoXLLMHeadModel, TransfoXLTokenizer),
    'xlm': (XLMWithLMHeadModel, XLMTokenizer),
}

# Padding text to help Transformer-XL and XLNet with short prompts as proposed by Aman Rusia
# in https://github.com/rusiaaman/XLNet-gen#methodology
# and https://medium.com/@amanrusia/xlnet-speaks-comparison-to-gpt-2-ea1a4e9ba39e
PADDING_TEXT = """In 1991, the remains of Russian Tsar Nicholas II and his family
(except for Alexei and Maria) are discovered.
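One caveat on the ALL_MODELS expression: PretrainedConfig.get_config_dict returns a (config_dict, unused_kwargs) pair, and sum(..., ()) concatenates tuples, so ALL_MODELS here becomes a flat tuple of dicts rather than the tuple of model names the upstream script built from pretrained_config_archive_map. The flattening trick itself:

parts = (("a", "b"), ("c",))
flat = sum(parts, ())  # ("a", "b", "c") -- tuple concatenation, not arithmetic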
    'batch_size': 64,
    'tenacity': 5,
    'epoch_size': 4
}

# Set up logger
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', default='openai-gpt', help='model name or path')
    args = parser.parse_args()

    config = OpenAIGPTConfig.from_pretrained(args.model)
    model = OpenAIGPTModel.from_pretrained(args.model, config=config)
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model)
    special_tokens_dict = {'pad_token': '<pad>'}
    tokenizer.add_special_tokens(special_tokens_dict=special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))

    params_senteval['model'] = model.cuda().eval()
    params_senteval['tokenizer'] = tokenizer

    se = senteval.engine.SE(params_senteval, batcher, prepare)
    transfer_tasks = [
        'STS12', 'STS13', 'STS14', 'STS15', 'STS16',
        'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC',
        'SICKEntailment',
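senteval.engine.SE expects user-defined prepare and batcher callbacks; the ones referenced above are not part of the excerpt. A minimal sketch of what they might look like for this setup, mean-pooling the model's last hidden states into sentence embeddings (an assumption, not the original code):

import numpy as np
import torch

def prepare(params, samples):
    return  # nothing to precompute in this sketch

def batcher(params, batch):
    model, tok = params['model'], params['tokenizer']
    embeddings = []
    with torch.no_grad():
        for tokens in batch:
            sentence = ' '.join(tokens) if tokens else '.'
            ids = torch.tensor([tok.encode(sentence)]).cuda()
            hidden = model(ids)[0]  # (1, seq_len, hidden_size)
            embeddings.append(hidden.mean(dim=1).squeeze(0).cpu().numpy())
    return np.vstack(embeddings)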
    TransfoXLTokenizer,
)

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)

MAX_LENGTH = int(10000)  # Hardcoded max length to avoid infinite loop

# ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig)), ())
OUTPUT = os.getenv('OUTPUT')
ALL_MODELS = sum((conf for conf in (GPT2Config.get_config_dict(OUTPUT),
                                    OpenAIGPTConfig.get_config_dict(OUTPUT))), ())

MODEL_CLASSES = {
    'gpt2': (GPT2LMHeadModel, GPT2Tokenizer),
    'openai-gpt': (OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
    'xlnet': (XLNetLMHeadModel, XLNetTokenizer),
    'transfo-xl': (TransfoXLLMHeadModel, TransfoXLTokenizer),
}

# Padding text to help Transformer-XL and XLNet with short prompts as proposed by Aman Rusia
# in https://github.com/rusiaaman/XLNet-gen#methodology
# and https://medium.com/@amanrusia/xlnet-speaks-comparison-to-gpt-2-ea1a4e9ba39e
PADDING_TEXT = """In 1991, the remains of Russian Tsar Nicholas II and his family
(except for Alexei and Maria) are discovered.
The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
remainder of the story. 1883 Western Siberia,
def train():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="",
                        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument("--model_checkpoint", type=str, default="openai-gpt",
                        help="Path, url or short name of the model")
    parser.add_argument("--num_candidates", type=int, default=2,
                        help="Number of candidates for training")
    parser.add_argument("--max_history", type=int, default=2,
                        help="Number of previous exchanges to keep in history")
    parser.add_argument("--train_batch_size", type=int, default=4,
                        help="Batch size for training")
    parser.add_argument("--valid_batch_size", type=int, default=4,
                        help="Batch size for validation")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=8,
                        help="Accumulate gradients on several steps")
    parser.add_argument("--lr", type=float, default=6.25e-5, help="Learning rate")
    parser.add_argument("--lm_coef", type=float, default=1.0, help="LM loss coefficient")
    parser.add_argument("--mc_coef", type=float, default=1.0, help="Multiple-choice loss coefficient")
    parser.add_argument("--max_norm", type=float, default=1.0, help="Clipping gradient norm")
    parser.add_argument("--n_epochs", type=int, default=3, help="Number of training epochs")
    parser.add_argument("--personality_permutations", type=int, default=1,
                        help="Number of permutations of personality sentences")
    parser.add_argument("--eval_before_start", action='store_true',
                        help="If true start with a first evaluation before training")
    parser.add_argument("--device", type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--fp16", type=str, default="",
                        help="Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="Local rank for distributed training (-1: not distributed)")
    args = parser.parse_args()

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process.
    # logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Running process %d", args.local_rank)  # a logger.warning: printed by all distributed processes
    logger.info("Arguments: %s", pformat(args))

    # Initialize distributed training if needed
    args.distributed = (args.local_rank != -1)
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')

    logger.info("Prepare tokenizer, pretrained model and optimizer.")
    # can't use AutoTokenizer because the checkpoint could be a Path
    tokenizer_class = GPT2Tokenizer if "gpt2" in args.model_checkpoint else OpenAIGPTTokenizer
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)

    config = OpenAIGPTConfig.from_pretrained(args.model_checkpoint, vocab_size=40483)
    model_class = GPT2DoubleHeadsModel if "gpt2" in args.model_checkpoint else OpenAIGPTDoubleHeadsModel
    # model = model_class.from_pretrained(args.model_checkpoint)
    model = model_class.from_pretrained(args.model_checkpoint, config=config)
    model.to(args.device)
    # Add special tokens if they are not already added
    add_special_tokens_(model, tokenizer)
    optimizer = AdamW(model.parameters(), lr=args.lr, correct_bias=True)

    # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
    if args.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16)
    if args.distributed:
        model = DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(args, tokenizer)

    # Training function and trainer
    def update(engine, batch):
        model.train()
        batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
        input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
        (lm_loss), (mc_loss), *_ = model(input_ids,
                                         token_type_ids=token_type_ids,
                                         mc_token_ids=mc_token_ids,
                                         mc_labels=mc_labels,
                                         lm_labels=lm_labels)
        loss = (lm_loss * args.lm_coef + mc_loss * args.mc_coef) / args.gradient_accumulation_steps
        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
            # logger.info(tokenizer.decode(input_ids[0, -1, :].tolist()))
            # if we don't send labels to the model, it doesn't return losses
            lm_logits, mc_logits, *_ = model(input_ids,
                                             token_type_ids=token_type_ids,
                                             mc_token_ids=mc_token_ids)
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return (lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, mc_labels)

    evaluator = Engine(inference)
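A note on the update function above: the loss is divided by gradient_accumulation_steps and optimizer.step() only fires once every gradient_accumulation_steps iterations, so with the default train_batch_size of 4 and 8 accumulation steps the effective batch size per process is 32, while the accumulated gradients match those of a single large batch.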
    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader))
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader))
    if args.eval_before_start:
        trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if args.distributed:
        trainer.add_event_handler(Events.EPOCH_STARTED,
                                  lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(Events.EPOCH_STARTED,
                                    lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {
        "nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-100),
                    output_transform=lambda x: (x[0][0], x[1][0])),
        "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))
    }
    metrics.update({
        "average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args),
        "average_accuracy": MetricsLambda(average_distributed_scalar, metrics["accuracy"], args)
    })
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model,
    # configuration and tokenizer before we start to train
    if args.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(Events.COMPLETED,
                                    lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))

        log_dir = make_logdir(args.model_checkpoint)
        tb_logger = TensorboardLogger(log_dir)
        tb_logger.attach(trainer,
                         log_handler=OutputHandler(tag="training", metric_names=["loss"]),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer,
                         log_handler=OptimizerParamsHandler(optimizer),
                         event_name=Events.ITERATION_STARTED)
        tb_logger.attach(evaluator,
                         log_handler=OutputHandler(tag="validation",
                                                   metric_names=list(metrics.keys()),
                                                   another_engine=trainer),
                         event_name=Events.EPOCH_COMPLETED)

        checkpoint_handler = ModelCheckpoint(log_dir, 'checkpoint', save_interval=1, n_saved=3)
        trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler,
                                  {'mymodel': getattr(model, 'module', model)})  # "getattr" takes care of distributed encapsulation

        torch.save(args, log_dir + '/model_training_args.bin')
        getattr(model, 'module', model).config.to_json_file(os.path.join(log_dir, CONFIG_NAME))
        tokenizer.save_pretrained(log_dir)

    # Run the training
    trainer.run(train_loader, max_epochs=args.n_epochs)

    # On the main process: close tensorboard logger and rename the last checkpoint
    # (for easy re-loading with OpenAIGPTModel.from_pretrained method)
    if args.local_rank in [-1, 0] and args.n_epochs > 0:
        os.rename(os.path.join(log_dir, checkpoint_handler._saved[-1][1]),
                  os.path.join(log_dir, WEIGHTS_NAME))  # TODO: PR in ignite to have better access to saved file paths (cleaner)
        tb_logger.close()
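Hypothetical invocations built from the flags the parser defines (not taken from the project's documentation):

# Single GPU or CPU:
#   python train.py --model_checkpoint openai-gpt --n_epochs 3
# Distributed, one process per GPU (torch.distributed.launch supplies --local_rank):
#   python -m torch.distributed.launch --nproc_per_node=2 train.py --fp16 O1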