def __init__(
    self,
    config,
    class_labels,
    pretrained_model_path,
    dropout=0.1,
    freeze_pretrained_part=True,
    reinitialize=False,
    n_layers=6,
):
    super().__init__(config, class_labels)

    if reinitialize:
        logger.info('resetting model weights')
        config = GPT2Config.from_json_file(pretrained_model_path + '/config.json')
        config = config.to_dict()
        config['n_layer'] = n_layers
        config = GPT2Config.from_dict(config)
        self.gpt2 = GPT2Model(config)
    else:
        self.gpt2 = GPT2Model.from_pretrained(pretrained_model_path)

    self.dropout = torch.nn.Dropout(dropout)
    self.fc = torch.nn.Linear(self.gpt2.config.n_embd, self.output_dim)

    if freeze_pretrained_part:
        for param in self.gpt2.parameters():
            param.requires_grad = False
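# A standalone sketch of the same freeze-the-backbone pattern on a bare
# GPT2Model (assumes a downloadable or locally cached 'gpt2' checkpoint;
# the head size of 2 is illustrative, not taken from the class above):
import torch
from transformers import GPT2Model

gpt2 = GPT2Model.from_pretrained('gpt2')
for param in gpt2.parameters():
    param.requires_grad = False  # freeze the pretrained trunk
head = torch.nn.Linear(gpt2.config.n_embd, 2)  # small trainable head
assert not any(p.requires_grad for p in gpt2.parameters())
assert all(p.requires_grad for p in head.parameters())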
def load_model(target_folder, config):
    # Parse parameters
    model_size = config.get('model', 'model_size')
    no_cuda = config.getboolean('model', 'no_cuda')

    logger.info("Loading the model...")
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not no_cuda else "cpu")

    # Tokenizer
    tokenizer = GPT2Tokenizer(os.path.join(target_folder, 'vocab.json'),
                              os.path.join(target_folder, 'merges.txt'))

    # Config
    config = GPT2Config.from_json_file(
        os.path.join(target_folder, 'config.json'))

    # Weights
    state_dict_path = glob(os.path.join(target_folder, '*.pkl'))[0]
    state_dict = torch.load(state_dict_path, map_location=device)
    if model_size == 'small':
        for key in list(state_dict.keys()):
            state_dict[key.replace('module.', '')] = state_dict.pop(key)
    state_dict['lm_head.weight'] = state_dict['lm_head.decoder.weight']
    state_dict.pop("lm_head.decoder.weight", None)

    # Model
    model = GPT2LMHeadModel(config)
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()

    return model, tokenizer
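# Hypothetical invocation of load_model, assuming a configparser-style config
# with a [model] section and a folder holding vocab.json, merges.txt,
# config.json, and a single *.pkl state dict (the path is a placeholder):
import configparser

cfg = configparser.ConfigParser()
cfg.read_string("[model]\nmodel_size = small\nno_cuda = true\n")
model, tokenizer = load_model('path/to/model_folder', cfg)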
def __init__(
    self,
    batch_size,
    epochs,
    t_total=100000,
    config_path="config/model_config.json",
    data_path="data/train.json",
    valid_examples=100,
    vocab_path="vocab/vocab.txt",
    max_length=1024,
    warm_up_steps=0,
    lr=1e-4,
):
    super(Net, self).__init__()
    self.batch_size = batch_size
    self.epochs = epochs
    self.t_total = t_total
    self.warm_up_steps = warm_up_steps
    self.lr = lr
    self.model_name = "bert_pretrained_model"
    self.config = GPT2Config.from_json_file(config_path)
    self.model = GPT2LMHeadModel(config=self.config)
    self.data = [json.loads(line.strip()) for line in open(data_path)]
    self.dataset_train = DS(self.data[:-valid_examples],
                            vocab_path=vocab_path,
                            max_length=max_length)
    self.dataset_valid = DS(self.data[-valid_examples:],
                            vocab_path=vocab_path,
                            max_length=max_length)
def build_model(self):
    """Build the GPT-2 generation model."""
    # Use a BERT tokenizer for the vocabulary
    self.tokenizer = BertTokenizer(vocab_file=self.args.vocab_path)
    # Size of the tokenizer's vocabulary
    self.vocab_size = len(self.tokenizer)
    self.pad_id = self.tokenizer.convert_tokens_to_ids(PAD)
    if self.args.pretrained_model:
        # A pretrained GPT-2 model was specified
        model = GPT2LMHeadModel.from_pretrained(self.args.pretrained_model)
    else:
        # No pretrained model given: initialize from the JSON config
        model_config = GPT2Config.from_json_file(self.args.model_config)
        # Map the special-token ids to 0
        if model_config.eos_token_id != 0:
            model_config.eos_token_id = 0
        if model_config.bos_token_id != 0:
            model_config.bos_token_id = 0
        if model_config.pad_token_id != 0:
            model_config.pad_token_id = 0
        model = GPT2LMHeadModel(config=model_config)
    # Resize the GPT-2 embedding matrix to match the tokenizer's vocabulary
    model.resize_token_embeddings(self.vocab_size)
    print('model config:\n{}'.format(model.config.to_json_string()))
    return model, model.config.to_dict().get("n_ctx")
def __init__(self):
    super().__init__()
    self.tokenizer = BertTokenizer(vocab_file=FLAGS.vocab_path)
    self.config = GPT2Config.from_json_file(FLAGS.model_config)
    self.model = GPT2LMHeadModel(config=self.config)
def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, full,
                                       gpt2_config_file,
                                       pytorch_dump_folder_path):
    # Imports live here so users see usage info before a missing module errors out
    from io import open
    from shutil import copyfile
    import logging
    logging.basicConfig(level=logging.INFO)
    from pathlib import Path
    import torch
    from transformers import (
        CONFIG_NAME,
        WEIGHTS_NAME,
        GPT2Config,
        GPT2Model,
        load_tf_weights_in_gpt2,
    )

    gpt2_checkpoint_path = Path(gpt2_checkpoint_path)
    print(gpt2_checkpoint_path.name)

    if pytorch_dump_folder_path == '':
        prefix = '32BIT-' if full else '16BIT-'
        pytorch_dump_folder_path = 'pytorch-' + prefix + gpt2_checkpoint_path.name
    pytorch_dump_folder_path = Path(pytorch_dump_folder_path)
    pytorch_dump_folder_path.mkdir(exist_ok=True)

    # Construct model
    if gpt2_config_file == "":
        # A default GPT2Config() doesn't seem to work here; use the
        # hparams.json file included in the checkpoint directory instead.
        gpt2_config_file = gpt2_checkpoint_path / 'hparams.json'
    config = GPT2Config.from_json_file(gpt2_config_file)
    model = GPT2Model(config)

    # Load weights from the TF checkpoint
    load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path)
    if not full:
        model.half()

    # Save the PyTorch model
    pytorch_weights_dump_path = pytorch_dump_folder_path / WEIGHTS_NAME
    pytorch_config_dump_path = pytorch_dump_folder_path / CONFIG_NAME
    print("Save PyTorch model to {}".format(str(pytorch_weights_dump_path)))
    torch.save(model.state_dict(), pytorch_weights_dump_path)
    print("Save configuration file to: " + str(pytorch_config_dump_path))
    with pytorch_config_dump_path.open("w", encoding="utf-8") as f:
        f.write(config.to_json_string())
    copyfile(gpt2_checkpoint_path / 'vocab.bpe',
             pytorch_dump_folder_path / 'merges.txt')
    copyfile(gpt2_checkpoint_path / 'encoder.json',
             pytorch_dump_folder_path / 'vocab.json')
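# Hypothetical invocation, assuming the standard OpenAI GPT-2 release layout
# (hparams.json, encoder.json, and vocab.bpe alongside the TF checkpoint in,
# e.g., models/124M):
convert_gpt2_checkpoint_to_pytorch(
    gpt2_checkpoint_path="models/124M",
    full=False,                    # save fp16 weights
    gpt2_config_file="",           # fall back to models/124M/hparams.json
    pytorch_dump_folder_path="",   # auto-named pytorch-16BIT-124M
)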
def create_model(pre_trained=False):
    if pre_trained:
        model = GPT2LMHeadModel.from_pretrained(config.MODEL_PATH)
    else:
        model_config = GPT2Config.from_json_file(config.CONFIG_JSON_FILE)
        model = GPT2LMHeadModel(config=model_config)
        # model.resize_token_embeddings(vocab_size)
    n_ctx = model.config.to_dict().get("n_ctx")
    return model, n_ctx
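# Note: recent transformers releases drop `n_ctx` from GPT2Config, so the
# .get("n_ctx") lookup above can return None. A defensive variant (an
# assumption about newer library versions, not this repo's code) falls back
# to n_positions:
model, n_ctx = create_model(pre_trained=False)
n_ctx = n_ctx or model.config.n_positions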
def main():
    # Parse training arguments
    args = set_args()
    # Configure CUDA
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    # Pick the device used for training
    device = torch.device(
        "cuda" if torch.cuda.is_available() and int(args.device) >= 0 else "cpu"
    )
    # Set random seeds for reproducibility
    if args.seed:
        torch.manual_seed(args.seed)
        random.seed(args.seed)
        np.random.seed(args.seed)
    # Load the model config
    model_config = GPT2Config.from_json_file(args.config_path)
    if args.pretrained_model_path:
        model = GPT2LMHeadModel.from_pretrained(args.pretrained_model_path)
    else:
        # No pretrained model specified: initialize from the config
        model = GPT2LMHeadModel(config=model_config)
    # Instantiate the tokenizer
    tokenizer = BertTokenizer.from_pretrained(args.vocab_path, do_lower_case=True)
    # Treat [Space] as a single token. For example, for "我爱[Space]中国。" the
    # original tokenizer yields "['我', '爱', '[', 'Space', ']', '中', '国', '。']";
    # after adding the token it yields "['我', '爱', '[Space]', '中', '国', '。']".
    tokenizer.add_tokens("[Space]", special_tokens=True)
    # Create the model output directory
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)
    # Load the training and test data
    train_data = GPT2NewsTitleDataSet(
        tokenizer,
        args.max_len,
        args.title_max_len,
        args.data_dir,
        "train",
        args.train_file_path,
    )
    test_data = GPT2NewsTitleDataSet(
        tokenizer,
        args.max_len,
        args.title_max_len,
        args.data_dir,
        "test",
        args.test_file_path,
    )
    # Start training
    train(model, device, train_data, test_data, args)
def load_pretrained_model(args):
    if args.pretrained_model:
        logger.info(f'loading pretrained model from {args.pretrained_model}')
        model = GPT2LMHeadModel.from_pretrained(args.pretrained_model)
    else:
        logger.info('init pretrained model...')
        config = GPT2Config.from_json_file(args.model_config)
        model = GPT2LMHeadModel(config)
    return model, model.config.to_dict().get("n_ctx")
def main():
    # Config
    config = InferenceConfig()
    gpt_config = GPT2Config.from_json_file(config.model_config_path)

    # torch related
    torch.set_grad_enabled(False)
    torch.manual_seed(config.random_seed)

    # Logger
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)
    handler = StreamHandler(sys.stdout)
    handler.setFormatter(logging.Formatter("[%(asctime)s] %(message)s"))
    logger.addHandler(handler)

    # Text Utils
    logging.info("loading Tokenizer...")
    tokenizer = GPT2Tokenizer(config.tokenizer_vocab_path,
                              config.tokenizer_merge_path)

    # Forward Model
    logging.info("loading Forward Model...")
    forward_model = GPT2LMHeadModel(gpt_config)
    forward_model.load_state_dict(
        load_model_weight(gpt_config, config.forward_model_path))

    # Backward Model
    logging.info("loading Backward Model...")
    backward_model = GPT2LMHeadModel(gpt_config)
    backward_model.load_state_dict(
        load_model_weight(gpt_config, config.backward_model_path))

    # Example
    example_contexts = [
        "<|endoftext|>".join(["How are you doing?"]),
        "<|endoftext|>".join(["Does money buy happiness?"]),
        "<|endoftext|>".join([
            "Does money buy happiness?",
            "Depends how much money you spend on it .",
        ]),
        "<|endoftext|>".join([
            "Does money buy happiness?",
            "Depends how much money you spend on it .",
            "What is the best way to buy happiness ?",
        ]),
    ]

    inferencer = Inferencer(config, tokenizer, forward_model, backward_model)
    results = inferencer.run(example_contexts)
    for context, replies in zip(example_contexts, results):
        logging.info(f"Example Context:{context}")
        for i, reply in enumerate(replies):
            logging.info(f"Output Utterance Top-{i+1}: {reply}")
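# load_model_weight is defined elsewhere in this project; a plausible sketch
# (an assumption, not the real implementation) that loads a saved state dict
# on CPU and strips DataParallel prefixes:
def load_model_weight(gpt_config, model_path):
    state_dict = torch.load(model_path, map_location="cpu")
    # drop the "module." prefix left behind by torch.nn.DataParallel
    return {k.replace("module.", "", 1): v for k, v in state_dict.items()}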
def __init__(self):
    super(ConditionalGenerationModel, self).__init__()
    if args.base_model.endswith(".json"):
        # Build from a JSON config file
        model_config = GPT2Config.from_json_file(args.base_model)
        self.base_model = GPT2LMHeadModel(config=model_config)
    else:
        # Load a pretrained model
        self.base_model = GPT2LMHeadModel.from_pretrained(args.base_model)
    self.base_model.resize_token_embeddings(len(args.tokenizer))
    self.config = self.base_model.config
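# Why the resize_token_embeddings call matters: once tokens are added to the
# tokenizer, the embedding matrix must grow to match. A standalone sketch
# (the model name and special token are illustrative):
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({"additional_special_tokens": ["[Speaker1]"]})
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))
assert model.get_input_embeddings().weight.shape[0] == len(tokenizer)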
def main():
    # Initialize arguments
    args = set_args()
    # Select which GPUs to train on
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    # Use the GPU only when requested and available
    args.cuda = torch.cuda.is_available() and not args.no_cuda
    device = 'cuda:0' if args.cuda else 'cpu'
    args.device = device
    logger.info('using device:{}'.format(device))

    # Initialize the tokenizer
    tokenizer = BertTokenizerFast.from_pretrained(args.pretrained_model)
    args.sep_id = tokenizer.sep_token_id
    args.pad_id = tokenizer.pad_token_id
    args.cls_id = tokenizer.cls_token_id

    # Create the model output directory
    if not os.path.exists(args.save_model_path):
        os.mkdir(args.save_model_path)

    # Create the model
    if args.pretrained_model:
        # Load a pretrained model
        model = GPT2LMHeadModel.from_pretrained(args.pretrained_model)
    else:
        # Initialize from the config
        model_config = GPT2Config.from_json_file(args.model_config)
        model = GPT2LMHeadModel(config=model_config)
    model = model.to(device)
    logger.info('model config:\n{}'.format(model.config.to_json_string()))
    assert model.config.vocab_size == tokenizer.vocab_size

    # Train on multiple GPUs when available
    if args.cuda and torch.cuda.device_count() > 1:
        model = DataParallel(model).cuda()
        logger.info("use GPU {} to train".format(args.device))

    # Count model parameters
    num_parameters = 0
    for parameter in model.parameters():
        num_parameters += parameter.numel()
    logger.info('number of model parameters: {}'.format(num_parameters))

    # Log the argument settings
    logger.info("args:{}".format(args))

    # ========= Loading Dataset ========= #
    train_dataset, validate_dataset = load_dataset(args)

    train(tokenizer, model, train_dataset, validate_dataset, args)
def __init__(self, args, pretrained, model_checkpoint, report_every,
             ren, norm_fn, device, logdir=None):
    self.args = args
    self._ren = ren
    self._device = device

    self.tokenizer = GPT2Tokenizer.from_pretrained(
        model_checkpoint, do_lower_case=True)
    self.pad_id = self.tokenizer.eos_token_id
    self.use_segments = True
    self._config = GPT2Config.from_json_file(
        os.path.join(model_checkpoint, CONFIG_NAME))
    self._max_len = 256  # 512  # self._config.n_ctx
    self._model = GPT2LMHeadModel.from_pretrained(
        model_checkpoint).to(device) if pretrained else GPT2LMHeadModel(
            self._config).to(device)

    num_param, _, __ = _tally_parameters(self._model)
    logger.info("model parameters: {}".format(num_param))

    if not os.path.exists("checkpoints"):
        os.mkdir("checkpoints")
    self.save_dir = os.path.join("checkpoints", args.save_dir)
    if not os.path.exists(self.save_dir):
        os.mkdir(self.save_dir)
    elif args.infer_from == "":
        if SYS != "Windows":
            raise Exception("path exists {}".format(self.save_dir))

    self._optimizer = None
    self.writer = SummaryWriter(logdir=logdir)
    self.report_every = report_every
    self.batch_step = 0
    self.training_step = 1
    self.gradient_accumulation_steps = args.gradient_accumulation_steps
    self.max_val_step = args.max_val_step

    self._dataset = {}
    self._data_loader = {}

    self._weights = None
    self._w_decay = None
    if norm_fn == 'linear':
        self._norm_fn = _linear_normalize
    elif norm_fn == 'softmax':
        self._norm_fn = _softmax_normalize
    if ren:
        assert norm_fn == 'linear'
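# _tally_parameters is defined elsewhere in this repo; a plausible sketch
# (an assumption, following the OpenNMT convention of returning total,
# encoder, and decoder parameter counts):
def _tally_parameters(model):
    n_params = sum(p.nelement() for p in model.parameters())
    enc = sum(p.nelement() for n, p in model.named_parameters() if "encoder" in n)
    dec = sum(p.nelement() for n, p in model.named_parameters() if "decoder" in n)
    return n_params, enc, dec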
def build_model(args):
    if args.pretrained_path == '':
        config = GPT2Config.from_json_file(args.model_config)
        model = GPT2LMHeadModel(config)
        tokenizer = BertTokenizerFast(args.vocab)
        # XXX: must add this, or special tokens inside a string can't be
        # tokenized to a single unit
        tokenizer.sanitize_special_tokens()
        info = None
    else:
        config = GPT2Config.from_pretrained(args.pretrained_path)
        model, info = GPT2LMHeadModel.from_pretrained(args.pretrained_path,
                                                      config=config,
                                                      output_loading_info=True)
        tokenizer = BertTokenizerFast.from_pretrained(args.pretrained_path)
    return model, tokenizer, info
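# The `info` dict produced by output_loading_info=True reports which weights
# were actually matched; a quick post-load sanity check (key names per the
# transformers documentation):
model, tokenizer, info = build_model(args)
if info is not None:
    print("missing keys:", info["missing_keys"])
    print("unexpected keys:", info["unexpected_keys"])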
def __init__(self):
    super().__init__()
    self.config = GPT2Config.from_json_file(FLAGS.model_config)
    self.tokenizer = BertTokenizer(vocab_file=FLAGS.vocab_path)
    if FLAGS.train_mmi and FLAGS.use_pretrain:
        self.model = GPT2LMHeadModel.from_pretrained(FLAGS.mmi_model_path,
                                                     config=self.config)
    elif not FLAGS.train_mmi and FLAGS.use_pretrain:
        self.model = GPT2LMHeadModel.from_pretrained(
            FLAGS.dialogue_model_path, config=self.config)
    else:
        self.model = GPT2LMHeadModel(config=self.config)
    self.n_ctx = self.model.config.to_dict().get("n_ctx")
    self.pad_id = self.tokenizer.convert_tokens_to_ids("[PAD]")
def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file,
                                       pytorch_dump_folder_path):
    # Construct model
    if gpt2_config_file == "":
        config = GPT2Config()
    else:
        config = GPT2Config.from_json_file(gpt2_config_file)
    model = GPT2Model(config)

    # Load weights from the TF checkpoint
    load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path)

    # Save pytorch-model
    pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME
    pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME
    print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
    torch.save(model.state_dict(), pytorch_weights_dump_path)
    print("Save configuration file to {}".format(pytorch_config_dump_path))
    with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
        f.write(config.to_json_string())
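# Typical CLI wiring for the converter above (the flag names mirror the
# transformers conversion script):
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--gpt2_checkpoint_path", type=str, required=True)
    parser.add_argument("--gpt2_config_file", type=str, default="")
    parser.add_argument("--pytorch_dump_folder_path", type=str, required=True)
    args = parser.parse_args()
    convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path,
                                       args.gpt2_config_file,
                                       args.pytorch_dump_folder_path)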
def run_pplm_example(
    pretrained_model="gpt2-medium",  # pretrained LM
    cond_text="",                    # conditioning text to start from
    uncond=False,
    num_samples=1,                   # number of samples to generate
    bag_of_words=None,               # bag-of-words topic lexicon to use
    discrim=None,                    # whether to use a discriminator attribute model
    discrim_weights=None,
    discrim_meta=None,
    class_label=-1,
    length=100,                      # generation length
    stepsize=0.02,
    temperature=1.0,
    top_k=10,
    sample=False,                    # whether to sample
    num_iterations=3,
    grad_length=10000,
    horizon_length=1,
    window_length=0,
    decay=False,
    gamma=1.5,
    gm_scale=0.9,
    kl_scale=0.01,
    seed=0,
    no_cuda=False,
    colorama=False,                  # colored highlighting of the output
    repetition_penalty=1.0,          # penalty for repeated words
):
    # set random seed
    torch.manual_seed(seed)
    np.random.seed(seed)

    # set the device
    device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu"

    # load the discriminator model's parameters
    if discrim == "generic":
        set_generic_model_params(discrim_weights, discrim_meta)

    if discrim is not None:
        pretrained_model = DISCRIMINATOR_MODELS_PARAMS[discrim]["pretrained_model"]
        print("discrim = {}, pretrained_model set to discriminator's = {}".format(
            discrim, pretrained_model))

    config = GPT2Config.from_json_file('./gpt2sanwen/config.json')
def load_model(target_folder_name, config):
    # Parse parameters
    data_folder = config.get('model', 'data_folder')
    model_size = config.get('model', 'model_size')
    no_cuda = config.getboolean('model', 'no_cuda')

    logger.info(f"Loading model from {target_folder_name}...")
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not no_cuda else "cpu")

    # Tokenizer
    target_folder = os.path.join(data_folder, target_folder_name)
    tokenizer = GPT2Tokenizer(os.path.join(target_folder, 'vocab.json'),
                              os.path.join(target_folder, 'merges.txt'))

    # Config
    config = GPT2Config.from_json_file(
        os.path.join(target_folder, 'config.json'))

    # Weights
    torch.cuda.set_device(0)
    state_dict_path = glob(os.path.join(target_folder, '*.pkl'))[0]
    state_dict = torch.load(state_dict_path, map_location=device)
    if model_size == 'small':
        for key in list(state_dict.keys()):
            state_dict[key.replace('module.', '')] = state_dict.pop(key)
    state_dict['lm_head.weight'] = state_dict['lm_head.decoder.weight']
    state_dict.pop("lm_head.decoder.weight", None)

    # Model
    model = GPT2LMHeadModel(config)
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()

    return model, tokenizer
def create_model(args, vocab_size):
    """
    :param args:
    :param vocab_size: vocabulary size
    :return:
    """
    if args.pretrained_model:
        # A pretrained GPT-2 model was specified (none by default)
        model = GPT2LMHeadModel.from_pretrained(args.pretrained_model)
    elif os.path.exists(args.dialogue_model_output_path):
        # A previous training run exists: continue training from it
        model = GPT2LMHeadModel.from_pretrained(
            os.path.join(args.dialogue_model_output_path, "model_epoch10/"))
    else:
        # No pretrained model specified: initialize from the JSON config
        model_config = GPT2Config.from_json_file(args.model_config)
        model = GPT2LMHeadModel(config=model_config)
    # Resize the GPT-2 embedding matrix to match the tokenizer's vocabulary
    model.resize_token_embeddings(vocab_size)
    logger.info('model config:\n{}'.format(model.config.to_json_string()))
    return model, model.config.to_dict().get("n_ctx")
def main():
    # Parse training arguments
    args = set_args()
    # Set random seeds so runs are reproducible
    if args.seed:
        torch.manual_seed(args.seed)
        random.seed(args.seed)
        np.random.seed(args.seed)
    # Load the model config
    model_config = GPT2Config.from_json_file(args.config_path)
    # Instantiate GPT2LMHeadModel; without a pretrained path we train from scratch
    if args.pretrained_model_path:
        model = GPT2LMHeadModel.from_pretrained(args.pretrained_model_path)
    else:
        # No pretrained model specified: initialize from the config
        model = GPT2LMHeadModel(config=model_config)
    tokenizer = BertTokenizer.from_pretrained(args.vocab_path, do_lower_case=True)
    # Treat [Space] as a single token. For example, for "我爱[Space]中国。" the
    # original tokenizer yields "['我', '爱', '[', 'Space', ']', '中', '国', '。']";
    # after adding the token it yields "['我', '爱', '[Space]', '中', '国', '。']".
    tokenizer.add_tokens("[Space]", special_tokens=True)
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)
    # Load the training and test data
    train_data = GPT2NewsTitleDataSet(tokenizer, args.max_len,
                                      args.title_max_len, args.data_dir,
                                      "train", args.train_file_path)
    test_data = GPT2NewsTitleDataSet(tokenizer, args.max_len,
                                     args.title_max_len, args.data_dir,
                                     "test", args.test_file_path)
    # Start training
    train(model, train_data, test_data, args)
def create_model(args, vocab_size):
    """
    :param args:
    :param vocab_size: vocabulary size
    :return:
    """
    print('configuring model parameters')
    print(vocab_size)
    print('creating model')
    if args.pretrained_model:
        # A pretrained GPT-2 model was specified
        model = TFGPT2LMHeadModel.from_pretrained(args.pretrained_model)
    else:
        # No pretrained model specified: initialize from the JSON config
        print('initializing model')
        model_config = GPT2Config.from_json_file(args.model_config)
        print('config:\n' + model_config.to_json_string())
        model = TFGPT2LMHeadModel(config=model_config)
        print('model built')
    # Resize the GPT-2 embedding matrix to match the tokenizer's vocabulary
    # model.resize_token_embeddings(vocab_size)
    return model, model.config.to_dict().get("n_ctx")
import torch
import torch.nn.functional as F
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config
from config import device_f, device_r, num_samples, MMI_temperature, top_k

torch.set_grad_enabled(False)

tokenizer = GPT2Tokenizer('medium/vocab.json', 'medium/merges.txt')

weights = torch.load('medium/medium_ft.pkl')
# fix the misused key name
weights["lm_head.weight"] = weights["lm_head.decoder.weight"]
weights.pop("lm_head.decoder.weight", None)

cfg = GPT2Config.from_json_file('medium/config.json')
model: GPT2LMHeadModel = GPT2LMHeadModel(cfg)
model.load_state_dict(weights)
if device_f == 'cuda':
    model.half()
model.to(device_f)
model.eval()

weights = torch.load('medium/small_reverse.pkl')
# fix the misused key name
weights["lm_head.weight"] = weights["lm_head.decoder.weight"]
weights.pop("lm_head.decoder.weight", None)

reverse_model: GPT2LMHeadModel = GPT2LMHeadModel(cfg)
reverse_model.load_state_dict(weights)
if device_r == 'cuda':
    reverse_model.half()
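# Sketch of the MMI re-ranking these two models enable (a standard DialoGPT
# recipe, not code from this file): generate candidates with `model`, then
# score each candidate by the reverse model's loss on reply -> context and
# keep the lowest-loss one.
def reverse_score(reply_ids: torch.Tensor, context_ids: torch.Tensor) -> float:
    # lower loss means the reply better "explains" the context
    input_ids = torch.cat([reply_ids, context_ids], dim=-1).unsqueeze(0).to(device_r)
    loss = reverse_model(input_ids, labels=input_ids)[0]
    return loss.item()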
def main():
    # Create the argument parser.
    parser = argparse.ArgumentParser()
    parser.add_argument("--print-checkpoint-structure", action="store_true")
    parser.add_argument(
        "path_to_checkpoint",
        type=str,
        help="Path to the checkpoint file (.zip archive or direct .pt file)",
    )
    parser.add_argument(
        "--config_file",
        default="",
        type=str,
        help="An optional config json file describing the pre-trained model.",
    )
    args = parser.parse_args()

    # Extract the basename.
    basename = os.path.dirname(args.path_to_checkpoint)

    # Load the model.
    # The .zip is very optional; keep it for backward compatibility.
    print(f"Extracting PyTorch state dictionary from {args.path_to_checkpoint}")
    if args.path_to_checkpoint.endswith(".zip"):
        with zipfile.ZipFile(args.path_to_checkpoint, "r") as checkpoint:
            with checkpoint.open("release/mp_rank_00/model_optim_rng.pt") as pytorch_dict:
                input_state_dict = torch.load(pytorch_dict, map_location="cpu")
    else:
        input_state_dict = torch.load(args.path_to_checkpoint, map_location="cpu")

    ds_args = input_state_dict.get("args", None)

    # Read the config, or default to the model released by NVIDIA.
    if args.config_file == "":
        if ds_args is not None:
            if ds_args.bias_gelu_fusion:
                activation_function = "gelu_fast"
            elif ds_args.openai_gelu:
                activation_function = "gelu_new"
            else:
                activation_function = "gelu"
        else:
            # in the very early days this used to be "gelu_new"
            activation_function = "gelu_new"

        # Spell out all parameters in case the defaults change.
        config = GPT2Config(
            vocab_size=50257,
            n_positions=1024,
            n_embd=1024,
            n_layer=24,
            n_head=16,
            n_inner=4096,
            activation_function=activation_function,
            resid_pdrop=0.1,
            embd_pdrop=0.1,
            attn_pdrop=0.1,
            layer_norm_epsilon=1e-5,
            initializer_range=0.02,
            summary_type="cls_index",
            summary_use_proj=True,
            summary_activation=None,
            summary_proj_to_labels=True,
            summary_first_dropout=0.1,
            scale_attn_weights=True,
            use_cache=True,
            bos_token_id=50256,
            eos_token_id=50256,
        )
    else:
        config = GPT2Config.from_json_file(args.config_file)
    config.architectures = ["GPT2LMHeadModel"]

    # Convert.
    print("Converting")
    output_state_dict = convert_megatron_checkpoint(args, input_state_dict, config)

    # Print the structure of the converted state dict.
    if args.print_checkpoint_structure:
        recursive_print(None, output_state_dict)

    # Add tokenizer class info to the config;
    # see https://github.com/huggingface/transformers/issues/13906
    if ds_args is not None:
        tokenizer_type = ds_args.tokenizer_type
        if tokenizer_type == "GPT2BPETokenizer":
            tokenizer_model_name = "gpt2"
        elif tokenizer_type == "PretrainedFromHF":
            tokenizer_model_name = ds_args.tokenizer_name_or_path
        else:
            raise ValueError(f"Unrecognized tokenizer_type {tokenizer_type}")
    else:
        tokenizer_model_name = "gpt2"

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_model_name)
    tokenizer_class = type(tokenizer).__name__
    config.tokenizer_class = tokenizer_class

    # Store the config to file.
    print("Saving config")
    config.save_pretrained(basename)

    # Save the tokenizer based on the args.
    print(f"Adding {tokenizer_class} tokenizer files")
    tokenizer.save_pretrained(basename)

    # Store the state_dict to file.
    output_checkpoint_file = os.path.join(basename, "pytorch_model.bin")
    print(f'Saving checkpoint to "{output_checkpoint_file}"')
    torch.save(output_state_dict, output_checkpoint_file)
def main():
    # Create the argument parser.
    parser = argparse.ArgumentParser()
    parser.add_argument("--print-checkpoint-structure", action="store_true")
    parser.add_argument(
        "path_to_checkpoint",
        type=str,
        help="Path to the ZIP file containing the checkpoint",
    )
    parser.add_argument(
        "--config_file",
        default="",
        type=str,
        help="An optional config json file describing the pre-trained model.",
    )
    args = parser.parse_args()

    # Extract the basename.
    basename = os.path.dirname(args.path_to_checkpoint)

    # Load the model.
    print(f"Extracting PyTorch state dictionary from {args.path_to_checkpoint}")
    with zipfile.ZipFile(args.path_to_checkpoint, "r") as checkpoint:
        with checkpoint.open("release/mp_rank_00/model_optim_rng.pt") as pytorch_dict:
            input_state_dict = torch.load(pytorch_dict, map_location="cpu")

    # Read the config, or default to the model released by NVIDIA.
    if args.config_file == "":
        # Spell out all parameters in case the defaults change.
        config = GPT2Config(
            vocab_size=50257,
            n_positions=1024,
            n_ctx=1024,
            n_embd=1024,
            n_layer=24,
            n_head=16,
            n_inner=4096,
            activation_function="gelu_new",
            resid_pdrop=0.1,
            embd_pdrop=0.1,
            attn_pdrop=0.1,
            layer_norm_epsilon=1e-5,
            initializer_range=0.02,
            summary_type="cls_index",
            summary_use_proj=True,
            summary_activation=None,
            summary_proj_to_labels=True,
            summary_first_dropout=0.1,
            scale_attn_weights=True,
            gradient_checkpointing=False,
            use_cache=True,
            bos_token_id=50256,
            eos_token_id=50256,
        )
    else:
        config = GPT2Config.from_json_file(args.config_file)

    # Convert.
    print("Converting")
    output_state_dict = convert_megatron_checkpoint(args, input_state_dict, config)

    # Print the structure of the converted state dict.
    if args.print_checkpoint_structure:
        recursive_print(None, output_state_dict)

    # Store the config to file.
    output_config_file = os.path.join(basename, "config.json")
    output_config = config.to_dict()
    output_config["architectures"] = ["GPT2LMHeadModel"]
    output_config["model_type"] = "gpt2"
    print(f'Saving config to "{output_config_file}"')
    with open(output_config_file, "w") as f:
        json.dump(output_config, f)

    # Store the state_dict to file.
    output_checkpoint_file = os.path.join(basename, "pytorch_model.bin")
    print(f'Saving checkpoint to "{output_checkpoint_file}"')
    torch.save(output_state_dict, output_checkpoint_file)
                               'distilgpt2-config.json')
}
GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {
    "distilgpt2": os.path.join(model_and_config_dir, 'MODEL_ARCHIVE_MAP',
                               'distilgpt2-pytorch_model.bin')
}

if __name__ == '__main__':
    benchmark_name = "Pereira2018-encoding"
    model_name = "distilgpt2"
    config_file = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP[model_name]
    model_file = GPT2_PRETRAINED_MODEL_ARCHIVE_MAP[model_name]
    benchmark_tsk = benchmark_name
    config = GPT2Config.from_json_file(config_file)
    num_layers = config.n_layer
    config.output_hidden_states = True
    config.state_dict = None
    # from_pretrained is a classmethod that returns the loaded model,
    # so its result must be bound rather than called on an instance
    model = GPT2Model.from_pretrained(model_file, config=config)
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model_identifier = config.weight_identifier
    # find model index in model_configs
    config_idx = int(
        np.argwhere([
            x['weight_identifier'] == config.weight_identifier
            for x in transformer_configurations
        ]))
    brainscore_config = transformer_configurations[config_idx]
    # - tokenizer_ctr: the importable class name of the model's tokenizer class
# Print the model's basic arguments
print('args:\n' + args.__repr__())

os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs to use
print(args.model_config)

# Load config file
config = GPT2Config.from_json_file(args.model_config)
model = GPT2LMHeadModel(config)
model.train()
model.cuda()

num_parameters = 0
for parameter in model.parameters():
    num_parameters += parameter.numel()
print('number of parameters: {}'.format(num_parameters))

full_len = 0
for i in tqdm(range(args.num_pieces)):
    with open(args.tokenizer_data_path + 'tokenizer_train_{}.txt'.format(i),
              'r') as f:
        full_len += len([int(item) for item in f.read().strip().split()])
                                  classes_num=len(labels2idx)).cuda()
checkpoint = T.load("Classifier/Model_Backup/model.pt")
dialog_act_classifier.load_state_dict(checkpoint['model_state_dict'])
dialog_act_classifier = dialog_act_classifier.eval()

# Load TTS model
with T.no_grad():
    text2speech = tts_class()

# Load the DialoGPT generator
with T.no_grad():
    tokenizer = GPT2Tokenizer.from_pretrained('Generator/DialoGPT/Configs/')
    weights = T.load('Generator/DialoGPT/Parameters/medium_ft.pkl')
    weights_reverse = T.load('Generator/DialoGPT/Parameters/small_reverse.pkl')
    cfg = GPT2Config.from_json_file('Generator/DialoGPT/Configs/config.json')
    model = GPT2LMHeadModel(cfg)
    model_reverse = GPT2LMHeadModel(cfg)

    # fix the misused key name
    weights["lm_head.weight"] = weights["lm_head.decoder.weight"]
    weights.pop("lm_head.decoder.weight", None)
    weights_reverse["lm_head.weight"] = weights_reverse[
        "lm_head.decoder.weight"]
    weights_reverse.pop("lm_head.decoder.weight", None)

    model.load_state_dict(weights)
    model.to('cuda')
    model.eval()

    model_reverse.load_state_dict(weights_reverse)
def main():
    # Initialize arguments
    args = set_args()
    # Select which GPUs to train on
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device

    if args.batch_size < 2048 and args.warmup_steps <= 4000:
        print('[Warning] The warmup steps may not be enough.\n'
              '(sz_b, warmup) = (2048, 4000) is the official setting.\n'
              'Using a smaller batch without a longer warmup may cause '
              'the warmup stage to end with only little data trained.')

    # Create the logger
    logger = create_logger(args)

    # Use the GPU only when requested and available
    args.cuda = torch.cuda.is_available() and not args.no_cuda
    device = 'cuda:0' if args.cuda else 'cpu'
    args.device = device
    logger.info('using device:{}'.format(device))

    # Initialize the tokenizer
    tokenizer = BertTokenizerFast(vocab_file=args.vocab_path,
                                  sep_token="[SEP]",
                                  pad_token="[PAD]",
                                  cls_token="[CLS]")
    args.sep_id = tokenizer.sep_token_id
    args.pad_id = tokenizer.pad_token_id
    args.cls_id = tokenizer.cls_token_id

    # Create the model output directory
    if not os.path.exists(args.save_model_path):
        os.mkdir(args.save_model_path)

    # Create the model
    if args.pretrained_model:
        # Load a pretrained model
        model = GPT2LMHeadModel.from_pretrained(args.pretrained_model)
    else:
        # Initialize from the config
        model_config = GPT2Config.from_json_file(args.model_config)
        model = GPT2LMHeadModel(config=model_config)
    model = model.to(device)
    logger.info('model config:\n{}'.format(model.config.to_json_string()))
    assert model.config.vocab_size == tokenizer.vocab_size

    # Train on multiple GPUs when available
    if args.cuda and torch.cuda.device_count() > 1:
        model = DataParallel(model).cuda()
        # model = BalancedDataParallel(args.gpu0_bsz, model, dim=0).cuda()
        logger.info("use GPU {} to train".format(args.device))

    # Count model parameters
    num_parameters = 0
    for parameter in model.parameters():
        num_parameters += parameter.numel()
    logger.info('number of model parameters: {}'.format(num_parameters))

    # Log the argument settings
    logger.info("args:{}".format(args))

    # ========= Loading Dataset ========= #
    train_dataset, validate_dataset = load_dataset(logger, args)

    train(model, logger, train_dataset, validate_dataset, args)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--device", default="0", type=str, required=False,
                        help="generation device")
    parser.add_argument("--length", default=-1, type=int, required=False,
                        help="generation length")
    parser.add_argument("--batch_size", default=1, type=int, required=False,
                        help="generation batch size")
    parser.add_argument("--nsamples", default=10, type=int, required=False,
                        help="number of samples to generate")
    parser.add_argument("--temperature", default=1, type=float, required=False,
                        help="sampling temperature")
    parser.add_argument("--topk", default=8, type=int, required=False,
                        help="sample from the top k tokens")
    parser.add_argument("--topp", default=0, type=float, required=False,
                        help="nucleus (cumulative-probability) threshold")
    parser.add_argument(
        "--model_config",
        default="config/model_config.json",
        type=str,
        required=False,
        help="model config file",
    )
    parser.add_argument(
        "--tokenizer_path",
        default="vocab/vocab.txt",
        type=str,
        required=False,
        help="vocabulary path",
    )
    parser.add_argument(
        "--model_path",
        default="model/epoch=0-step=99.ckpt",
        type=str,
        required=False,
        help="model path",
    )
    parser.add_argument("--prefix", default="我", type=str, required=False,
                        help="prefix that starts the generated text")
    parser.add_argument("--no_wordpiece", action="store_true",
                        help="disable word-piece tokenization")
    parser.add_argument("--segment", action="store_true",
                        help="segment Chinese by words")
    parser.add_argument("--fast_pattern", action="store_true",
                        help="use the faster generation path")
    parser.add_argument("--save_samples", action="store_true",
                        help="save the generated samples")
    parser.add_argument("--save_samples_path", default=".", type=str,
                        required=False, help="where to save samples")
    parser.add_argument("--repetition_penalty", default=1.0, type=float,
                        required=False)

    args = parser.parse_args()
    print("args:\n" + args.__repr__())

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs to use
    length = args.length
    batch_size = args.batch_size
    nsamples = args.nsamples
    temperature = args.temperature
    topk = args.topk
    topp = args.topp
    repetition_penalty = args.repetition_penalty

    device = "cuda" if torch.cuda.is_available() else "cpu"

    tokenizer = BertTokenizer(vocab_file=args.tokenizer_path)
    model_config = GPT2Config.from_json_file(args.model_config)
    model = GPT2LMHeadModel(config=model_config)
    # the checkpoint was saved by PyTorch Lightning; strip the "model." prefix
    state_dict = {
        key[6:]: value
        for key, value in torch.load(args.model_path, map_location="cpu")
        ["state_dict"].items()
    }
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()

    for i in range(10):
        raw_text = args.prefix
        encoded = tokenizer.encode_plus(raw_text)["input_ids"]
        out = sample_sequence(
            model,
            encoded,
            length=512,
            n_ctx=1024,
            tokenizer=tokenizer,
            temperature=temperature,
            top_k=topk,
            top_p=topp,
            repitition_penalty=repetition_penalty,  # (sic: keyword as defined upstream)
            device=device,
        )
        print(tokenizer.decode(out))
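# sample_sequence relies on the usual top-k / nucleus filter; a minimal
# sketch of that filter for a 1-D logits vector (adapted from the widely
# used HuggingFace example, not necessarily this repo's exact code):
import torch
import torch.nn.functional as F

def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float("Inf")):
    if top_k > 0:
        # keep only the k highest-scoring logits
        threshold = torch.topk(logits, top_k)[0][..., -1, None]
        logits[logits < threshold] = filter_value
    if top_p > 0.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        # mask tokens outside the nucleus, always keeping the best token
        sorted_mask = cumulative_probs > top_p
        sorted_mask[..., 1:] = sorted_mask[..., :-1].clone()
        sorted_mask[..., 0] = False
        logits[sorted_indices[sorted_mask]] = filter_value
    return logits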
args.do_train = True
args.train_data_file = './prepare_data_dict/text1.txt'
args.save_steps = 1
args.block_size = 16
args.line_by_line = False
args.train_batch_size = 2
args.num_train_epochs = 3
args.overwrite_output_dir = True

#### tokenizer
tokenizer = GPT2Tokenizer_inherit(
    vocab_file='./prepare_data_dict/vocab.json',
    merges_file='./prepare_data_dict/merge.txt')
vocabsz = tokenizer.vocab_size

#### config
config = GPT2Config.from_json_file(
    './prepare_data_dict/gpt2-config-small.json')
config.vocab_size = vocabsz
config.n_layer = 12
config.n_head = 12
# config.output_hidden_states = True

main(tokenizer, config)

# GPT-2/GPT and causal language modeling
#
# The following example fine-tunes GPT-2 on WikiText-2. We're using the raw
# WikiText-2 (no tokens were replaced before tokenization). The loss here is
# that of causal language modeling.
#
# export TRAIN_FILE=/path/to/dataset/wiki.train.raw
# export TEST_FILE=/path/to/dataset/wiki.test.raw
#
# python run_language_modeling.py \
#   --output_dir=output \