def main():
    # Parse training arguments.
    args = set_args()
    # Configure CUDA device visibility.
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    # Pick the device used for training.
    device = torch.device(
        "cuda" if torch.cuda.is_available() and int(args.device) >= 0 else "cpu"
    )
    # Seed all RNGs for reproducibility.
    if args.seed:
        torch.manual_seed(args.seed)
        random.seed(args.seed)
        np.random.seed(args.seed)
    # Load the model config.
    model_config = GPT2Config.from_json_file(args.config_path)
    if args.pretrained_model_path:
        model = GPT2LMHeadModel.from_pretrained(args.pretrained_model_path)
    else:
        # No pretrained model given: initialize the model from scratch.
        model = GPT2LMHeadModel(config=model_config)
    # Instantiate the tokenizer.
    tokenizer = BertTokenizer.from_pretrained(args.vocab_path, do_lower_case=True)
    # Register [Space] as a single token. For example, with "我爱[Space]中国。"
    # the stock tokenizer yields "['我', '爱', '[', 'Space', ']', '中', '国', '。']",
    # while after adding the token it yields "['我', '爱', '[Space]', '中', '国', '。']".
    tokenizer.add_tokens("[Space]", special_tokens=True)
    # Create the output directory for checkpoints.
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)
    # Load the training and test data.
    train_data = GPT2NewsTitleDataSet(
        tokenizer,
        args.max_len,
        args.title_max_len,
        args.data_dir,
        "train",
        args.train_file_path,
    )
    test_data = GPT2NewsTitleDataSet(
        tokenizer,
        args.max_len,
        args.title_max_len,
        args.data_dir,
        "test",
        args.test_file_path,
    )
    # Start training.
    train(model, device, train_data, test_data, args)
def start_server():
    args = set_args()
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    device = torch.device(
        "cuda" if torch.cuda.is_available() and int(args.device) >= 0 else "cpu"
    )
    # Instantiate the tokenizer and model.
    tokenizer = BertTokenizer.from_pretrained(args.vocab_path, do_lower_case=True)
    model = GPT2LMHeadModel.from_pretrained(args.output_dir)
    model.to(device)
    model.eval()
    print("Model loaded!")
    app = Flask(__name__)

    @app.route('/')
    def index():
        return "This is News Title Generate Model Server"

    @app.route('/news-title-generate', methods=['GET', 'POST'])
    def response_request():
        if request.method == 'POST':
            content = request.form.get('content')
            titles = predict_one_sample(model, tokenizer, device, args, content)
            title_str = ""
            for i, t in enumerate(titles):
                title_str += "Generated title {}: {}\n".format(i + 1, t)
            return render_template("index_ok.html", content=content, titles=title_str)
        return render_template("index.html")

    server = wsgi.WSGIServer((str(args.http_id), args.port), app)
    server.serve_forever()
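# For a quick smoke test of the endpoint above, a POST request along these
# lines should work. The host and port are assumptions: they come from
# args.http_id and args.port, whose defaults are not shown in this section.
import requests

resp = requests.post(
    "http://127.0.0.1:8000/news-title-generate",  # placeholder host:port
    data={"content": "..."},  # the news article body to generate titles for
)
print(resp.status_code)  # the response body is the rendered index_ok.html page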
def setup(data_folder):
    # Seed all RNGs for reproducibility.
    np.random.seed(0)
    torch.cuda.manual_seed_all(0)
    torch.manual_seed(0)
    codec = get_encoder()
    dataset = NewsDataset(path=data_folder, ctx_length=128, codec=codec,
                          start_from_zero=True)
    config = GPT2Config()
    model = GPT2LMHeadModel(config)
    # Download the GPT-2 checkpoint on first use.
    if not os.path.exists('gpt2-pytorch_model.bin'):
        print("Downloading GPT-2 checkpoint...")
        url = 'https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin'
        r = requests.get(url, allow_redirects=True)
        open('gpt2-pytorch_model.bin', 'wb').write(r.content)
    # NOTE: `device` is assumed to be defined at module level.
    model = load_weight(
        model, torch.load('gpt2-pytorch_model.bin', map_location=device))
    model = model.to(device)
    model.eval()
    return codec, model, dataset, config
def main(): """主函数""" # 设置预测的配置参数 args = set_args() # 获取设备信息 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" os.environ["CUDA_VISIBLE_DEVICE"] = args.device device = torch.device("cuda" if torch.cuda.is_available() and int(args.device) >= 0 else "cpu") # 实例化tokenizer和model tokenizer = BertTokenizer.from_pretrained(args.vocab_path, do_lower_case=True) model = GPT2LMHeadModel.from_pretrained(args.model_path) model.to(device) model.eval() print('开始对新闻生成标题,输入CTRL + Z,则退出') try: while True: content = input("输入的新闻正文为:") titles = predict_one_sample(model, tokenizer, device, args, content) for i, title in enumerate(titles): print("生成的第{}个标题为:{}".format(i + 1, title)) except: pass
def main():
    # Parse training arguments.
    args = set_args()
    # Seed all RNGs so runs are reproducible.
    if args.seed:
        torch.manual_seed(args.seed)
        random.seed(args.seed)
        np.random.seed(args.seed)
    # Load the model config.
    model_config = GPT2Config.from_json_file(args.config_path)
    # Instantiate GPT2LMHeadModel: load pretrained weights if a path is given,
    # otherwise train from scratch.
    if args.pretrained_model_path:
        model = GPT2LMHeadModel.from_pretrained(args.pretrained_model_path)
    else:
        # No pretrained model given: initialize the model from scratch.
        model = GPT2LMHeadModel(config=model_config)
    tokenizer = BertTokenizer.from_pretrained(args.vocab_path, do_lower_case=True)
    # Register [Space] as a single token. For example, with "我爱[Space]中国。"
    # the stock tokenizer yields "['我', '爱', '[', 'Space', ']', '中', '国', '。']",
    # while after adding the token it yields "['我', '爱', '[Space]', '中', '国', '。']".
    tokenizer.add_tokens("[Space]", special_tokens=True)
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)
    # Load the training and test data.
    train_data = GPT2NewsTitleDataSet(tokenizer, args.max_len, args.title_max_len,
                                      args.data_dir, "train", args.train_file_path)
    test_data = GPT2NewsTitleDataSet(tokenizer, args.max_len, args.title_max_len,
                                     args.data_dir, "test", args.test_file_path)
    # Start training.
    train(model, train_data, test_data, args)
def setup(n_enc_layer=1):
    # Seed all RNGs for reproducibility.
    np.random.seed(0)
    torch.cuda.manual_seed_all(0)
    torch.manual_seed(0)
    codec = get_encoder()
    config = GPT2Config(n_enc_layer=n_enc_layer)
    model = GPT2LMHeadModel(config)
    # Download the GPT-2 checkpoint on first use.
    if not os.path.exists('../gpt2-pytorch_model.bin'):
        print("Downloading GPT-2 checkpoint...")
        url = 'https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin'
        r = requests.get(url, allow_redirects=True)
        open('../gpt2-pytorch_model.bin', 'wb').write(r.content)
    # NOTE: `device` is assumed to be defined at module level.
    model = load_weight(
        model, torch.load('../gpt2-pytorch_model.bin', map_location=device))
    model = model.to(device)
    return codec, model, config
def main():
    args = set_args()
    # Instantiate the tokenizer and model.
    tokenizer = BertTokenizer.from_pretrained(args.vocab_path, do_lower_case=True)
    model = GPT2LMHeadModel.from_pretrained(args.model_path)
    if torch.cuda.is_available():
        model.cuda()
    model.eval()
    print('Generating titles for news articles. Press CTRL+Z (EOF) to exit.')
    try:
        while True:
            content = input("News article body: ")
            titles = predict_one_sample(model, tokenizer, args, content)
            for i, title in enumerate(titles):
                print("Generated title {}: {}".format(i + 1, title))
    except (EOFError, KeyboardInterrupt):
        pass
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--train_dataset", type=str, default="data/corpus.small",
                        help="train dataset")
    parser.add_argument("--test_dataset", type=str, default="data/corpus.small",
                        help="test set for evaluation")
    parser.add_argument("--vocab_file", default="gpt2-vocab.json", type=str)
    parser.add_argument("--merges_file", default="gpt2-merges.txt", type=str)
    parser.add_argument("--output_path", default="output/", type=str, help="save path")
    parser.add_argument("--restore_file", default=None, type=str,
                        help="the path for pretrained model")
    parser.add_argument("--seq_len", type=int, default=128, help="maximum sequence len")
    parser.add_argument("--batch_size", type=int, default=8, help="number of batch_size")
    parser.add_argument("--epochs", type=int, default=5, help="number of epochs")
    parser.add_argument("--num_workers", type=int, default=0, help="dataloader worker size")
    parser.add_argument("--lr", type=float, default=3e-4, help="learning rate of adam")
    parser.add_argument("--adam_weight_decay", type=float, default=0.01,
                        help="weight_decay of adam")
    parser.add_argument("--adam_beta1", type=float, default=0.98,
                        help="adam first beta value")
    parser.add_argument("--adam_beta2", type=float, default=0.999,
                        help="adam second beta value")
    parser.add_argument("--warmup_steps", type=int, default=1000, help="warmup steps")
    parser.add_argument("--accumulate_gradient_steps", type=int, default=1,
                        help="accumulate gradient steps")
    args = parser.parse_args()

    print("building tokenizer")
    tokenizer = build_tokenizer(
        vocab_file=args.vocab_file,
        merges_file=args.merges_file,
        tokenizer_type="GPT2BPETokenizer",
    )

    print("building train dataset")
    train_dataset = GPTDataset(args.train_dataset, tokenizer, args.seq_len)
    print("building test dataset")
    test_dataset = GPTDataset(args.test_dataset, tokenizer, args.seq_len)

    print("building train dataloader")
    train_data_loader = DataLoader(train_dataset, batch_size=args.batch_size,
                                   num_workers=args.num_workers)
    print("building test dataloader")
    test_data_loader = DataLoader(test_dataset, shuffle=False,
                                  batch_size=args.batch_size,
                                  num_workers=args.num_workers)

    print("building model")
    config = GPT2Config()
    model = GPT2LMHeadModel(config)
    if args.restore_file is not None:
        model.load_state_dict(flow.load(args.restore_file))
    # Tie the LM head to the token embedding so both share one weight matrix.
    model.lm_head.weight = model.transformer.wte.weight

    trainer = Trainer(
        model,
        train_dataloader=train_data_loader,
        test_dataloader=test_data_loader,
        epoch=args.epochs,
        lr=args.lr,
        betas=(args.adam_beta1, args.adam_beta2),
        weight_decay=args.adam_weight_decay,
        warmup_steps=args.warmup_steps,
        accumulate_gradient_steps=args.accumulate_gradient_steps,
        output_path=args.output_path,
    )

    print("begin training")
    trainer.train()
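# The `model.lm_head.weight = model.transformer.wte.weight` line above is
# weight tying: the output projection reuses the input embedding matrix, so a
# checkpoint only needs to store that matrix once. A minimal, self-contained
# PyTorch sketch of the idea (toy sizes, not this repo's model):
import torch

emb = torch.nn.Embedding(100, 16)            # token embedding (vocab=100, hidden=16)
head = torch.nn.Linear(16, 100, bias=False)  # LM head projecting back to the vocab

head.weight = emb.weight                     # tie: both modules share one Parameter
assert head.weight is emb.weight

# A gradient through either module accumulates into the same tensor.
loss = head(emb(torch.tensor([3, 7]))).sum()
loss.backward()
assert emb.weight.grad is head.weight.grad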
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--train_dataset", required=False, type=str,
                        default="data/corpus.small", help="train dataset")
    parser.add_argument("--test_dataset", type=str, default="data/corpus.small",
                        help="test set for evaluation")
    parser.add_argument("--vocab_file", required=False, default="vocab.json", type=str)
    parser.add_argument("--merges_file", required=False, default="merge.txt", type=str)
    parser.add_argument("--output_path", required=False, default="output/model",
                        type=str, help="save path")
    parser.add_argument("--seq_len", type=int, default=128, help="maximum sequence len")
    parser.add_argument("--batch_size", type=int, default=4, help="number of batch_size")
    parser.add_argument("--epochs", type=int, default=50, help="number of epochs")
    parser.add_argument("--num_workers", type=int, default=0, help="dataloader worker size")
    parser.add_argument("--with_cuda", type=bool, default=True,
                        help="training with CUDA: true, or false")
    parser.add_argument("--lr", type=float, default=1e-4, help="learning rate of adam")
    parser.add_argument("--adam_weight_decay", type=float, default=0.01,
                        help="weight_decay of adam")
    parser.add_argument("--adam_beta1", type=float, default=0.9,
                        help="adam first beta value")
    parser.add_argument("--adam_beta2", type=float, default=0.999,
                        help="adam second beta value")
    args = parser.parse_args()

    print("building tokenizer")
    tokenizer = build_tokenizer(
        vocab_file=args.vocab_file,
        merges_file=args.merges_file,
        tokenizer_type="GPT2BPETokenizer",
    )

    print("building train dataset")
    train_dataset = GPTDataset(args.train_dataset, tokenizer, args.seq_len)
    print("building train dataloader")
    train_data_loader = DataLoader(train_dataset, batch_size=args.batch_size,
                                   num_workers=args.num_workers)

    # Grab one fixed batch so both frameworks train on identical data.
    for i, b in enumerate(train_data_loader):
        if i == 2:
            batch = b
            break
    of_batch = batch.cuda()

    print("building model")
    config = GPT2Config()
    pt_batch = torch.from_numpy(batch.numpy()).long().cuda()
    model = pt_GPT2LMHeadModel(config)
    model.load_state_dict(torch.load("gpt2_model.pt"))
    model.lm_head.weight = model.transformer.wte.weight
    model.cuda()
    model.eval()

    pt_optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=0.0001,
        betas=(args.adam_beta1, args.adam_beta2),
        weight_decay=args.adam_weight_decay,
    )

    for_time = 0.0
    bp_time = 0.0
    update_time = 0.0
    pt_loss = list()
    loss = None

    print("start pytorch training loop....")
    start_t = time.time()
    for epoch in range(args.epochs):
        s_t = time.time()
        loss = model(pt_batch, labels=pt_batch)[0]
        for_time += time.time() - s_t
        pt_loss.append(loss.item())

        s_t = time.time()
        loss.backward()
        bp_time += time.time() - s_t

        s_t = time.time()
        pt_optimizer.step()
        pt_optimizer.zero_grad()
        update_time += time.time() - s_t
    end_t = time.time()

    print("pytorch training loop avg time : {}".format((end_t - start_t) / args.epochs))
    print("forward avg time : {}".format(for_time / args.epochs))
    print("backward avg time : {}".format(bp_time / args.epochs))
    print("update parameters avg time : {}".format(update_time / args.epochs))

    pt_parameters_names = []
    pt_parameters_value = []
    for name, param in model.named_parameters():
        pt_parameters_names.append(name)
        pt_parameters_value.append(param.cpu().detach().numpy())

    model = GPT2LMHeadModel(config)
    model.load_state_dict(flow.load("gpt2_oneflow_model"))
    model.lm_head.weight = model.transformer.wte.weight
    model.cuda()
    model.eval()

    optimizer = flow.optim.AdamW(
        model.parameters(),
        lr=0.0001,
        betas=(args.adam_beta1, args.adam_beta2),
        weight_decay=args.adam_weight_decay,
    )

    for_time = 0.0
    bp_time = 0.0
    update_time = 0.0
    of_loss = list()

    print("start oneflow training loop....")
    start_t = time.time()
    for epoch in range(args.epochs):
        s_t = time.time()
        loss = model(of_batch, labels=of_batch)[0]
        for_time += time.time() - s_t
        of_loss.append(loss.numpy())

        s_t = time.time()
        loss.backward()
        bp_time += time.time() - s_t

        s_t = time.time()
        optimizer.step()
        optimizer.zero_grad()
        update_time += time.time() - s_t
    end_t = time.time()

    print("oneflow training loop avg time : {}".format((end_t - start_t) / args.epochs))
    print("forward avg time : {}".format(for_time / args.epochs))
    print("backward avg time : {}".format(bp_time / args.epochs))
    print("update parameters avg time : {}".format(update_time / args.epochs))

    for i in range(args.epochs):
        print(i, of_loss[i], pt_loss[i])

    # Plot the two loss curves for a side-by-side comparison.
    import matplotlib.pyplot as plt
    plt.switch_backend("agg")
    epochs = np.arange(1, args.epochs + 1)
    plt.plot(epochs, of_loss, label="oneflow")
    plt.plot(epochs, pt_loss, label="pytorch")
    plt.legend()
    plt.savefig("./1.jpg")
    plt.show()
def text_generator(state_dict):
    parser = argparse.ArgumentParser()
    parser.add_argument("--text", type=str, required=True)
    parser.add_argument("--quiet", action="store_true")
    parser.add_argument("--nsamples", type=int, default=1)
    parser.add_argument('--unconditional', action='store_true',
                        help='If true, unconditional generation.')
    parser.add_argument("--batch_size", type=int, default=-1)
    parser.add_argument("--length", type=int, default=-1)
    parser.add_argument("--temperature", type=float, default=0.7)
    parser.add_argument("--top_k", type=int, default=40)
    args = parser.parse_args()

    if not args.quiet:
        print(args)

    if args.batch_size == -1:
        args.batch_size = 1
    assert args.nsamples % args.batch_size == 0

    seed = random.randint(0, 2147483647)
    np.random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load the model.
    enc = get_encoder()
    config = GPT2Config()
    model = GPT2LMHeadModel(config)
    model = load_weight(model, state_dict)
    model.to(device)
    model.eval()

    if args.length == -1:
        args.length = config.n_ctx // 2
    elif args.length > config.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" % config.n_ctx)

    print(args.text)
    context_tokens = enc.encode(args.text)

    generated = 0
    for _ in range(args.nsamples // args.batch_size):
        out = sample_sequence(
            model=model,
            length=args.length,
            context=context_tokens if not args.unconditional else None,
            start_token=enc.encoder['<|endoftext|>'] if args.unconditional else None,
            batch_size=args.batch_size,
            temperature=args.temperature,
            top_k=args.top_k,
            device=device,
        )
        out = out[:, len(context_tokens):].tolist()
        for i in range(args.batch_size):
            generated += 1
            text = enc.decode(out[i])
            if not args.quiet:
                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
            print(text)
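# sample_sequence itself is not shown in this section. For reference, the
# temperature/top_k filtering it is parameterized by is conventionally
# implemented as below -- a minimal PyTorch sketch of the standard technique,
# not necessarily this repo's exact code:
import torch
import torch.nn.functional as F

def top_k_sample_step(logits, temperature=0.7, top_k=40):
    """One decoding step: scale logits by temperature, keep the top_k
    candidates, and draw the next token from the renormalized distribution.

    logits: (batch, vocab) tensor of next-token logits.
    """
    logits = logits / temperature
    if top_k > 0:
        # Mask everything below the k-th largest logit so softmax zeroes it out.
        kth = torch.topk(logits, top_k)[0][..., -1, None]
        logits = logits.masked_fill(logits < kth, float("-inf"))
    probs = F.softmax(logits, dim=-1)
    return torch.multinomial(probs, num_samples=1)  # (batch, 1) sampled token ids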
def text_generator():
    parser = argparse.ArgumentParser()
    parser.add_argument("--text", type=str, required=True)
    parser.add_argument("--nsamples", type=int, default=1)
    parser.add_argument("--unconditional", action="store_true",
                        help="If true, unconditional generation.")
    parser.add_argument("--batch_size", type=int, default=-1)
    parser.add_argument("--length", type=int, default=-1)
    parser.add_argument("--temperature", type=float, default=0.7)
    parser.add_argument("--top_k", type=int, default=40)
    parser.add_argument("--seed", type=int, default=1234)
    args = parser.parse_args()
    print(args)

    if args.batch_size == -1:
        args.batch_size = 1
    assert args.nsamples % args.batch_size == 0

    random.seed(args.seed)
    np.random.seed(args.seed)
    flow.manual_seed(args.seed)
    device = flow.device("cuda")

    tokenizer = build_tokenizer(vocab_file="vocab.json", merges_file="merge.txt")
    config = GPT2Config()
    model = GPT2LMHeadModel(config)
    # convert_pt_checkpoint_to_of(model, pt_checkpoint_path="gpt2-pytorch_model.bin", of_checkpoint_path="gpt2_oneflow_model")
    state_dict = flow.load("gpt2_oneflow_model")
    model.load_state_dict(state_dict)
    model.tie_embeddings()
    model.to(device)
    model.eval()

    if args.length == -1:
        args.length = config.n_ctx // 2
    elif args.length > config.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" % config.n_ctx)

    text = args.text
    print(text)
    context_tokens = tokenizer.tokenize(text)

    generated = 0
    for _ in range(args.nsamples // args.batch_size):
        out = sample_sequence(
            model=model,
            length=args.length,
            context=context_tokens if not args.unconditional else None,
            start_token=tokenizer.vocab["<|endoftext|>"] if args.unconditional else None,
            batch_size=args.batch_size,
            temperature=args.temperature,
            top_k=args.top_k,
            device=device,
        )
        out = out[:, len(context_tokens):].tolist()
        for i in range(args.batch_size):
            generated += 1
            text = tokenizer.detokenize(out[i])
            print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
            print(text)
def get_model(device, vocab_path, model_path):
    tokenizer = BertTokenizer.from_pretrained(vocab_path, do_lower_case=True)
    model = GPT2LMHeadModel.from_pretrained(model_path)
    model.to(device)
    model.eval()
    return tokenizer, model
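# Typical usage of the helper above. The paths are placeholders, assumed to
# point at the vocab file and the fine-tuned checkpoint saved during training:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer, model = get_model(device, "vocab/vocab.txt", "output_dir/checkpoint-final")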
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--vocab_file", default="gpt2-vocab.json", type=str)
    parser.add_argument("--merges_file", default="gpt2-merges.txt", type=str)
    parser.add_argument("--restore_file", default="gpt2_oneflow_model", type=str,
                        help="Path to pre-trained model")
    parser.add_argument("--prompt", type=str, default="")
    parser.add_argument("--length", type=int, default=20)
    parser.add_argument("--temperature", type=float, default=1.0)
    parser.add_argument("--top_k", type=int, default=1)
    parser.add_argument("--top_p", type=float, default=0.9)
    parser.add_argument("--no_cuda", action="store_true",
                        help="Avoid using CUDA when available")
    parser.add_argument("--seed", type=int, default=42,
                        help="random seed for initialization")
    args = parser.parse_args()

    args.device = flow.device("cuda" if not args.no_cuda else "cpu")
    set_seed(args)

    tokenizer = build_tokenizer(
        vocab_file=args.vocab_file,
        merges_file=args.merges_file,
        tokenizer_type="GPT2BPETokenizer",
    )
    config = GPT2Config()
    model = GPT2LMHeadModel(config)
    if args.restore_file is not None:
        model.load_state_dict(flow.load(args.restore_file))
    model.lm_head.weight = model.transformer.wte.weight
    model.to(args.device)
    model.eval()

    if args.length < 0 and config.max_position_embeddings > 0:
        args.length = config.max_position_embeddings
    elif 0 < config.max_position_embeddings < args.length:
        # No generation longer than the model's context window.
        args.length = config.max_position_embeddings
    elif args.length < 0:
        args.length = MAX_LENGTH  # avoid an infinite loop

    print(args)
    while True:
        raw_text = args.prompt if args.prompt else input("Model prompt >>> ")
        context_tokens = tokenizer.tokenize(raw_text)
        out = sample_sequence(
            model=model,
            context=context_tokens,
            length=args.length,
            temperature=args.temperature,
            top_k=args.top_k,
            top_p=args.top_p,
            device=args.device,
        )
        out = out[0, len(context_tokens):].tolist()
        text = tokenizer.detokenize(out)
        print(text)
        if args.prompt:
            break
    return text
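# This script exposes --top_p alongside --top_k. Nucleus (top-p) filtering is
# conventionally implemented as below -- a minimal PyTorch sketch of the
# standard technique, not necessarily this repo's sample_sequence:
import torch
import torch.nn.functional as F

def top_p_filter(logits, top_p=0.9):
    """Keep the smallest set of tokens whose cumulative probability exceeds
    top_p and mask out the rest.

    logits: (vocab,) tensor of next-token logits, modified in place.
    """
    sorted_logits, sorted_idx = torch.sort(logits, descending=True)
    cum_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
    remove = cum_probs > top_p
    # Shift right so the first token crossing the threshold is still kept.
    remove[1:] = remove[:-1].clone()
    remove[0] = False
    logits[sorted_idx[remove]] = float("-inf")
    return logits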