def getModel(modelname):
    """Return ``(model, tokenizer)`` for *modelname*, reloading from disk only
    when the requested model differs from the cached one or its files changed.

    Uses module-level globals as a single-slot cache. Paths are resolved as
    ``models/<modelname>/cache/vocab_processed.txt`` (vocab) and
    ``models/<modelname>/final_model`` (weights).

    Args:
        modelname: directory name of the model under ``models/``.

    Returns:
        Tuple ``(GPT2LMHeadModel, BertTokenizer)`` — the cached instances.
    """
    global current_modelname, current_modeltime, current_model, current_tokenizer
    # Stat the model file exactly once: the old code called
    # getModelModificationTime() twice (compare, then store), so a file
    # modified between the two calls could be recorded with the wrong time.
    modeltime = getModelModificationTime(modelname)
    if current_modelname != modelname or current_modeltime != modeltime:
        app.logger.info(f"loading {modelname}")
        # Before adding the models directory there was a single hard-coded
        # tokenizer/model path; now each model carries its own vocab.
        tokenizer_path = f'models/{modelname}/cache/vocab_processed.txt'
        tokenizer = tokenization_bert.BertTokenizer(
            vocab_file=tokenizer_path)
        app.logger.info(f"tokenizer loaded from {tokenizer_path}")
        model_path = f'models/{modelname}/final_model'
        model = GPT2LMHeadModel.from_pretrained(model_path)
        model.to(device)
        model.eval()
        app.logger.info(f"model loaded from {model_path}")
        # Publish to the cache only after every load step succeeded; a
        # failure above leaves the previous, consistent cache untouched
        # (the old code updated name/time first, so a failed load left the
        # cache claiming the new model while still serving the stale one).
        current_modelname = modelname
        current_modeltime = modeltime
        current_tokenizer = tokenizer
        current_model = model
    return current_model, current_tokenizer
def main():
    """Tokenize every file under ``--raw_data_path`` across 8 worker processes.

    Builds the tokenizer selected by ``--segment``, shards the (shuffled)
    list of raw files over 8 workers, and spawns them with ``xmp.spawn``;
    each worker runs ``get_tokenization`` on its shard, writing results to
    ``--tokenized_data_path``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--tokenizer_path', default='cache/vocab.txt', type=str, required=False, help='选择词库')
    parser.add_argument('--raw_data_path', default='data/', type=str, required=False, help='原始训练语料')
    parser.add_argument('--tokenized_data_path', default='data/tokenized/', type=str, required=False, help='tokenized语料存放位置')
    parser.add_argument('--segment', action='store_true', help='中文以词为单位')
    args = parser.parse_args()
    print('args:\n' + args.__repr__())
    # Word-level vs char-level tokenization is decided at runtime, hence the
    # local import under either name.
    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert
    full_tokenizer = tokenization_bert.BertTokenizer(
        vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = 999999  # effectively disable length truncation
    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw_data_files = [
        join(raw_data_path, f) for f in listdir(raw_data_path)
        if isfile(join(raw_data_path, f))
    ]
    random.shuffle(raw_data_files)
    num_workers = 8
    # BUGFIX: the previous `len(files) // 8` chunking silently dropped the
    # remainder (up to 7 files), and dropped *all* files when fewer than 8
    # existed (each_size == 0). Stride-sharding assigns every file to exactly
    # one worker; the list was just shuffled, so the shards stay random.
    split_raw_data_files = [
        raw_data_files[w::num_workers] for w in range(num_workers)
    ]

    def tokenization(index, shards):
        # Runs in a spawned worker process; `index` is the worker ordinal.
        for file_path in shards[index]:
            get_tokenization(file_path, tokenized_data_path, full_tokenizer)

    xmp.spawn(tokenization, args=(split_raw_data_files, ), nprocs=num_workers, start_method='fork')
def main():
    """CLI entry point: tokenize the raw corpus into training pieces.

    Parses the command-line options, picks a tokenizer (BPE, word-level, or
    char-level BERT vocab), then delegates to ``build_files`` which writes
    ``--num_pieces`` tokenized pieces under ``--tokenized_data_path``. Several
    options (device, lr, epochs, …) are accepted for interface compatibility
    with the training scripts but are not used by this preprocessing step.
    """
    parser = argparse.ArgumentParser()
    # Shorthand; argument order is kept so --help output is unchanged.
    add = parser.add_argument
    add('--device', default='0,1,2,3', type=str, required=False, help='设置使用哪些显卡')
    add('--model_config', default='config/model_config_small.json', type=str, required=False, help='选择模型参数')
    add('--tokenizer_path', default='cache/vocab_small.txt', type=str, required=False, help='选择词库')
    add('--raw_data_path', default='data/train.json', type=str, required=False, help='原始训练语料')
    add('--tokenized_data_path', default='data/tokenized/', type=str, required=False, help='tokenized语料存放位置')
    add('--raw', action='store_true', help='是否先做tokenize')
    add('--epochs', default=5, type=int, required=False, help='训练循环')
    add('--batch_size', default=8, type=int, required=False, help='训练batch size')
    add('--lr', default=1.5e-4, type=float, required=False, help='学习率')
    add('--warmup_steps', default=2000, type=int, required=False, help='warm up步数')
    add('--log_step', default=1, type=int, required=False, help='多少步汇报一次loss,设置为gradient accumulation的整数倍')
    add('--stride', default=768, type=int, required=False, help='训练时取训练数据的窗口步长')
    add('--gradient_accumulation', default=1, type=int, required=False, help='梯度积累')
    add('--fp16', action='store_true', help='混合精度')
    add('--fp16_opt_level', default='O1', type=str, required=False)
    add('--max_grad_norm', default=1.0, type=float, required=False)
    add('--num_pieces', default=100, type=int, required=False, help='将训练语料分成多少份')
    add('--min_length', default=20, type=int, required=False, help='最短收录文章长度')
    add('--n_ctx', default=50, type=int, required=False, help='训练样本长度')
    add('--output_dir', default='model/', type=str, required=False, help='模型输出路径')
    add('--pretrained_model', default='', type=str, required=False, help='模型训练起点路径')
    add('--writer_dir', default='tensorboard_summary/', type=str, required=False, help='Tensorboard路径')
    add('--segment', action='store_true', help='中文以词为单位')
    add('--bpe_token', action='store_true', help='subword')
    add('--padding', action='store_true', help='padding')
    add('--encoder_json', default="tokenizations/encoder.json", type=str, help="encoder.json")
    add('--vocab_bpe', default="tokenizations/vocab.bpe", type=str, help="vocab.bpe")
    add('--max_steps_perEpoch_perPiece', default=1000000, type=int, required=False)
    add('--steps_savemodel', default=10000, type=int, required=False, help='保存模型步数')
    args = parser.parse_args()
    print('args:\n' + repr(args))

    # Tokenizer selection: word-level vs char-level is a runtime choice, so the
    # project module is imported locally under a common alias.
    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert
    if args.bpe_token:
        full_tokenizer = get_encoder(args.encoder_json, args.vocab_bpe)
    else:
        full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = 999999  # effectively disable length truncation

    print('building files')
    build_files(data_path=args.raw_data_path,
                tokenized_data_path=args.tokenized_data_path,
                num_pieces=args.num_pieces,
                full_tokenizer=full_tokenizer,
                min_length=args.min_length,
                n_ctx=args.n_ctx,
                padding=args.padding)
    print('files built')
def __init__(self, model_path, tokenizer_path):
    """Load a GPT-2 model and a word-level BERT tokenizer for inference.

    Args:
        model_path: directory accepted by ``GPT2LMHeadModel.from_pretrained``.
        tokenizer_path: vocab file for the word-level BERT tokenizer.

    Sets ``self.device``, ``self.model`` (eval mode, moved to the device),
    ``self.tokenizer`` and ``self.vocab``.
    """
    self.device = "cuda" if torch.cuda.is_available() else "cpu"
    self.model = GPT2LMHeadModel.from_pretrained(model_path)
    self.model.to(self.device)
    self.model.eval()  # inference only — disable dropout etc.
    self.tokenizer = tokenization_bert_word_level.BertTokenizer(
        vocab_file=tokenizer_path)
    self.vocab = Gpt2Vocab(self.tokenizer)
def tokenizer_test():
    """Smoke-test the BERT tokenizer on one hard-coded Chinese sentence.

    Prints the token list and the corresponding vocabulary ids. The
    ``use_word_level`` flag is hard-coded off, mirroring the ``--segment``
    switch of the training scripts.
    """
    use_word_level = False
    if use_word_level:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert
    tokenizer = tokenization_bert.BertTokenizer(
        vocab_file='./data/text.data/vocab_processed.txt')
    tokenizer.max_len = 100
    sample = '你还不了解我,蛛哥就知道,我很快就毛事了,只是被好朋友误会有点不好受'
    tokens = tokenizer.tokenize(sample)
    print(tokens)
    print(tokenizer.convert_tokens_to_ids(tokens))
def main():
    """TPU training entry point.

    Parses CLI options, builds the tokenizer and the list of raw-data files,
    then spawns 8 XLA worker processes (``xmp.spawn``), each running the
    nested ``train_model`` closure over distributed shards of the data.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='设置使用哪些显卡')
    parser.add_argument('--model_config', default='config/model_config.json', type=str, required=False, help='选择模型参数')
    parser.add_argument('--tokenizer_path', default='cache/vocab.txt', type=str, required=False, help='选择词库')
    parser.add_argument('--raw_data_path', default='data/', type=str, required=False, help='原始训练语料')
    parser.add_argument('--tokenized_data_path', default='data/tokenized/', type=str, required=False, help='tokenized语料存放位置')
    parser.add_argument('--raw', action='store_true', help='是否先做tokenize')
    parser.add_argument('--epochs', default=5, type=int, required=False, help='训练循环')
    parser.add_argument('--batch_size', default=8, type=int, required=False, help='训练batch size')
    parser.add_argument('--lr', default=1.5e-4, type=float, required=False, help='学习率')
    parser.add_argument('--warmup_steps', default=2000, type=int, required=False, help='warm up步数')
    parser.add_argument('--log_step', default=1, type=int, required=False, help='多少步汇报一次loss')
    parser.add_argument('--stride', default=768, type=int, required=False, help='训练时取训练数据的窗口步长')
    parser.add_argument('--gradient_accumulation', default=1, type=int, required=False, help='梯度积累')
    parser.add_argument('--fp16', action='store_true', help='混合精度')
    parser.add_argument('--fp16_opt_level', default='O1', type=str, required=False)
    parser.add_argument('--max_grad_norm', default=1.0, type=float, required=False)
    parser.add_argument('--num_pieces', default=100, type=int, required=False, help='将训练语料分成多少份')
    parser.add_argument('--output_dir', default='model/', type=str, required=False, help='模型输出路径')
    parser.add_argument('--pretrained_model', default='', type=str, required=False, help='模型训练起点路径')
    parser.add_argument('--segment', action='store_true', help='中文以词为单位')
    args = parser.parse_args()
    print('args:\n' + args.__repr__())
    # Word-level vs char-level tokenization is decided at runtime.
    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs the program may use
    model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(
        args.model_config)
    print('config:\n' + model_config.to_json_string())
    n_ctx = model_config.n_ctx  # sample window length comes from the model config
    full_tokenizer = tokenization_bert.BertTokenizer(
        vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = 999999  # effectively disable length truncation
    device = 'cuda' if torch.cuda.is_available() else 'cpu'  # workers override this with xm.xla_device()
    print('using device:', device)
    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw  # whether to rebuild the tokenized dataset from scratch (not used below)
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    fp16 = args.fp16  # mixed precision; do not enable on cards without half-precision support
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    output_dir = args.output_dir
    # (dead commented-out code removed: older pre-TPU `build_files(...)` call)
    raw_data_files = [
        join(raw_data_path, f) for f in listdir(raw_data_path)
        if isfile(join(raw_data_path, f))
    ]
    random.shuffle(raw_data_files)

    def train_model(index):
        # Per-worker training loop; `index` is the XLA process ordinal.
        device = xm.xla_device()
        torch.manual_seed(0)  # identical seed in every worker so model init matches
        if not os.path.exists(tokenized_data_path):
            os.mkdir(tokenized_data_path)
        if not args.pretrained_model:
            model = transformers.modeling_gpt2.GPT2LMHeadModel(
                config=model_config)
        else:
            # NOTE(review): when --pretrained_model is set, weights are
            # actually loaded from output_dir + 'final_model', not from
            # args.pretrained_model — confirm this is intended.
            model = transformers.modeling_gpt2.GPT2LMHeadModel(
                config=model_config)
            model.load_state_dict(torch.load(output_dir + 'final_model'))
        model.train()
        model.to(device)
        multi_gpu = False
        full_len = 0
        # (dead commented-out code removed: total-steps estimation, LR
        #  scheduler, apex fp16 init and DataParallel from the GPU script)
        optimizer = transformers.AdamW(model.parameters(),
                                       lr=lr,
                                       correct_bias=True)
        if xm.is_master_ordinal():
            print('starting training')
        doc_size = 10  # number of raw files turned into one TextDataset at a time
        raw_data_batch_len = len(raw_data_files) // doc_size
        for epoch in range(epochs):
            if xm.is_master_ordinal():
                print('epoch {}'.format(epoch + 1))
                now = datetime.now()
                print('time: {}'.format(now))
            for batch_len in range(raw_data_batch_len):
                train_dataset = TextDataset(
                    raw_data_files[batch_len * doc_size:(batch_len + 1) *
                                   doc_size], tokenized_data_path,
                    full_tokenizer, n_ctx)
                # Each replica draws a disjoint part of the dataset.
                train_sampler = torch.utils.data.distributed.DistributedSampler(
                    train_dataset,
                    num_replicas=xm.xrt_world_size(),
                    rank=xm.get_ordinal(),
                    shuffle=True)
                train_loader = torch.utils.data.DataLoader(
                    train_dataset,
                    batch_size=batch_size,
                    sampler=train_sampler,
                    num_workers=8,
                    drop_last=True)
                # (dead commented-out code removed: manual sliding-window
                #  sample construction superseded by TextDataset)
                para_train_loader = pl.ParallelLoader(
                    train_loader, [device]).per_device_loader(device)
                running_loss = 0
                for step, batch_inputs in enumerate(para_train_loader):
                    batch_inputs = batch_inputs.to(device)
                    # Language modeling: labels are the inputs (the model
                    # shifts them internally to compute next-token loss).
                    outputs = model.forward(input_ids=batch_inputs,
                                            labels=batch_inputs)
                    loss, logits = outputs[:2]
                    optimizer.zero_grad()
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   max_grad_norm)
                    xm.optimizer_step(optimizer)  # syncs gradients across XLA replicas
                    if xm.is_master_ordinal():
                        if (step + 1) % log_step == 0:
                            # NOTE(review): running_loss is only accumulated in
                            # the `else` branch below, so with log_step == 1
                            # this always prints 0 — confirm the intended
                            # averaging window.
                            print(
                                'now time: {}:{}. Step {}/{} of pice {}/{} epoch {}, loss {}'
                                .format(datetime.now().hour,
                                        datetime.now().minute, (step + 1),
                                        len(para_train_loader), batch_len + 1,
                                        raw_data_batch_len, epoch + 1,
                                        running_loss / log_step))
                            running_loss = 0
                        else:
                            running_loss += loss.item()
            # Checkpoint after every epoch; xm.save serializes from the master.
            xm.save(model.state_dict(), output_dir + 'final_model')
            if xm.is_master_ordinal():
                gc.collect()

    xmp.spawn(train_model, args=(), nprocs=8, start_method='fork')
def main():
    """GPU training entry point (single- or multi-GPU via DataParallel).

    Pipeline: parse CLI options -> optionally tokenize the raw corpus into
    ``--num_pieces`` pieces (``--raw``) -> estimate total optimizer steps for
    the warmup schedule -> train with optional apex fp16 and gradient
    accumulation, checkpointing every 30 minutes, per epoch, and at the end.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--device", default="0,1,2,3", type=str, required=False, help="设置使用哪些显卡"
    )
    parser.add_argument(
        "--model_config",
        default="config/model_config_small.json",
        type=str,
        required=False,
        help="选择模型参数",
    )
    parser.add_argument(
        "--tokenizer_path",
        default="cache/vocab_small.txt",
        type=str,
        required=False,
        help="选择词库",
    )
    parser.add_argument(
        "--raw_data_path",
        default="data/train.json",
        type=str,
        required=False,
        help="原始训练语料",
    )
    parser.add_argument(
        "--tokenized_data_path",
        default="data/tokenized/",
        type=str,
        required=False,
        help="tokenized语料存放位置",
    )
    parser.add_argument("--raw", action="store_true", help="是否先做tokenize")
    parser.add_argument("--epochs", default=5, type=int, required=False, help="训练循环")
    parser.add_argument(
        "--batch_size", default=8, type=int, required=False, help="训练batch size"
    )
    parser.add_argument("--lr", default=1.5e-4, type=float, required=False, help="学习率")
    parser.add_argument(
        "--warmup_steps", default=2000, type=int, required=False, help="warm up步数"
    )
    parser.add_argument(
        "--log_step",
        default=1,
        type=int,
        required=False,
        help="多少步汇报一次loss,设置为gradient accumulation的整数倍",
    )
    parser.add_argument(
        "--stride", default=768, type=int, required=False, help="训练时取训练数据的窗口步长"
    )
    parser.add_argument(
        "--gradient_accumulation", default=1, type=int, required=False, help="梯度积累"
    )
    parser.add_argument("--fp16", action="store_true", help="混合精度")
    parser.add_argument("--fp16_opt_level", default="O1", type=str, required=False)
    parser.add_argument("--max_grad_norm", default=1.0, type=float, required=False)
    parser.add_argument(
        "--num_pieces", default=100, type=int, required=False, help="将训练语料分成多少份"
    )
    parser.add_argument(
        "--min_length", default=128, type=int, required=False, help="最短收录文章长度"
    )
    parser.add_argument(
        "--output_dir", default="model/", type=str, required=False, help="模型输出路径"
    )
    parser.add_argument(
        "--pretrained_model", default="", type=str, required=False, help="模型训练起点路径"
    )
    parser.add_argument(
        "--writer_dir",
        default="tensorboard_summary/",
        type=str,
        required=False,
        help="Tensorboard路径",
    )
    parser.add_argument("--segment", action="store_true", help="中文以词为单位")
    parser.add_argument("--bpe_token", action="store_true", help="subword")
    parser.add_argument(
        "--encoder_json",
        default="tokenizations/encoder.json",
        type=str,
        help="encoder.json",
    )
    parser.add_argument(
        "--vocab_bpe", default="tokenizations/vocab.bpe", type=str, help="vocab.bpe"
    )
    args = parser.parse_args()
    print("args:\n" + args.__repr__())
    # Word-level vs char-level tokenization is decided at runtime.
    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs the program may use
    model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(
        args.model_config
    )
    print("config:\n" + model_config.to_json_string())
    n_ctx = model_config.n_ctx  # sample window length comes from the model config
    if args.bpe_token:
        full_tokenizer = get_encoder(args.encoder_json, args.vocab_bpe)
    else:
        full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = 999999  # effectively disable length truncation
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print("using device:", device)
    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw  # whether to rebuild the tokenized dataset from scratch
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    fp16 = args.fp16  # mixed precision; do not enable on cards without half-precision support
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    min_length = args.min_length
    output_dir = args.output_dir
    tb_writer = SummaryWriter(log_dir=args.writer_dir)
    # Logging cadence must align with accumulation so averages make sense.
    assert log_step % gradient_accumulation == 0
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    if raw:
        print("building files")
        build_files(
            data_path=raw_data_path,
            tokenized_data_path=tokenized_data_path,
            num_pieces=num_pieces,
            full_tokenizer=full_tokenizer,
            min_length=min_length,
        )
        print("files built")
    if not args.pretrained_model:
        # Fresh model from config only.
        model = transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    else:
        model = transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(
            args.pretrained_model
        )
    model.train()
    model.to(device)
    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print("number of parameters: {}".format(num_parameters))
    multi_gpu = False
    full_len = 0
    # Count tokens across all pieces to size the linear warmup schedule.
    print("calculating total steps")
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + "tokenized_train_{}.txt".format(i), "r") as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / stride * epochs / batch_size / gradient_accumulation)
    print("total steps = {}".format(total_steps))
    optimizer = transformers.AdamW(model.parameters(), lr=lr, correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(
        optimizer, warmup_steps=warmup_steps, t_total=total_steps
    )
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level)
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model, device_ids=[int(i) for i in args.device.split(",")])
        multi_gpu = True
    print("starting training")
    overall_step = 0
    running_loss = 0
    saving_time = datetime.now()  # last checkpoint time, for the 30-min periodic save
    for epoch in range(epochs):
        print("epoch {}".format(epoch + 1))
        now = datetime.now()
        print("time: {}".format(now))
        # Visit the pieces in a fresh random order every epoch.
        x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
        random.shuffle(x)
        piece_num = 0
        for i in x:
            with open(
                tokenized_data_path + "tokenized_train_{}.txt".format(i), "r"
            ) as f:
                line = f.read().strip()
                tokens = line.split()
                tokens = [int(token) for token in tokens]
                start_point = 0
                samples = []
                # Sliding window of length n_ctx with the configured stride.
                while start_point < len(tokens) - n_ctx:
                    samples.append(tokens[start_point : start_point + n_ctx])
                    start_point += stride
                if start_point < len(tokens):
                    # Tail window aligned to the end so no tokens are lost.
                    samples.append(tokens[len(tokens) - n_ctx :])
                random.shuffle(samples)
                for step in range(len(samples) // batch_size):  # drop last
                    # prepare data
                    batch = samples[step * batch_size : (step + 1) * batch_size]
                    batch_inputs = []
                    for ids in batch:
                        int_ids = [int(x) for x in ids]
                        batch_inputs.append(int_ids)
                    batch_inputs = torch.tensor(batch_inputs).long().to(device)

                    # forward pass; labels == inputs gives next-token LM loss
                    outputs = model.forward(input_ids=batch_inputs, labels=batch_inputs)
                    loss, logits = outputs[:2]

                    # get loss
                    if multi_gpu:
                        loss = loss.mean()  # DataParallel returns one loss per GPU
                    if gradient_accumulation > 1:
                        loss = loss / gradient_accumulation

                    # loss backward (apex-scaled when fp16 is enabled)
                    if fp16:
                        with amp.scale_loss(loss, optimizer) as scaled_loss:
                            scaled_loss.backward()
                            torch.nn.utils.clip_grad_norm_(
                                amp.master_params(optimizer), max_grad_norm
                            )
                    else:
                        loss.backward()
                        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

                    # optimizer step once every `gradient_accumulation` samples
                    if (overall_step + 1) % gradient_accumulation == 0:
                        running_loss += loss.item()
                        optimizer.step()
                        optimizer.zero_grad()
                        scheduler.step()
                    if (overall_step + 1) % log_step == 0:
                        tb_writer.add_scalar(
                            "loss",
                            loss.item() * gradient_accumulation,
                            overall_step
                        )
                        # NOTE(review): running_loss only accumulates on
                        # optimizer steps, so this average formula assumes
                        # log_step is a multiple of gradient_accumulation
                        # (enforced by the assert above) — confirm intent.
                        print(
                            "now time: {}:{}. Step {} of piece {} of epoch {}, loss {}".format(
                                datetime.now().hour,
                                datetime.now().minute,
                                step + 1,
                                piece_num,
                                epoch + 1,
                                running_loss
                                * gradient_accumulation
                                / (log_step / gradient_accumulation),
                            )
                        )
                        running_loss = 0
                    # Periodic checkpoint: at most every 30 minutes.
                    delta_time = datetime.now() - saving_time
                    if delta_time.seconds > 1800:
                        print("saving model for epoch {}".format(epoch + 1))
                        if not os.path.exists(
                            output_dir + "model_epoch{}".format(epoch + 1)
                        ):
                            os.mkdir(output_dir + "model_epoch{}".format(epoch + 1))
                        # Unwrap DataParallel before saving.
                        model_to_save = model.module if hasattr(model, "module") else model
                        model_to_save.save_pretrained(
                            output_dir + "model_epoch{}".format(epoch + 1)
                        )
                        saving_time = datetime.now()
                    overall_step += 1
                piece_num += 1
        # End-of-epoch checkpoint (same directory as the periodic save).
        print("saving model for epoch {}".format(epoch + 1))
        if not os.path.exists(output_dir + "model_epoch{}".format(epoch + 1)):
            os.mkdir(output_dir + "model_epoch{}".format(epoch + 1))
        model_to_save = model.module if hasattr(model, "module") else model
        model_to_save.save_pretrained(output_dir + "model_epoch{}".format(epoch + 1))
        print("epoch {} finished".format(epoch + 1))
        then = datetime.now()
        print("time: {}".format(then))
        print("time for one epoch: {}".format(then - now))
    print("training finished")
    if not os.path.exists(output_dir + "final_model"):
        os.mkdir(output_dir + "final_model")
    model_to_save = model.module if hasattr(model, "module") else model
    model_to_save.save_pretrained(output_dir + "final_model")
def main():
    """Generate articles from a list of titles with a trained GPT-2 model.

    For each title, generates ``--articles_per_title`` samples via
    ``sample_sequence`` and writes each one to
    ``<save_path>/<title_index>-<article_index>.txt``, also echoing it to
    stdout between separator banners.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='设置使用哪些显卡')
    parser.add_argument('--length', default=-1, type=int, required=False, help='生成长度')
    parser.add_argument('--temperature', default=1, type=float, required=False, help='生成温度,越高越随机')
    parser.add_argument('--topk', default=8, type=int, required=False, help='生成的时候最高几选一')
    parser.add_argument('--topp', default=0, type=float, required=False, help='生成的时候积累概率最高多少')
    parser.add_argument('--tokenizer_path', default='cache/vocab_small.txt', type=str, required=False, help='词表路径')
    parser.add_argument('--model_path', default='model/final_model', type=str, required=False, help='模型路径')
    parser.add_argument('--save_path', default='generated/', type=str, required=False, help='存放生成的文件的路径')
    parser.add_argument('--articles_per_title', default=5, type=int, required=False, help='每个标题生成多少篇文章')
    parser.add_argument('--titles', default='萧炎', type=str, required=False, help='标题列表,是一个字符串,用空格分开')
    parser.add_argument('--titles_file', default='', type=str, required=False, help='标题列表文件,文件中每行一个标题。如果这个选项有值则titles无效')
    parser.add_argument('--no_wordpiece', action='store_true', help='不做word piece切词')
    parser.add_argument('--segment', action='store_true', help='中文以词为单位')
    parser.add_argument('--repetition_penalty', default=1.0, type=float, required=False)
    args = parser.parse_args()
    print('args:\n' + args.__repr__())
    # Word-level vs char-level tokenization is decided at runtime.
    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs the program may use
    length = args.length
    temperature = args.temperature
    topk = args.topk
    topp = args.topp
    repetition_penalty = args.repetition_penalty
    titles = args.titles.split()  # space-separated list of titles to generate from
    if args.titles_file:
        # A titles file (one title per line) overrides --titles.
        with open(args.titles_file, 'r') as f:
            titles = [line.strip('\n') for line in f.readlines()]
    articles_per_title = args.articles_per_title  # how many articles per title
    save_path = args.save_path  # where the generated files go
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    model = GPT2LMHeadModel.from_pretrained(args.model_path)
    model.to(device)
    model.eval()
    n_ctx = model.config.n_ctx
    if not os.path.exists(save_path):
        os.mkdir(save_path)
    if length == -1:
        length = model.config.n_ctx  # -1 means "generate up to the model context size"
    # BUGFIX: the post-processing loops below used to rebind the outer loop
    # variable `i` (`for i, item in enumerate(...)`), so from the second
    # article onward every file was opened under a wrong — and potentially
    # colliding — name. The loop variables are now all distinct.
    for title_idx, title in enumerate(titles):
        for article_idx in range(articles_per_title):
            with open(save_path + str(title_idx) + '-' + str(article_idx) + '.txt', 'w') as f:
                context_tokens = tokenizer.convert_tokens_to_ids(
                    tokenizer.tokenize(title))
                generated = 0
                out = sample_sequence(n_ctx=n_ctx,
                                      model=model,
                                      length=length,
                                      context=context_tokens,
                                      tokenizer=tokenizer,
                                      temperature=temperature,
                                      top_k=topk,
                                      top_p=topp,
                                      repitition_penalty=repetition_penalty,
                                      device=device)
                out = out.tolist()[0]
                generated += 1
                text = tokenizer.convert_ids_to_tokens(out)
                # Ensure latin-script words are space-separated.
                for k, item in enumerate(text[:-1]):
                    if is_word(item) and is_word(text[k + 1]):
                        text[k] = item + ' '
                # Strip special tokens; CLS/SEP become paragraph breaks.
                for k, item in enumerate(text):
                    if item == '[MASK]' or item == '[UNK]':
                        text[k] = ''
                    if item == '[CLS]' or item == '[SEP]':
                        text[k] = '\n'
                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
                text = ''.join(text).replace('##', '').strip()
                print(text)
                f.write(text + '\n')
                print("=" * 80)
def main():
    """Train the multiple-choice QA classifier (``modelMy``) one document at a time.

    Builds sliding-window token batches from ``build_files`` output, shuffles
    them, splits off a 10% validation set, and trains with AdamW + linear
    warmup, saving the best-loss checkpoint and a final model.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0', type=str, required=False, help='设置使用哪些显卡')
    parser.add_argument('--model_config', default='gpt2/config.json', type=str, required=False, help='选择模型参数')
    parser.add_argument('--tokenizer_path', default='cache/vocab_small.txt', type=str, required=False, help='选择词库')
    parser.add_argument('--raw_data_path', default='data/train.json', type=str, required=False, help='原始训练语料')
    parser.add_argument('--tokenized_data_path', default='data/tokenized/', type=str, required=False,
                        help='tokenized语料存放位置')
    parser.add_argument('--raw', action='store_true', help='是否先做tokenize')
    parser.add_argument('--epochs', default=150, type=int, required=False, help='训练循环')
    parser.add_argument('--batch_size', default=1, type=int, required=False, help='训练batch size')
    parser.add_argument('--lr', default=1e-4, type=float, required=False, help='学习率')
    parser.add_argument('--warmup_steps', default=100, type=int, required=False, help='warm up步数')
    parser.add_argument('--stride', default=384, type=int, required=False, help='训练时取训练数据的窗口步长')
    parser.add_argument('--gradient_accumulation', default=1, type=int, required=False, help='梯度积累')
    parser.add_argument('--fp16', action='store_true', help='混合精度')
    parser.add_argument('--fp16_opt_level', default='O1', type=str, required=False)
    parser.add_argument('--max_grad_norm', default=1.0, type=float, required=False)
    parser.add_argument('--output_dir', default='model_classfier/', type=str, required=False, help='模型输出路径')
    parser.add_argument('--pretrained_model', default='', type=str, required=False, help='模型训练起点路径')
    parser.add_argument('--segment', action='store_true', help='中文以词为单位')
    parser.add_argument('--bpe_token', action='store_true', help='subword')
    parser.add_argument('--encoder_json', default="tokenizations/encoder.json", type=str, help="encoder.json")
    parser.add_argument('--vocab_bpe', default="tokenizations/vocab.bpe", type=str, help="vocab.bpe")

    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs are visible
    model_config = transformers.GPT2Config.from_json_file(args.model_config)
    print('config:\n' + model_config.to_json_string())
    n_ctx = model_config.n_ctx

    if args.bpe_token:
        full_tokenizer = get_encoder(args.encoder_json, args.vocab_bpe)
    else:
        full_tokenizer = tokenization_bert.BertTokenizer(
            vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = 999999

    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    raw_data_path = args.raw_data_path
    raw = args.raw  # whether to (re)build the dataset from scratch
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    gradient_accumulation = args.gradient_accumulation
    max_grad_norm = args.max_grad_norm
    output_dir = args.output_dir

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    # NOTE(review): resources/resources_id/input_question_list only exist when
    # --raw is passed; running without --raw raises NameError below (original
    # behavior, preserved).
    if raw:
        print('building files')
        resources, resources_id, input_question_list, max_aq_len = build_files(
            data_path=raw_data_path, full_tokenizer=full_tokenizer)
        print('files built')

    input_ids = []
    for i in range(len(resources_id)):
        # Each document becomes a list of overlapping 512-token windows.
        inputsss, _ = sliding_window(max_len=512,
                                     resources=resources_id[i],
                                     stride=512 - 128)
        input_ids.append(inputsss)
    print('sliding built')

    if True:  # shuffle documents and their question lists in lockstep
        index = [i for i in range(len(input_ids))]
        random.shuffle(index)
        new_input_ids = [input_ids[i] for i in index]
        new_input_question_list = [input_question_list[i] for i in index]
        input_ids = new_input_ids
        # BUG FIX: the original computed new_input_question_list but never
        # assigned it back, so documents and their questions/labels were
        # permanently misaligned after the shuffle.
        input_question_list = new_input_question_list

    # Hold out the last 10% (after shuffling) as a validation set.
    val_rate = 0.1
    split = int((1 - val_rate) * len(input_ids))
    val_input_ids = input_ids[split:]
    val_input_question_list = input_question_list[split:]
    input_ids = input_ids[:split]
    input_question_list = input_question_list[:split]

    model = modelMy(args, device)
    model.to(device)

    multi_gpu = False
    print('calculating total steps')
    optimizer = transformers.optimization.AdamW(model.parameters(),
                                                lr=lr,
                                                weight_decay=0.01,
                                                correct_bias=True)
    scheduler = transformers.optimization.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=1500,
        num_training_steps=args.epochs * len(input_ids))

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(
            model, device_ids=[int(i) for i in args.device.split(',')])
        multi_gpu = True

    print('starting training')
    overall_step = 0
    running_loss = 0
    best_loss = 9999999
    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        acc_s = 0
        piece_num = 0
        model.train()
        for step in range(len(input_ids)):  # one document per step
            batch_inputs = torch.tensor(input_ids[step]).long().to(
                device).unsqueeze(0)
            batch_questions = [
                z['Question_token'] for z in input_question_list[step][:]
            ]
            batch_questions = torch.tensor(batch_questions).long().to(
                device).unsqueeze(0)
            batch_choices = [
                z['Choices_token'] for z in input_question_list[step][:]
            ]
            batch_choices = torch.tensor(batch_choices).long().to(
                device).unsqueeze(0)
            batch_labels = [z['Goal'] for z in input_question_list[step][:]]
            batch_labels = torch.tensor(batch_labels).long().to(
                device).unsqueeze(0)

            # forward pass
            outputs = model.forward(inputs=batch_inputs,
                                    questions=batch_questions,
                                    choices=batch_choices,
                                    labels=batch_labels)
            loss, pred, acc = outputs
            acc_s += (acc.cpu())
            running_loss += loss.item()

            # get loss
            if multi_gpu:
                loss = loss.mean()
            if gradient_accumulation > 1:
                loss = loss / gradient_accumulation

            # loss backward
            loss.backward()
            if (overall_step + 1) % gradient_accumulation == 0:
                optimizer.step()
                scheduler.step()
            # BUG FIX: overall_step was incremented both inside the
            # accumulation branch and here, which desynchronized the
            # accumulation cadence whenever gradient_accumulation > 1.
            overall_step += 1
            piece_num += 1

        running_loss = running_loss / len(input_ids)
        print('now time: {}:{}. epoch {}, loss {}, acc {:.6f}'.format(
            datetime.now().hour,
            datetime.now().minute,
            epoch + 1,
            running_loss * 1000,
            acc_s / len(input_ids)))

        # NOTE(review): the "best" loss is rescaled by len(resources) here
        # (original behavior, preserved) — confirm this is intentional.
        running_loss = running_loss * gradient_accumulation / len(resources)
        if running_loss < best_loss:
            best_loss = running_loss
            print('saving model for epoch {}'.format(epoch + 1))
            if not os.path.exists(output_dir + 'model_epoch{}'.format(epoch + 1)):
                os.mkdir(output_dir + 'model_epoch{}'.format(epoch + 1))
            model_to_save = model.module if hasattr(model, 'module') else model
            model_to_save.save_pretrained(output_dir + 'loss.best', optimizer, epoch)
        running_loss = 0

        # Validation: no gradients needed, outputs are identical.
        model.eval()
        val_accs = 0
        with torch.no_grad():
            for stepp in range(len(val_input_ids)):
                batch_inputs = torch.tensor(val_input_ids[stepp]).long().to(
                    device).unsqueeze(0)
                batch_questions = [
                    z['Question_token']
                    for z in val_input_question_list[stepp][:]
                ]
                batch_questions = torch.tensor(batch_questions).long().to(
                    device).unsqueeze(0)
                batch_choices = [
                    z['Choices_token']
                    for z in val_input_question_list[stepp][:]
                ]
                batch_choices = torch.tensor(batch_choices).long().to(
                    device).unsqueeze(0)
                batch_labels = [
                    z['Goal'] for z in val_input_question_list[stepp][:]
                ]
                batch_labels = torch.tensor(batch_labels).long().to(
                    device).unsqueeze(0)

                outputs = model.forward(inputs=batch_inputs,
                                        questions=batch_questions,
                                        choices=batch_choices,
                                        labels=batch_labels,
                                        training=False)
                loss, pred, acc = outputs
                val_accs += (acc)
        print('validation acc {}'.format(val_accs / (len(val_input_ids))))
        print('epoch {} finished'.format(epoch + 1))
        then = datetime.now()
        print('time: {}'.format(then))
        print('time for one epoch: {}'.format(then - now))

    print('training finished')
    if not os.path.exists(output_dir + 'final_model'):
        os.mkdir(output_dir + 'final_model')
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir + 'final.best', optimizer, epoch)
def main():
    """Repeatedly generate samples from ``--prefix`` with a trained GPT-2 model.

    Loops forever, producing ``--nsamples`` samples per pass via the project's
    ``generate`` helper, printing each one and optionally appending it to
    ``<save_samples_path>/samples.txt``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='生成设备')
    parser.add_argument('--length', default=-1, type=int, required=False, help='生成长度')
    parser.add_argument('--batch_size', default=1, type=int, required=False, help='生成的batch size')
    parser.add_argument('--nsamples', default=10, type=int, required=False, help='生成几个样本')
    parser.add_argument('--temperature', default=1, type=float, required=False, help='生成温度')
    parser.add_argument('--topk', default=8, type=int, required=False, help='最高几选一')
    parser.add_argument('--topp', default=0, type=float, required=False, help='最高积累概率')
    parser.add_argument('--model_config', default='config/model_config_small.json', type=str, required=False,
                        help='模型参数')
    parser.add_argument('--tokenizer_path', default='cache/vocab_small.txt', type=str, required=False, help='词表路径')
    parser.add_argument('--model_path', default='model/final_model', type=str, required=False, help='模型路径')
    parser.add_argument('--prefix', default='萧炎', type=str, required=False, help='生成文章的开头')
    parser.add_argument('--no_wordpiece', action='store_true', help='不做word piece切词')
    parser.add_argument('--segment', action='store_true', help='中文以词为单位')
    parser.add_argument('--fast_pattern', action='store_true', help='采用更加快的方式生成文本')
    parser.add_argument('--save_samples', action='store_true', help='保存产生的样本')
    parser.add_argument('--save_samples_path', default='.', type=str, required=False, help="保存样本的路径")
    parser.add_argument('--repetition_penalty', default=1.0, type=float, required=False)

    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    # Word-level tokenizer only when --segment is given.
    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs are visible
    length = args.length
    batch_size = args.batch_size
    nsamples = args.nsamples
    temperature = args.temperature
    topk = args.topk
    topp = args.topp
    repetition_penalty = args.repetition_penalty

    device = "cuda" if torch.cuda.is_available() else "cpu"

    tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    model = GPT2LMHeadModel.from_pretrained(args.model_path)
    model.to(device)
    model.eval()

    n_ctx = model.config.n_ctx

    if length == -1:
        # -1 means "use the model's full context window".
        length = model.config.n_ctx
    if args.save_samples:
        if not os.path.exists(args.save_samples_path):
            os.makedirs(args.save_samples_path)
        # NOTE(review): file handle is only closed on the generated == nsamples
        # exit path below; the surrounding while-loop never ends otherwise.
        samples_file = open(args.save_samples_path + '/samples.txt',
                            'w',
                            encoding='utf8')
    while True:
        raw_text = args.prefix
        context_tokens = tokenizer.convert_tokens_to_ids(
            tokenizer.tokenize(raw_text))
        generated = 0
        for _ in range(nsamples // batch_size):
            out = generate(n_ctx=n_ctx,
                           model=model,
                           context=context_tokens,
                           length=length,
                           is_fast_pattern=args.fast_pattern,
                           tokenizer=tokenizer,
                           temperature=temperature,
                           top_k=topk,
                           top_p=topp,
                           repitition_penalty=repetition_penalty,
                           device=device)
            for i in range(batch_size):
                generated += 1
                text = tokenizer.convert_ids_to_tokens(out)
                for i, item in enumerate(text[:-1]):  # ensure spaces around English words
                    if is_word(item) and is_word(text[i + 1]):
                        text[i] = item + ' '
                # Map special tokens to plain-text equivalents.
                for i, item in enumerate(text):
                    if item == '[MASK]':
                        text[i] = ''
                    elif item == '[CLS]':
                        text[i] = '\n\n'
                    elif item == '[SEP]':
                        text[i] = '\n'
                info = "=" * 40 + " SAMPLE " + str(
                    generated) + " " + "=" * 40 + "\n"
                print(info)
                text = ''.join(text).replace('##', '').replace('[UNK]', ' ').strip()
                print(text)
                if args.save_samples:
                    samples_file.write(info)
                    samples_file.write(text)
                    samples_file.write('\n')
                    samples_file.write('=' * 90)
                    samples_file.write('\n' * 2)
        print("=" * 80)
        if generated == nsamples:
            # close file when finish writing.
            if args.save_samples:
                samples_file.close()
            break
def main():
    """Train a GPT-2 LM over a streamed, pre-tokenized corpus (``iterData``).

    Consumes batches from the ``iterData`` generator until it yields
    ``'__STOP__'``, with optional fp16 (apex), multi-GPU DataParallel,
    gradient accumulation, periodic checkpoints every ``--steps_savemodel``
    steps, and a per-epoch and final ``save_pretrained``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='设置使用哪些显卡')
    parser.add_argument('--model_config', default='config/model_config_small.json', type=str, required=False,
                        help='选择模型参数')
    parser.add_argument('--tokenizer_path', default='cache/vocab_small.txt', type=str, required=False, help='选择词库')
    parser.add_argument('--raw_data_path', default='data/train.json', type=str, required=False, help='原始训练语料')
    parser.add_argument('--tokenized_data_path', default='data/tokenized/', type=str, required=False,
                        help='tokenized语料存放位置')
    parser.add_argument('--raw', action='store_true', help='是否先做tokenize')
    parser.add_argument('--epochs', default=5, type=int, required=False, help='训练循环')
    parser.add_argument('--batch_size', default=64, type=int, required=False, help='训练batch size')
    parser.add_argument('--lr', default=1.5e-4, type=float, required=False, help='学习率')
    parser.add_argument('--warmup_steps', default=2000, type=int, required=False, help='warm up步数')
    parser.add_argument('--log_step', default=1, type=int, required=False,
                        help='多少步汇报一次loss,设置为gradient accumulation的整数倍')
    parser.add_argument('--stride', default=768, type=int, required=False, help='训练时取训练数据的窗口步长')
    parser.add_argument('--gradient_accumulation', default=1, type=int, required=False, help='梯度积累')
    parser.add_argument('--fp16', action='store_true', help='混合精度')
    parser.add_argument('--fp16_opt_level', default='O1', type=str, required=False)
    parser.add_argument('--max_grad_norm', default=1.0, type=float, required=False)
    parser.add_argument('--num_pieces', default=100, type=int, required=False, help='将训练语料分成多少份')
    parser.add_argument('--min_length', default=128, type=int, required=False, help='最短收录文章长度')
    parser.add_argument('--max_length', default=256, type=int, required=False, help='最短收录文章长度')
    parser.add_argument('--output_dir', default='model/', type=str, required=False, help='模型输出路径')
    parser.add_argument('--pretrained_model', default='', type=str, required=False, help='模型训练起点路径')
    parser.add_argument('--writer_dir', default='tensorboard_summary/', type=str, required=False, help='Tensorboard路径')
    parser.add_argument('--segment', action='store_true', help='中文以词为单位')
    parser.add_argument('--bpe_token', action='store_true', help='subword')
    parser.add_argument('--encoder_json', default="tokenizations/encoder.json", type=str, help="encoder.json")
    parser.add_argument('--vocab_bpe', default="tokenizations/vocab.bpe", type=str, help="vocab.bpe")
    parser.add_argument('--max_steps_perEpoch_perPiece', default=1000000, type=int, required=False)
    parser.add_argument('--steps_savemodel', default=10000, type=int, required=False, help='保存模型步数')
    parser.add_argument('--padding', action='store_true', help='输入是否定长')

    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    #os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs are visible
    model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(args.model_config)
    print('config:\n' + model_config.to_json_string())
    n_ctx = model_config.n_ctx

    if args.bpe_token:
        full_tokenizer = get_encoder(args.encoder_json, args.vocab_bpe)
    else:
        full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = 999999

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw  # whether to (re)build the dataset from scratch
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    fp16 = args.fp16  # do not enable on cards without half-precision support
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    min_length = args.min_length
    output_dir = args.output_dir
    padding = args.padding
    max_length = args.max_length
    #tb_writer = SummaryWriter(log_dir=args.writer_dir)
    assert log_step % gradient_accumulation == 0

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    if not args.pretrained_model:
        model = transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    else:
        model = transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(args.pretrained_model)
    model.train()
    model.to(device)

    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print('number of parameters: {}'.format(num_parameters))

    multi_gpu = False
    optimizer = transformers.AdamW(model.parameters(), lr=lr, correct_bias=True)
    #scheduler = transformers.WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps,
    #                                              t_total=total_steps)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model, device_ids=[int(i) for i in args.device.split(',')])
        multi_gpu = True

    print('starting training')
    step_loss = 0
    running_loss = 10
    loss_ = 10
    # FIX: renamed from `iter`, which shadowed the builtin iter().
    data_iter = iterData(args.tokenized_data_path, rate=1.0, batch_size=batch_size, epochs=epochs)
    step = 0
    epoch0 = -1
    while True:
        data = next(data_iter)
        if data == '__STOP__':  # sentinel emitted by iterData when exhausted
            break
        epoch, epochs, idx_file, nb_files, batch_inputs = data
        random.shuffle(batch_inputs)
        batch_inputs = torch.tensor(batch_inputs).long().to(device)

        # forward pass (LM objective: labels == inputs)
        outputs = model.forward(input_ids=batch_inputs, labels=batch_inputs)
        loss, logits = outputs[:2]

        # get loss
        if multi_gpu:
            loss = loss.mean()
        if gradient_accumulation > 1:
            loss = loss / gradient_accumulation

        # loss backward
        if fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

        # optimizer step
        if (step + 1) % gradient_accumulation == 0:
            # NOTE(review): running_loss only accumulates on optimizer steps,
            # and the reported loss_ below rescales by gradient_accumulation
            # twice — confirm the logging formula is intentional.
            running_loss += loss.item()
            optimizer.step()
            optimizer.zero_grad()
            step_loss += 1
            #scheduler.step()
        if (step + 1) % log_step == 0:
            loss_ = running_loss * gradient_accumulation / (log_step / gradient_accumulation)
            print('now time: {}:{}. step: {}, progress-innerEpoch: {}/{}, progress-outerEpoch: {}/{}, loss {}'.format(
                datetime.now().hour,
                datetime.now().minute, step + 1, idx_file + 1, nb_files, epoch + 1, epochs, loss_))
            running_loss = 0
        if step % args.steps_savemodel == 0:
            print('saving model for epoch {}'.format(epoch + 1))
            output_dir_ = output_dir + 'model_epoch{}_step{}_loss-{}'.format(epoch + 1, step, '%0.2f' % loss_)
            if not os.path.exists(output_dir_):
                os.mkdir(output_dir_)
            model_to_save = model.module if hasattr(model, 'module') else model
            model_to_save.save_pretrained(output_dir_)
        step += 1
        # A change in the epoch reported by iterData marks an epoch boundary:
        # checkpoint once per new epoch.
        if epoch != epoch0:
            if not os.path.exists(output_dir + 'model_epoch{}'.format(epoch + 1)):
                os.mkdir(output_dir + 'model_epoch{}'.format(epoch + 1))
            model_to_save = model.module if hasattr(model, 'module') else model
            model_to_save.save_pretrained(output_dir + 'model_epoch{}'.format(epoch + 1))
            epoch0 = epoch
            print('epoch {} finished'.format(epoch + 1))

    if not os.path.exists(output_dir + 'final_model'):
        os.mkdir(output_dir + 'final_model')
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir + 'final_model')
    print('training finished')
def main():
    """Train a GPT-2 LM over a corpus pre-split into ``num_pieces`` token files.

    Each epoch shuffles the piece order, slices each piece into overlapping
    ``n_ctx``-token windows with step ``stride``, and trains with AdamW +
    WarmupLinearSchedule, optional fp16 (apex) and DataParallel. Checkpoints
    after every epoch and writes a ``final_model`` at the end.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='設置使用哪些顯卡')
    parser.add_argument('--model_config', default='config/model_config_small.json', type=str, required=False,
                        help='選擇模型參數')
    parser.add_argument('--tokenizer_path', default='cache/vocab_small.txt', type=str, required=False, help='選擇詞庫')
    parser.add_argument('--raw_data_path', default='data/train.json', type=str, required=False, help='原始訓練語料')
    parser.add_argument('--tokenized_data_path', default='data/tokenized/', type=str, required=False,
                        help='tokenized語料存放位置')
    parser.add_argument('--raw', action='store_true', help='是否先做tokenize')
    parser.add_argument('--epochs', default=5, type=int, required=False, help='訓練迴圈')
    parser.add_argument('--batch_size', default=8, type=int, required=False, help='訓練batch size')
    parser.add_argument('--lr', default=1.5e-4, type=float, required=False, help='學習率')
    parser.add_argument('--warmup_steps', default=2000, type=int, required=False, help='warm up步數')
    parser.add_argument('--log_step', default=1, type=int, required=False,
                        help='多少步彙報一次loss,設置為gradient accumulation的整數倍')
    parser.add_argument('--stride', default=768, type=int, required=False, help='訓練時取訓練資料的視窗步長')
    parser.add_argument('--gradient_accumulation', default=1, type=int, required=False, help='梯度積累')
    parser.add_argument('--fp16', action='store_true', help='混合精度')
    parser.add_argument('--fp16_opt_level', default='O1', type=str, required=False)
    parser.add_argument('--max_grad_norm', default=1.0, type=float, required=False)
    parser.add_argument('--num_pieces', default=100, type=int, required=False, help='將訓練語料分成多少份')
    parser.add_argument('--min_length', default=128, type=int, required=False, help='最短收錄文章長度')
    parser.add_argument('--output_dir', default='model/', type=str, required=False, help='模型輸出路徑')
    parser.add_argument('--pretrained_model', default='', type=str, required=False, help='模型訓練起點路徑')
    parser.add_argument('--writer_dir', default='tensorboard_summary/', type=str, required=False, help='Tensorboard路徑')
    parser.add_argument('--segment', action='store_true', help='中文以詞為單位')
    parser.add_argument('--bpe_token', action='store_true', help='subword')
    parser.add_argument('--encoder_json', default="tokenizations/encoder.json", type=str, help="encoder.json")
    parser.add_argument('--vocab_bpe', default="tokenizations/vocab.bpe", type=str, help="vocab.bpe")

    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    # Word-level tokenizer only when --segment is given.
    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs are visible
    model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(
        args.model_config)
    print('config:\n' + model_config.to_json_string())
    n_ctx = model_config.n_ctx

    if args.bpe_token:
        full_tokenizer = get_encoder(args.encoder_json, args.vocab_bpe)
    else:
        full_tokenizer = tokenization_bert.BertTokenizer(
            vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = 999999

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw  # whether to (re)build the dataset from scratch
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    fp16 = args.fp16  # do not enable on cards without half-precision support
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    min_length = args.min_length
    output_dir = args.output_dir
    tb_writer = SummaryWriter(log_dir=args.writer_dir)
    assert log_step % gradient_accumulation == 0

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    if raw:
        print('building files')
        build_files(data_path=raw_data_path,
                    tokenized_data_path=tokenized_data_path,
                    num_pieces=num_pieces,
                    full_tokenizer=full_tokenizer,
                    min_length=min_length)
        print('files built')

    if not args.pretrained_model:
        model = transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    else:
        model = transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(
            args.pretrained_model)
    model.train()
    model.to(device)

    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print('number of parameters: {}'.format(num_parameters))

    multi_gpu = False
    full_len = 0
    print('calculating total steps')
    # Total token count drives the linear-warmup schedule length.
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / stride * epochs / batch_size / gradient_accumulation)
    print('total steps = {}'.format(total_steps))

    optimizer = transformers.AdamW(model.parameters(), lr=lr, correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(optimizer,
                                                  warmup_steps=warmup_steps,
                                                  t_total=total_steps)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(
            model, device_ids=[int(i) for i in args.device.split(',')])
        multi_gpu = True

    print('starting training')
    overall_step = 0
    running_loss = 0
    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
        random.shuffle(x)  # visit the corpus pieces in random order each epoch
        piece_num = 0
        for i in x:
            with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
                line = f.read().strip()
            tokens = line.split()
            tokens = [int(token) for token in tokens]
            # Slice the piece into overlapping n_ctx windows with step `stride`;
            # a final window is anchored to the end so no tokens are dropped.
            start_point = 0
            samples = []
            while start_point < len(tokens) - n_ctx:
                samples.append(tokens[start_point:start_point + n_ctx])
                start_point += stride
            if start_point < len(tokens):
                samples.append(tokens[len(tokens) - n_ctx:])
            random.shuffle(samples)
            for step in range(len(samples) // batch_size):  # drop last
                # prepare data
                batch = samples[step * batch_size:(step + 1) * batch_size]
                batch_inputs = []
                for ids in batch:
                    int_ids = [int(x) for x in ids]
                    batch_inputs.append(int_ids)
                batch_inputs = torch.tensor(batch_inputs).long().to(device)

                # forward pass (LM objective: labels == inputs)
                outputs = model.forward(input_ids=batch_inputs, labels=batch_inputs)
                loss, logits = outputs[:2]

                # get loss
                if multi_gpu:
                    loss = loss.mean()
                if gradient_accumulation > 1:
                    loss = loss / gradient_accumulation

                # loss backward
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

                # optimizer step
                if (overall_step + 1) % gradient_accumulation == 0:
                    running_loss += loss.item()
                    optimizer.step()
                    optimizer.zero_grad()
                    scheduler.step()
                if (overall_step + 1) % log_step == 0:
                    tb_writer.add_scalar('loss',
                                         loss.item() * gradient_accumulation,
                                         overall_step)
                    print(
                        'now time: {}:{}. Step {} of piece {} of epoch {}, loss {}'
                        .format(
                            datetime.now().hour,
                            datetime.now().minute, step + 1, piece_num,
                            epoch + 1,
                            running_loss * gradient_accumulation /
                            (log_step / gradient_accumulation)))
                    running_loss = 0
                overall_step += 1
            piece_num += 1

        print('saving model for epoch {}'.format(epoch + 1))
        if not os.path.exists(output_dir + 'model_epoch{}'.format(epoch + 1)):
            os.mkdir(output_dir + 'model_epoch{}'.format(epoch + 1))
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(output_dir + 'model_epoch{}'.format(epoch + 1))
        # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1))
        # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1))
        print('epoch {} finished'.format(epoch + 1))

        then = datetime.now()
        print('time: {}'.format(then))
        print('time for one epoch: {}'.format(then - now))

    print('training finished')
    if not os.path.exists(output_dir + 'final_model'):
        os.mkdir(output_dir + 'final_model')
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir + 'final_model')
s.update(ts) #import pdb;pdb.set_trace() _ = list(map(process, lines)) keys = [word for word in s] with open(output_path, 'w') as fh: keys = [ key + '\n' for key in ['[SEP]', '[PAD]', '[CLS]', '[MASK]', '[UNK]'] + keys ] fh.writelines(keys) #generate_vocab('data/train.json', 'cache/vocab_wiki_small_new.txt') full_tokenizer = tokenization_bert.BertTokenizer( vocab_file='cache/vocab_wiki_small.txt') def tokenize_list(word_list): for word in word_list: print(full_tokenizer.convert_tokens_to_ids(word)) tokenize_list(['中国', '政府', '今天', '猫']) #with open('/home/t-linan/projects/GPT2-Chinese/data/tokenized/tokenized_train_0.txt', 'r') as fh: with open('data/tokenized/tokenized_train_0_False.txt', 'r') as fh: line = fh.readlines()[0] numbers = line.strip().split() print(len(numbers)) print(sum([int(number) == 4 for number in numbers])) """
def main():
    """CLI entry point: optionally tokenize a raw corpus, then train a GPT-2 LM.

    Data pieces are streamed with iterData(); checkpoints are written every
    --steps_savemodel optimizer-visible steps and at the end of each epoch,
    with a 'final_model' snapshot after training.
    """
    parser = argparse.ArgumentParser()
    # CLI flags (help strings are user-facing and intentionally left in Chinese).
    parser.add_argument('--device', default='4,5,6,7', type=str, required=False, help='设置使用哪些显卡')
    parser.add_argument('--model_config', default='config/model_config_small.json', type=str, required=False, help='选择模型参数')
    parser.add_argument('--tokenizer_path', default='cache/vocab_small.txt', type=str, required=False, help='选择词库')
    parser.add_argument('--raw_data_path', default='data/train.json', type=str, required=False, help='原始训练语料')
    parser.add_argument('--tokenized_data_path', default='data/tokenized/', type=str, required=False, help='tokenized语料存放位置')
    parser.add_argument('--raw', action='store_true', help='是否先做tokenize')
    parser.add_argument('--epochs', default=5, type=int, required=False, help='训练循环')
    parser.add_argument('--batch_size', default=8, type=int, required=False, help='训练batch size')
    parser.add_argument('--lr', default=1.5e-4, type=float, required=False, help='学习率')
    parser.add_argument('--warmup_steps', default=2000, type=int, required=False, help='warm up步数')
    parser.add_argument('--log_step', default=1, type=int, required=False, help='多少步汇报一次loss,设置为gradient accumulation的整数倍')
    parser.add_argument('--stride', default=768, type=int, required=False, help='训练时取训练数据的窗口步长')
    parser.add_argument('--gradient_accumulation', default=1, type=int, required=False, help='梯度积累')
    parser.add_argument('--fp16', action='store_true', help='混合精度')
    parser.add_argument('--fp16_opt_level', default='O1', type=str, required=False)
    parser.add_argument('--max_grad_norm', default=1.0, type=float, required=False)
    parser.add_argument('--num_pieces', default=100, type=int, required=False, help='将训练语料分成多少份')
    parser.add_argument('--min_length', default=128, type=int, required=False, help='最短收录文章长度')
    parser.add_argument('--output_dir', default='model/', type=str, required=False, help='模型输出路径')
    parser.add_argument('--pretrained_model', default='', type=str, required=False, help='模型训练起点路径')
    parser.add_argument('--writer_dir', default='tensorboard_summary/', type=str, required=False, help='Tensorboard路径')
    parser.add_argument('--segment', action='store_true', help='中文以词为单位')
    parser.add_argument('--do_train', action='store_true', help='do train')
    parser.add_argument('--bpe_token', action='store_true', help='subword')
    parser.add_argument('--encoder_json', default="tokenizations/encoder.json", type=str, help="encoder.json")
    parser.add_argument('--vocab_bpe', default="tokenizations/vocab.bpe", type=str, help="vocab.bpe")
    parser.add_argument('--max_steps_perEpoch_perPiece', default=1000000, type=int, required=False)
    parser.add_argument('--steps_savemodel', default=10000, type=int, required=False, help='保存模型步数')
    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    # Word-level vs character-level tokenization is chosen by --segment.
    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    # os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs to use
    model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(
        args.model_config)
    print('config:\n' + model_config.to_json_string())

    n_ctx = model_config.n_ctx
    if args.bpe_token:
        full_tokenizer = get_encoder(args.encoder_json, args.vocab_bpe)
    else:
        full_tokenizer = tokenization_bert.BertTokenizer(
            vocab_file=args.tokenizer_path)
    # Effectively disable the tokenizer's max-length truncation.
    full_tokenizer.max_len = 999999
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw  # whether to build the tokenized dataset from scratch
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    fp16 = args.fp16  # do not enable on GPUs without half-precision support
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    min_length = args.min_length
    output_dir = args.output_dir
    tb_writer = SummaryWriter(log_dir=args.writer_dir)
    # Logging cadence must align with optimizer steps.
    assert log_step % gradient_accumulation == 0

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    if raw:
        print('building files')
        build_files(data_path=raw_data_path,
                    tokenized_data_path=tokenized_data_path,
                    num_pieces=num_pieces,
                    full_tokenizer=full_tokenizer,
                    min_length=min_length)
        print('files built')
    if not args.do_train:
        # Tokenize-only mode: stop before any model work.
        return
    # num_pieces is re-derived from what is actually on disk.
    trainfiles = os.listdir(args.tokenized_data_path)
    num_pieces = len(trainfiles)

    if not args.pretrained_model:
        model = transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    else:
        model = transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(
            args.pretrained_model)
    model.train()
    model.to(device)

    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print('number of parameters: {}'.format(num_parameters))

    multi_gpu = False
    full_len = 0
    '''
    print('calculating total steps')
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / stride * epochs / batch_size / gradient_accumulation)
    print('total steps = {}'.format(total_steps))
    '''
    optimizer = transformers.AdamW(model.parameters(), lr=lr, correct_bias=True)
    # scheduler = transformers.WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps,
    #                                               t_total=total_steps)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=fp16_opt_level)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        # NOTE(review): device_ids is hard-coded to the first 4 GPUs — confirm
        # this matches the intended --device selection.
        model = DataParallel(model, device_ids=[i for i in range(4)])
        multi_gpu = True
    print('starting training')
    overall_step = 0
    running_loss = 0
    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        # NOTE: `iter` shadows the builtin; kept as-is to preserve behavior.
        iter = iterData(path_data=tokenized_data_path)
        while True:
            Data = next(iter)
            if Data == '__STOP__':
                break
            piece_num, _, samples = Data
            random.shuffle(samples)
            nb_steps = len(samples) // batch_size
            for step in range(nb_steps):  # drop last

                #  prepare data
                batch = samples[step * batch_size:(step + 1) * batch_size]
                batch_inputs = []
                for ids in batch:
                    int_ids = [int(x) for x in ids]
                    batch_inputs.append(int_ids)
                batch_inputs = torch.tensor(batch_inputs).long().to(device)

                #  forward pass (labels == inputs: standard LM objective)
                outputs = model.forward(input_ids=batch_inputs,
                                        labels=batch_inputs)
                loss, logits = outputs[:2]

                #  get loss
                if multi_gpu:
                    loss = loss.mean()
                if gradient_accumulation > 1:
                    loss = loss / gradient_accumulation

                #  loss backward
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   max_grad_norm)

                #  optimizer step — only every `gradient_accumulation` steps
                if (overall_step + 1) % gradient_accumulation == 0:
                    running_loss += loss.item()
                    optimizer.step()
                    optimizer.zero_grad()
                    # scheduler.step()
                if (overall_step + 1) % log_step == 0:
                    tb_writer.add_scalar('loss',
                                         loss.item() * gradient_accumulation,
                                         overall_step)
                    print(
                        'now time: {}:{}. Step {} (total {}) of piece {} (total {}) of epoch {}, loss {}'
                        .format(
                            datetime.now().hour,
                            datetime.now().minute, step + 1, nb_steps,
                            piece_num, num_pieces, epoch + 1,
                            running_loss * gradient_accumulation /
                            (log_step / gradient_accumulation)))
                    running_loss = 0
                # Mid-epoch checkpoint every --steps_savemodel steps.
                if overall_step % args.steps_savemodel == 0:
                    print('saving model for epoch {}'.format(epoch + 1))
                    if not os.path.exists(output_dir +
                                          'model_epoch{}_step{}'.
                                          format(epoch + 1, overall_step)):
                        os.mkdir(output_dir + 'model_epoch{}_step{}'.format(
                            epoch + 1, overall_step))
                    # Unwrap DataParallel before saving.
                    model_to_save = model.module if hasattr(
                        model, 'module') else model
                    model_to_save.save_pretrained(
                        output_dir +
                        'model_epoch{}_step{}'.format(epoch + 1,
                                                      overall_step))
                overall_step += 1
            piece_num += 1

        # End-of-epoch checkpoint.
        if not os.path.exists(output_dir + 'model_epoch{}'.format(epoch + 1)):
            os.mkdir(output_dir + 'model_epoch{}'.format(epoch + 1))
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(output_dir +
                                      'model_epoch{}'.format(epoch + 1))
        # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1))
        # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1))

        print('epoch {} finished'.format(epoch + 1))
        then = datetime.now()
        print('time: {}'.format(then))
        print('time for one epoch: {}'.format(then - now))

    # Final snapshot after all epochs.
    if not os.path.exists(output_dir + 'final_model'):
        os.mkdir(output_dir + 'final_model')
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir + 'final_model')
    print('training finished')
def main():
    """CLI entry point: generate samples for each prefix in a text file.

    A JSON config (--path_generateConfig), if present, overrides several CLI
    arguments. For every prefix, nsamples samples are generated and optionally
    written to --save_samples_path.

    NOTE(review): --fast_pattern and --save_samples combine default=True with
    action='store_true', so they are effectively always True — confirm intent.
    NOTE(review): the `while True` loop only breaks when generated == nsamples,
    which holds only if batch_size divides nsamples (default batch_size=1 is
    safe) — otherwise this loops forever; verify with callers.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--path_generateConfig', default='config.json', type=str, required=False, help='生成配置')
    parser.add_argument('--path_texts', default='texts.txt', type=str, required=False, help='文本集')
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='生成设备')
    parser.add_argument('--length', default=50, type=int, required=False, help='生成长度')
    parser.add_argument('--batch_size', default=1, type=int, required=False, help='生成的batch size')
    parser.add_argument('--nsamples', default=10, type=int, required=False, help='生成几个样本')
    parser.add_argument('--temperature', default=1, type=float, required=False, help='生成温度')
    parser.add_argument('--topk', default=8, type=int, required=False, help='最高几选一')
    parser.add_argument('--topp', default=0, type=float, required=False, help='最高积累概率')
    parser.add_argument('--model_config', default='config/model_config_small.json', type=str, required=False, help='模型参数')
    parser.add_argument('--tokenizer_path', default='data/vocab.txt', type=str, required=False, help='词表路径')
    parser.add_argument('--model_path', default='model/model-test/model_epoch2931/', type=str, required=False, help='模型路径')
    parser.add_argument('--prefix', default='萧炎', type=str, required=False, help='生成文章的开头')
    parser.add_argument('--no_wordpiece', action='store_true', help='不做word piece切词')
    parser.add_argument('--segment', action='store_true', help='中文以词为单位')
    parser.add_argument('--fast_pattern', default=True, action='store_true', help='采用更加快的方式生成文本')
    parser.add_argument('--save_samples', default=True, action='store_true', help='保存产生的样本')
    parser.add_argument('--save_samples_path', default='./test/', type=str, required=False, help="保存样本的路径")
    parser.add_argument('--repetition_penalty', default=1.0, type=float, required=False)
    parser.add_argument('--use_gpu', default=False, help='是否使用GPU')
    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    # A JSON generation config overrides selected CLI values when present.
    if os.path.exists(args.path_generateConfig):
        with open(args.path_generateConfig, 'r') as f:
            config = json.load(f)
            args.nsamples = config['nsamples']
            args.model_config = config['model_config']
            args.tokenizer_path = config['tokenizer_path']
            args.model_path = config['model_path']
            args.save_samples_path = config['save_samples_path']
    # One prefix per line; fall back to --prefix when the file is absent.
    if os.path.exists(args.path_texts):
        with open(args.path_texts, 'r') as f:
            texts = f.read().strip().split('\n')
    else:
        texts = [args.prefix]

    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs to use
    length = args.length
    batch_size = args.batch_size
    nsamples = args.nsamples
    temperature = args.temperature
    topk = args.topk
    topp = args.topp
    repetition_penalty = args.repetition_penalty

    if args.use_gpu:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    else:
        device = 'cpu'

    tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    model = GPT2LMHeadModel.from_pretrained(args.model_path)
    model.to(device)
    model.eval()

    # Count and report the total number of model parameters.
    params = list(model.parameters())
    k = 0
    for i in params:
        l = 1
        #print("该层的结构:" + str(list(i.size())))
        for j in i.size():
            l *= j
        #print("该层参数和:" + str(l))
        k = k + l
    print("总参数数量和:" + str(k))

    n_ctx = model.config.n_ctx
    if length == -1:
        # -1 means "use the model's full context window".
        length = model.config.n_ctx
    for prefix in texts:
        if args.save_samples:
            if not os.path.exists(args.save_samples_path):
                os.makedirs(args.save_samples_path)
            samples_file = open(
                os.path.join(args.save_samples_path,
                             'samples_' + prefix + '.txt'), 'w')
            print(
                os.path.join(args.save_samples_path,
                             'samples_' + prefix + '.txt'))
        while True:
            raw_text = prefix
            context_tokens = tokenizer.convert_tokens_to_ids(
                tokenizer.tokenize(raw_text))
            generated = 0
            for _ in range(nsamples // batch_size):
                out = generate(n_ctx=n_ctx,
                               model=model,
                               context=context_tokens,
                               length=length,
                               is_fast_pattern=args.fast_pattern,
                               tokenizer=tokenizer,
                               temperature=temperature,
                               top_k=topk,
                               top_p=topp,
                               repitition_penalty=repetition_penalty,
                               device=device)
                for i in range(batch_size):
                    generated += 1
                    text = tokenizer.convert_ids_to_tokens(out)
                    # Ensure a space between consecutive English words.
                    for i, item in enumerate(text[:-1]):
                        if is_word(item) and is_word(text[i + 1]):
                            text[i] = item + ' '
                    # Map special tokens to whitespace/newlines for display.
                    for i, item in enumerate(text):
                        if item == '[MASK]':
                            text[i] = ''
                        elif item == '[CLS]':
                            text[i] = '\n\n'
                        elif item == '[SEP]':
                            text[i] = '\n'
                    info = "=" * 40 + " SAMPLE " + str(
                        generated) + " " + "=" * 40 + "\n"
                    print(info)
                    text = ''.join(text).replace('##', '').strip()
                    # print(text)
                    # Only the first line (up to the first [SEP]) is kept.
                    print(text.split('\n')[0])
                    if args.save_samples:
                        samples_file.write(info)
                        samples_file.write(text.split('\n')[0])
                        samples_file.write('\n')
                        samples_file.write('=' * 90)
                        samples_file.write('\n' * 2)
                    print("=" * 80)
            if generated == nsamples:
                # close file when finish writing.
                if args.save_samples:
                    samples_file.close()
                break
def main():
    """CLI entry point: generate articles from a list of titles.

    For every title, --articles_per_title articles are sampled from a trained
    GPT-2 model; each article is written to '<save_path><title_idx>-<j>.txt'.

    Bug fixed: the original code reused the loop index `i` for both the outer
    `enumerate(titles)` loop and the inner `enumerate(text)` cleanup loops, so
    after the first article the filename index was clobbered by the inner
    loops and subsequent articles were written under wrong/colliding names.
    The inner loops now use a distinct index `k`.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='設置使用哪些顯卡')
    parser.add_argument('--length', default=-1, type=int, required=False, help='生成長度')
    parser.add_argument('--temperature', default=1, type=float, required=False, help='生成溫度,越高越隨機')
    parser.add_argument('--topk', default=8, type=int, required=False, help='生成的時候最高幾選一')
    parser.add_argument('--topp', default=0, type=float, required=False, help='生成的時候積累概率最高多少')
    parser.add_argument('--model_config', default='config/model_config_small.json', type=str, required=False, help='模型參數路徑')
    parser.add_argument('--tokenizer_path', default='cache/vocab_small.txt', type=str, required=False, help='詞表路徑')
    parser.add_argument('--model_path', default='model/final_model', type=str, required=False, help='模型路徑')
    parser.add_argument('--save_path', default='generated/', type=str, required=False, help='存放生成的檔的路徑')
    parser.add_argument('--articles_per_title', default=5, type=int, required=False, help='每個標題生成多少篇文章')
    parser.add_argument('--titles', default='蕭炎', type=str, required=False, help='標題清單,是一個字串,用空格分開')
    parser.add_argument('--titles_file', default='', type=str, required=False, help='標題列表檔,檔中每行一個標題。如果這個選項有值則titles無效')
    parser.add_argument('--no_wordpiece', action='store_true', help='不做word piece切詞')
    parser.add_argument('--segment', action='store_true', help='中文以詞為單位')
    parser.add_argument('--repetition_penalty', default=1.0, type=float, required=False)
    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs to use
    length = args.length
    temperature = args.temperature
    topk = args.topk
    topp = args.topp
    repetition_penalty = args.repetition_penalty

    # One generated title per list element; --titles_file overrides --titles.
    titles = args.titles.split()
    if args.titles_file:
        with open(args.titles_file, 'r') as f:
            titles = [line.strip('\n') for line in f.readlines()]
    articles_per_title = args.articles_per_title  # articles per title
    save_path = args.save_path  # output directory

    device = "cuda" if torch.cuda.is_available() else "cpu"

    tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    model = GPT2LMHeadModel.from_pretrained(args.model_path)
    model.to(device)
    model.eval()

    n_ctx = model.config.n_ctx

    if not os.path.exists(save_path):
        os.mkdir(save_path)
    if length == -1:
        # -1 means "use the model's full context window".
        length = model.config.n_ctx

    for title_idx, title in enumerate(titles):
        for j in range(articles_per_title):
            # Filename index must be the title's index, not an inner-loop one.
            with open(save_path + str(title_idx) + '-' + str(j) + '.txt',
                      'w') as f:
                context_tokens = tokenizer.convert_tokens_to_ids(
                    tokenizer.tokenize(title))
                generated = 0
                out = sample_sequence(n_ctx=n_ctx,
                                      model=model,
                                      length=length,
                                      context=context_tokens,
                                      tokenizer=tokenizer,
                                      temperature=temperature,
                                      top_k=topk,
                                      top_p=topp,
                                      repitition_penalty=repetition_penalty,
                                      device=device)
                out = out.tolist()[0]
                generated += 1
                text = tokenizer.convert_ids_to_tokens(out)
                # Ensure a space between consecutive English words.
                for k, item in enumerate(text[:-1]):
                    if is_word(item) and is_word(text[k + 1]):
                        text[k] = item + ' '
                # Map special tokens to whitespace/newlines for display.
                for k, item in enumerate(text):
                    if item == '[MASK]':
                        text[k] = ''
                    if item == '[CLS]' or item == '[SEP]':
                        text[k] = '\n'
                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
                text = ''.join(text).replace('##', '').strip()
                # text = ''.join(text.split('\n')[:-1])
                print(text)
                f.write(text + '\n')
                print("=" * 80)
def main():
    """CLI entry point: tokenize a corpus (optional) and fine-tune GPT-2.

    Uses a linear warmup/decay schedule sized from the corpus, optional apex
    fp16, and optional 2-GPU model parallelism. Checkpoints are saved every
    --epoch_save epochs and as 'final_model' at the end.

    Bug fixed: the per-step log timestamp was computed with a hard-coded
    `datetime.utcnow() + timedelta(hours=8)`, silently ignoring --timezone
    even though `get_time` already encapsulates the configured offset. The
    log line now uses `get_time()` so all timestamps honor --timezone.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='設定要使用的顯卡,以逗號區隔')
    parser.add_argument('--model_config', type=str, required=False, help='模型參數設定檔的路徑')
    parser.add_argument('--tokenizer_path', type=str, required=True, help='選擇字典檔的路徑')
    parser.add_argument('--raw_data_path', type=str, required=True, help='訓練用語料庫的路徑')
    parser.add_argument('--tokenized_data_path', default='data/tokenized/', type=str, required=False, help='語料庫 Tokenized 後的存放路徑')
    parser.add_argument('--raw', action='store_true', help='是否已做過 Tokenization')
    parser.add_argument('--epochs', default=5, type=int, required=False, help='設定 Epochs')
    parser.add_argument('--batch_size', default=8, type=int, required=False, help='設定 Batch Size')
    parser.add_argument('--lr', default=3e-5, type=float, required=False, help='設定 Learning Rate')
    parser.add_argument('--warmup_steps', default=0.1, type=float, required=False, help='設定 Warmup Steps 的比例')
    parser.add_argument('--log_step', default=1, type=int, required=False, help='Loss 紀錄的間隔,必須是 Gradient Accumulation 的整數倍')
    parser.add_argument('--stride', default=768, type=int, required=False, help='設定訓練語料庫的窗口大小')
    parser.add_argument('--gradient_accumulation', default=1, type=int, required=False, help='梯度累積')
    parser.add_argument('--fp16', action='store_true', help='是否使用半精度浮點數')
    parser.add_argument('--fp16_opt_level', default='O1', type=str, required=False)
    parser.add_argument('--max_grad_norm', default=1.0, type=float, required=False)
    parser.add_argument('--num_pieces', default=100, type=int, required=False, help='將訓練語料庫分成多少份')
    parser.add_argument('--min_length', default=1, type=int, required=False, help='文章最短長度,若文章長度不足將被捨棄')
    parser.add_argument('--output_dir', type=str, required=True, help='模型輸出路徑')
    parser.add_argument('--pretrained_model', default='', type=str, required=False, help='模型起始路徑')
    parser.add_argument('--writer_dir', default='tensorboard_summary/', type=str, required=False, help='Tensorboard 輸出路徑')
    parser.add_argument('--segment', action='store_true', help='是否以詞為單位')
    parser.add_argument('--bpe_token', action='store_true', help='使用 Byte Pair Encoding')
    parser.add_argument('--encoder_json', default='tokenizations/encoder.json', type=str, help='encoder.json')
    parser.add_argument('--vocab_bpe', default='tokenizations/vocab.bpe', type=str, help='vocab.bpe')
    parser.add_argument('--timezone', default=8, type=int, help='手動指定時區,預設為 GMT+8')
    parser.add_argument('--epoch_save', default=1, type=int, help='每隔幾個 Epoch 就存一次權重')
    args = parser.parse_args()
    print(f'Arguments: {args.__repr__()}')

    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    # Select which GPUs to use.
    os.environ['CUDA_VISIBLE_DEVICES'] = args.device
    model_config = transformers.GPT2Config.from_json_file(args.model_config)
    print(f'Config:\n{model_config.to_json_string()}')

    n_ctx = model_config.n_ctx
    if args.bpe_token:
        full_tokenizer = get_encoder(args.encoder_json, args.vocab_bpe)
    else:
        full_tokenizer = tokenization_bert.BertTokenizer(
            vocab_file=args.tokenizer_path,
            do_lower_case=False,
            do_basic_tokenize=False)
    # Effectively disable the tokenizer's max-length truncation.
    full_tokenizer.max_len = 999999
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f'Using Device: {device.upper()}')

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    # Do not enable fp16 on GPUs without half-precision support.
    fp16 = args.fp16
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    min_length = args.min_length
    output_dir = args.output_dir
    tz = args.timezone
    strlen = lambda n: len(str(n))
    # All displayed timestamps are UTC shifted by the configured timezone.
    get_time = lambda: datetime.utcnow() + timedelta(hours=tz)
    tb_writer = SummaryWriter(log_dir=args.writer_dir)
    # Logging cadence must align with optimizer steps.
    assert log_step % gradient_accumulation == 0

    os.makedirs(output_dir, exist_ok=True)

    if raw:
        print('Building from Raw Data')
        build_files(data_path=raw_data_path,
                    tokenized_data_path=tokenized_data_path,
                    num_pieces=num_pieces,
                    tokenizer=full_tokenizer,
                    min_length=min_length)

    if not args.pretrained_model:
        model = transformers.GPT2LMHeadModel(config=model_config)
    else:
        model = transformers.GPT2LMHeadModel.from_pretrained(
            args.pretrained_model)

    if torch.cuda.device_count() == 2:
        # Hand-tuned layer split for exactly two GPUs.
        # NOTE(review): layer indices assume a 12-layer model — confirm against
        # the config before reusing with other sizes.
        device_map = {
            0: [0, 1, 2, 3, 4],
            1: [5, 6, 7, 8, 9, 10, 11],
        }
        model.parallelize(device_map)
        # model.parallelize()
        print('Model Parallelism!')

    model.train()
    if torch.cuda.device_count() < 2:
        model.to(device)

    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print(f'Number of Parameters: {num_parameters}')

    multi_gpu = False
    full_len = 0
    # Size the LR schedule from the total token count across all pieces.
    print('Calculating Total Steps')
    for i in tqdm(range(num_pieces)):
        _fpath = os.path.join(tokenized_data_path, f'tokenized_train_{i}.txt')
        with open(_fpath, 'r') as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / stride * epochs / batch_size /
                      gradient_accumulation)
    # --warmup_steps is a ratio; convert it to an absolute step count.
    warmup_steps = int(total_steps * warmup_steps)
    print(f'Total Steps: {total_steps}')

    optimizer = transformers.AdamW(model.parameters(),
                                   lr=lr,
                                   correct_bias=True)
    scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                'Please install apex from https://www.github.com/nvidia/apex to use fp16 training.'
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=fp16_opt_level)

    # if torch.cuda.device_count() > 1:
    #     print(f'Using {torch.cuda.device_count()} GPUs')
    #     model = DataParallel(
    #         model, device_ids=[int(i) for i in args.device.split(',')])
    #     model.to(f'cuda:{model.device_ids[0]}')
    #     multi_gpu = True

    with TimeCost('Training'):
        print('Training Begin')
        overall_step = 0
        running_loss = 0
        for epoch in range(epochs):
            now = get_time()
            print(f'Epoch {epoch + 1} - Time: {now}')
            # Visit the pieces in a fresh random order each epoch.
            x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
            random.shuffle(x)
            piece_num = 0
            for i in x:
                _fpath = os.path.join(tokenized_data_path,
                                      f'tokenized_train_{i}.txt')
                with open(_fpath, 'r') as f:
                    line = f.read().strip()
                tokens = line.split()
                tokens = [int(token) for token in tokens]
                # Slide a window of n_ctx tokens with the configured stride;
                # the tail gets one final right-aligned window.
                start_point = 0
                samples = []
                while start_point < len(tokens) - n_ctx:
                    samples.append(tokens[start_point:start_point + n_ctx])
                    start_point += stride
                if start_point < len(tokens):
                    idx = len(tokens) - n_ctx
                    samples.append(tokens[idx:])
                print(f'Tokenize {i} Sample Size: {len(samples)}')
                random.shuffle(samples)

                # Drop the last incomplete batch.
                _steps = len(samples) // batch_size
                # If there are fewer samples than the batch size there would be
                # zero steps; keep num_pieces small enough to avoid this.
                _steps = 1 if _steps <= 0 else _steps
                for step in range(_steps):

                    # prepare data
                    batch = samples[step * batch_size:(step + 1) * batch_size]
                    batch_inputs = []
                    for ids in batch:
                        int_ids = [int(x) for x in ids]
                        batch_inputs.append(int_ids)
                    # With model parallelism the inputs must start on cuda:0.
                    _device = 'cuda:0' if torch.cuda.device_count(
                    ) > 1 else device
                    batch_inputs = torch.tensor(batch_inputs).long().to(
                        _device)

                    # forward pass (labels == inputs: standard LM objective)
                    outputs = model.forward(input_ids=batch_inputs,
                                            labels=batch_inputs)
                    loss, _ = outputs[:2]

                    # get loss
                    if multi_gpu:
                        loss = loss.mean()
                    if gradient_accumulation > 1:
                        loss = loss / gradient_accumulation

                    # loss backward
                    if fp16:
                        with amp.scale_loss(loss, optimizer) as scaled_loss:
                            scaled_loss.backward()
                            torch.nn.utils.clip_grad_norm_(
                                amp.master_params(optimizer), max_grad_norm)
                    else:
                        loss.backward()
                        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       max_grad_norm)

                    # optimizer step — only every `gradient_accumulation` steps
                    if (overall_step + 1) % gradient_accumulation == 0:
                        running_loss += loss.item()
                        optimizer.step()
                        optimizer.zero_grad()
                        scheduler.step()
                    if (overall_step + 1) % log_step == 0:
                        tb_writer.add_scalar(
                            'loss',
                            loss.item() * gradient_accumulation,
                            overall_step)
                        # FIX: honor --timezone instead of hard-coded GMT+8.
                        ts = get_time().strftime('%H:%M:%S')
                        display_loss = running_loss * gradient_accumulation
                        display_loss /= log_step / gradient_accumulation
                        print(
                            f'Time {ts} - '
                            f'Epoch {epoch + 1:{strlen(epochs)}d}/{epochs} - '
                            f'Step {step + 1:{strlen(_steps)}d}/{_steps} - '
                            f'Piece {piece_num + 1:{strlen(num_pieces)}d}/{num_pieces} - '
                            f'Loss {display_loss:.4f}')
                        running_loss = 0
                    overall_step += 1
                piece_num += 1

            if (epoch + 1) % args.epoch_save == 0:
                print(f'Saving Model of Epoch {epoch + 1}')
                model_output_dir = os.path.join(output_dir,
                                                f'model_epoch{epoch + 1}')
                os.makedirs(model_output_dir, exist_ok=True)
                # Unwrap DataParallel before saving.
                model_to_save = model.module if hasattr(model,
                                                        'module') else model
                model_to_save.save_pretrained(model_output_dir)

            then = get_time()
            print(f'Epoch {epoch + 1} Finished - Time: {then}')
            delta = (then - now).total_seconds()
            mm, ss = delta // 60, delta % 60
            hh, mm = mm // 60, mm % 60
            print(
                f'Time Cost of the Epoch {epoch + 1} - {hh:.0f}:{mm:.0f}:{ss:.2f}'
            )

        print('Training Done')
        model_output_dir = os.path.join(output_dir, 'final_model')
        os.makedirs(model_output_dir, exist_ok=True)
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(model_output_dir)
def main():
    """CLI entry point: tokenize a corpus (optional) and train GPT-2
    with the pytorch_transformers API (AdamW + WarmupLinearSchedule).

    The corpus is read as num_pieces tokenized files; each epoch shuffles the
    piece order, windows each piece into n_ctx-token samples with the given
    stride, and checkpoints after every epoch and at the end.
    """
    parser = argparse.ArgumentParser()
    # CLI flags (help strings are user-facing and intentionally left in Chinese).
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='设置使用哪些显卡')
    parser.add_argument('--model_config', default='config/model_config_small.json', type=str, required=False, help='选择模型参数')
    parser.add_argument('--tokenizer_path', default='cache/vocab_small.txt', type=str, required=False, help='选择词库')
    parser.add_argument('--raw_data_path', default='data/train.json', type=str, required=False, help='原始训练语料')
    parser.add_argument('--tokenized_data_path', default='data/tokenized/', type=str, required=False, help='tokenized语料存放位置')
    parser.add_argument('--raw', action='store_true', help='是否先做tokenize')
    parser.add_argument('--epochs', default=5, type=int, required=False, help='训练循环')
    parser.add_argument('--batch_size', default=8, type=int, required=False, help='训练batch size')
    parser.add_argument('--lr', default=1.5e-4, type=float, required=False, help='学习率')
    parser.add_argument('--warmup_steps', default=2000, type=int, required=False, help='warm up步数')
    parser.add_argument('--log_step', default=1, type=int, required=False, help='多少步汇报一次loss')
    parser.add_argument('--stride', default=768, type=int, required=False, help='训练时取训练数据的窗口步长')
    parser.add_argument('--gradient_accumulation', default=1, type=int, required=False, help='梯度积累')
    parser.add_argument('--fp16', action='store_true', help='混合精度')
    parser.add_argument('--fp16_opt_level', default='O1', type=str, required=False)
    parser.add_argument('--max_grad_norm', default=1.0, type=float, required=False)
    parser.add_argument('--num_pieces', default=100, type=int, required=False, help='将训练语料分成多少份')
    parser.add_argument('--output_dir', default='model/', type=str, required=False, help='模型输出路径')
    parser.add_argument('--pretrained_model', default='', type=str, required=False, help='模型训练起点路径')
    parser.add_argument('--segment', action='store_true', help='中文以词为单位')
    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs to use
    model_config = pytorch_transformers.modeling_gpt2.GPT2Config.from_json_file(
        args.model_config)
    print('config:\n' + model_config.to_json_string())

    n_ctx = model_config.n_ctx
    full_tokenizer = tokenization_bert.BertTokenizer(
        vocab_file=args.tokenizer_path)
    # Cap tokenizer length at the model's context window.
    full_tokenizer.max_len = n_ctx
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw  # whether to build the tokenized dataset from scratch
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    fp16 = args.fp16  # do not enable on GPUs without half-precision support
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    output_dir = args.output_dir

    if raw:
        print('building files')
        build_files(raw_data_path=raw_data_path,
                    tokenized_data_path=tokenized_data_path,
                    full_tokenizer=full_tokenizer,
                    num_pieces=num_pieces)
        print('files built')

    if not args.pretrained_model:
        model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel(
            config=model_config)
    else:
        model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(
            args.pretrained_model)
    model.train()
    model.to(device)
    multi_gpu = False
    full_len = 0
    # Size the LR schedule from the total token count across all pieces.
    print('calculating total steps')
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i),
                  'r') as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / stride * epochs / batch_size /
                      gradient_accumulation)
    print('total steps = {}'.format(total_steps))

    optimizer = pytorch_transformers.AdamW(model.parameters(),
                                           lr=lr,
                                           correct_bias=True)
    scheduler = pytorch_transformers.WarmupLinearSchedule(
        optimizer, warmup_steps=warmup_steps, t_total=total_steps)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=fp16_opt_level)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model)
        multi_gpu = True
    print('starting training')
    running_loss = 0
    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        # Visit the pieces in a fresh random order each epoch.
        x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
        random.shuffle(x)
        piece_num = 0
        for i in x:
            with open(
                    tokenized_data_path + 'tokenized_train_{}.txt'.format(i),
                    'r') as f:
                line = f.read().strip()
            tokens = line.split()
            tokens = [int(token) for token in tokens]
            # Slide a window of n_ctx tokens with the configured stride;
            # the tail gets one final right-aligned window.
            start_point = 0
            samples = []
            while start_point < len(tokens) - n_ctx:
                samples.append(tokens[start_point:start_point + n_ctx])
                start_point += stride
            if start_point < len(tokens):
                samples.append(tokens[len(tokens) - n_ctx:])
            random.shuffle(samples)
            for step in range(len(samples) // batch_size):

                #  prepare data (labels are a copy of the inputs: LM objective)
                batch = samples[step * batch_size:(step + 1) * batch_size]
                batch_labels = []
                batch_inputs = []
                for ids in batch:
                    int_ids_for_labels = [int(x) for x in ids]
                    int_ids_for_inputs = [int(x) for x in ids]
                    batch_labels.append(int_ids_for_labels)
                    batch_inputs.append(int_ids_for_inputs)
                batch_labels = torch.tensor(batch_labels).long().to(device)
                batch_inputs = torch.tensor(batch_inputs).long().to(device)

                #  forward pass
                outputs = model.forward(input_ids=batch_inputs,
                                        labels=batch_labels)
                loss, logits = outputs[:2]

                #  get loss
                if multi_gpu:
                    loss = loss.mean()
                if gradient_accumulation > 1:
                    loss = loss / gradient_accumulation

                #  loss backward
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   max_grad_norm)

                #  optimizer step — only every `gradient_accumulation` steps
                # NOTE(review): `step` resets per piece, so with
                # gradient_accumulation > 1 leftover gradients at a piece
                # boundary carry into the next piece — confirm intent.
                if (step + 1) % gradient_accumulation == 0:
                    running_loss += loss.item()
                    optimizer.step()
                    optimizer.zero_grad()
                    scheduler.step()
                if (step + 1) % log_step == 0:
                    print(
                        'now time: {}:{}. Step {} of piece {} of epoch {}, loss {}'
                        .format(
                            datetime.now().hour,
                            datetime.now().minute,
                            (step + 1) // gradient_accumulation, piece_num,
                            epoch + 1,
                            running_loss * gradient_accumulation / log_step))
                    running_loss = 0
            piece_num += 1

        # End-of-epoch checkpoint.
        print('saving model for epoch {}'.format(epoch + 1))
        if not os.path.exists(output_dir + 'model_epoch{}'.format(epoch + 1)):
            os.mkdir(output_dir + 'model_epoch{}'.format(epoch + 1))
        # Unwrap DataParallel before saving.
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(output_dir +
                                      'model_epoch{}'.format(epoch + 1))
        # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1))
        # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1))

        print('epoch {} finished'.format(epoch + 1))
        then = datetime.now()
        print('time: {}'.format(then))
        print('time for one epoch: {}'.format(then - now))

    print('training finished')
    if not os.path.exists(output_dir + 'final_model'):
        os.mkdir(output_dir + 'final_model')
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir + 'final_model')
def main():
    """Run inference for the multiple-choice classifier.

    Parses CLI arguments, tokenizes the raw validation data, loads the
    fine-tuned model from ``output_dir + 'loss.best'``, predicts a choice
    (0-3) for every question, then writes the raw predictions to
    ``data/answer.data`` (pickle) and an id/label table to ``submit.csv``.

    Side effects: sets CUDA_VISIBLE_DEVICES, creates output_dir if missing,
    writes the two result files above.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0', type=str, required=False,
                        help='设置使用哪些显卡')
    parser.add_argument('--model_config', default='gpt2/config.json', type=str,
                        required=False, help='选择模型参数')
    parser.add_argument('--tokenizer_path', default='cache/vocab_small.txt',
                        type=str, required=False, help='选择词库')
    parser.add_argument('--raw_data_path', default='data/train.json', type=str,
                        required=False, help='原始训练语料')
    parser.add_argument('--tokenized_data_path', default='data/tokenized/',
                        type=str, required=False, help='tokenized语料存放位置')
    parser.add_argument('--raw', action='store_true', help='是否先做tokenize')
    parser.add_argument('--epochs', default=100, type=int, required=False,
                        help='训练循环')
    parser.add_argument('--batch_size', default=1, type=int, required=False,
                        help='训练batch size')
    parser.add_argument('--lr', default=1.5e-4, type=float, required=False,
                        help='学习率')
    parser.add_argument('--warmup_steps', default=10000, type=int,
                        required=False, help='warm up步数')
    # BUG FIX: this argument was commented out while `args.log_step` is still
    # read below, which raised AttributeError on every run. Restored with its
    # original default and help text.
    parser.add_argument('--log_step', default=2, type=int, required=False,
                        help='多少步汇报一次loss,设置为gradient accumulation的整数倍')
    parser.add_argument('--stride', default=768, type=int, required=False,
                        help='训练时取训练数据的窗口步长')
    parser.add_argument('--gradient_accumulation', default=1, type=int,
                        required=False, help='梯度积累')
    parser.add_argument('--fp16', action='store_true', help='混合精度')
    parser.add_argument('--fp16_opt_level', default='O1', type=str,
                        required=False)
    parser.add_argument('--max_grad_norm', default=1.0, type=float,
                        required=False)
    parser.add_argument('--num_pieces', default=100, type=int, required=False,
                        help='将训练语料分成多少份')
    parser.add_argument('--min_length', default=1, type=int, required=False,
                        help='最短收录文章长度')
    parser.add_argument('--output_dir', default='model_classfier/', type=str,
                        required=False, help='模型输出路径')
    parser.add_argument('--pretrained_model', default='', type=str,
                        required=False, help='模型训练起点路径')
    parser.add_argument('--writer_dir', default='tensorboard_summary/',
                        type=str, required=False, help='Tensorboard路径')
    parser.add_argument('--segment', action='store_true', help='中文以词为单位')
    parser.add_argument('--bpe_token', action='store_true', help='subword')
    parser.add_argument('--encoder_json', default="tokenizations/encoder.json",
                        type=str, help="encoder.json")
    parser.add_argument('--vocab_bpe', default="tokenizations/vocab.bpe",
                        type=str, help="vocab.bpe")
    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    # Word-level vs char-level BERT tokenization for Chinese text.
    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs this process may use

    model_config = transformers.GPT2Config.from_json_file(args.model_config)
    print('config:\n' + model_config.to_json_string())
    n_ctx = model_config.n_ctx

    if args.bpe_token:
        full_tokenizer = get_encoder(args.encoder_json, args.vocab_bpe)
    else:
        full_tokenizer = tokenization_bert.BertTokenizer(
            vocab_file=args.tokenizer_path)
    # Effectively disable the tokenizer's length warning/truncation.
    full_tokenizer.max_len = 999999

    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    raw_data_path = args.raw_data_path
    log_step = args.log_step
    gradient_accumulation = args.gradient_accumulation
    output_dir = args.output_dir
    assert log_step % gradient_accumulation == 0

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    print('building files')
    resources, resources_id, input_question_list, max_aq_len = build_files(
        data_path=raw_data_path, full_tokenizer=full_tokenizer)
    print('files built')

    # BUG FIX: the original initialized `input_ids = [] * len(resources_id)`,
    # but multiplying an empty list is always just []; a plain [] is what was
    # meant. Each entry holds the sliding windows for one resource document.
    input_ids = []
    for resource_ids in resources_id:
        windows, _ = sliding_window(max_len=512, resources=resource_ids,
                                    stride=384)
        input_ids.append(windows)
    print('sliding built')

    # No shuffling: inference runs over the full set in order.
    val_input_ids = input_ids
    val_input_question_list = input_question_list

    # modelMy is a project-defined wrapper; load the best-loss checkpoint.
    model = modelMy(args, device)
    model.load_pretrained(output_dir + 'loss.best')
    model.to(device)

    num_parameters = sum(parameter.numel() for parameter in model.parameters())
    print('number of parameters: {}'.format(num_parameters))
    print('calculating total steps')

    model.eval()
    pred_list = []
    # Inference only — disable autograd to avoid building graphs / wasting memory.
    with torch.no_grad():
        for stepp in range(len(val_input_ids)):
            batch_inputs = torch.tensor(
                val_input_ids[stepp]).long().to(device).unsqueeze(0)
            question_tokens = [z['Question_token']
                               for z in val_input_question_list[stepp]]
            batch_questions = torch.tensor(
                question_tokens).long().to(device).unsqueeze(0)
            choice_tokens = [z['Choices_token']
                             for z in val_input_question_list[stepp]]
            batch_choices = torch.tensor(
                choice_tokens).long().to(device).unsqueeze(0)
            # forward pass; pred holds one predicted choice index per question
            pred = model(inputs=batch_inputs, questions=batch_questions,
                         choices=batch_choices, labels=None, training=False)
            pred_list.extend(pred.squeeze(0).tolist())

    # Persist the raw predictions (context manager guarantees the file is closed).
    with open('data/answer.data', 'wb') as f:
        pickle.dump(pred_list, f)

    import csv
    headers = ['id', 'label']
    # Map predicted class index -> answer letter. A dict lookup raises KeyError
    # on an unexpected value instead of silently reusing the previous letter as
    # the original if-chain did.
    label_map = {0: 'A', 1: 'B', 2: 'C', 3: 'D'}
    # BUG FIX: the original never incremented n_id, so every row carried id 101;
    # ids now run 101, 102, ... in prediction order.
    rows = [[n_id, label_map[choice]]
            for n_id, choice in enumerate(pred_list, start=101)]
    # newline='' is required by the csv module to avoid blank rows on Windows.
    with open('submit.csv', 'w', newline='') as f:
        f_csv = csv.writer(f)
        f_csv.writerow(headers)
        f_csv.writerows(rows)