def __init__(self, args, debug_mode=False):
    """Build trainer configuration from parsed command-line ``args``.

    Selects a tokenizer implementation, loads the GPT-2 model config,
    copies training hyper-parameters onto ``self``, and resolves corpus
    paths (local external drive when present, server paths otherwise).

    :param args: argparse.Namespace with the training options read below.
    :param debug_mode: when True, callers presumably run a reduced
        pipeline — TODO confirm how it is consumed elsewhere.
    """
    # Pick the tokenizer implementation according to the CLI flags.
    if args.no_wordpiece:
        from tokenizations import tokenization_bert_without_wordpiece as tokenization_bert
    elif args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs this process may use
    self.model_config = pytorch_transformers.modeling_gpt2.GPT2Config.from_json_file(
        args.model_config)
    # Context window is fixed at 512 here (the sibling variant reads it
    # from the model config instead).
    self.n_ctx = 512
    self.full_tokenizer = tokenization_bert.BertTokenizer(
        vocab_file=args.tokenizer_path)
    # self.full_tokenizer.max_len = self.n_ctx
    self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    self.raw_data_path = args.raw_data_path
    self.tokenized_data_path = args.tokenized_data_path
    self.do_tokenize = args.do_tokenize  # whether to (re)build the tokenized dataset from scratch
    self.epochs = args.epochs
    self.batch_size = args.batch_size
    self.lr = args.lr
    self.warmup_steps = args.warmup_steps
    self.log_step = args.log_step
    self.stride = args.stride
    self.gradient_accumulation = args.gradient_accumulation
    self.fp16 = args.fp16  # do not enable on GPUs without half-precision support
    self.fp16_opt_level = args.fp16_opt_level
    self.max_grad_norm = args.max_grad_norm
    self.split_num = args.split_num
    self.min_length = args.min_length
    self.output_dir = args.output_dir
    self.pretrained_model = args.pretrained_model
    self.accumulation_steps = args.accumulation_steps
    self.debug_mode = debug_mode
    # Corpus locations: developer's external drive when mounted,
    # otherwise the training-server layout.
    if os.path.exists("/Volumes/移动硬盘/"):
        self.wiki_dir = "/Volumes/移动硬盘/语料/1.中文维基"
        self.thu_news_dir = "/Users/hedongfeng/Desktop/下载/THUCNews"
        self.zhihu_path = "/Volumes/移动硬盘/语料/4.社区问答/web_text_zh_valid.json"
        self.baike_path = "/Volumes/移动硬盘/语料/3.百科问答/baike_qa_train.json"
        self.news_path = "/Volumes/移动硬盘/语料/2.新闻语料/news2016zh_train.json"
    else:
        self.wiki_dir = "/root/text_generation/data/wiki_zh"
        self.thu_news_dir = "/root/text_generation/data/THUCNews"
    # NOTE(review): log file is held open for the object's lifetime and
    # never closed here — verify a close/flush happens elsewhere.
    self.f_log = open("train_log.txt", "w")
def __init__(self, args, debug_mode=False):
    """Build trainer configuration from parsed command-line ``args``.

    Variant that takes the context window from the model config and adds
    keyword/passage length limits instead of corpus paths.

    :param args: argparse.Namespace with the training options read below.
    :param debug_mode: stored as-is; presumably toggles a reduced run —
        TODO confirm against the caller.
    """
    # Pick the tokenizer implementation according to the CLI flags.
    if args.no_wordpiece:
        from tokenizations import tokenization_bert_without_wordpiece as tokenization_bert
    elif args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs this process may use
    self.model_config = pytorch_transformers.modeling_gpt2.GPT2Config.from_json_file(
        args.model_config)
    self.n_ctx = self.model_config.n_ctx  # context window comes from the model config
    self.full_tokenizer = tokenization_bert.BertTokenizer(
        vocab_file=args.tokenizer_path)
    self.full_tokenizer.max_len = self.n_ctx
    self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    self.raw_data_path = args.raw_data_path
    self.tokenized_data_path = args.tokenized_data_path
    self.raw = args.raw  # whether to (re)build the tokenized dataset from scratch
    self.epochs = args.epochs
    self.batch_size = args.batch_size
    self.lr = args.lr
    self.warmup_steps = args.warmup_steps
    self.log_step = args.log_step
    self.stride = args.stride
    self.gradient_accumulation = args.gradient_accumulation
    self.fp16 = args.fp16  # do not enable on GPUs without half-precision support
    self.fp16_opt_level = args.fp16_opt_level
    self.max_grad_norm = args.max_grad_norm
    self.num_pieces = args.num_pieces
    self.min_length = args.min_length
    self.output_dir = args.output_dir
    self.pretrained_model = args.pretrained_model
    self.accumulation_steps = args.accumulation_steps
    # self.tb_writer = SummaryWriter(log_dir=args.writer_dir)
    self.debug_mode = debug_mode
    # Length limits in tokens — presumably for a keywords-to-passage
    # task; verify against the data-building code.
    self.keywords_max_length = 64
    self.passage_max_length = 512
    self.passage_min_length = 128
    # NOTE(review): log file is held open for the object's lifetime and
    # never closed here — verify a close/flush happens elsewhere.
    self.f_log = open("train_log.txt", "w")
def main():
    """Generate ``articles_per_title`` samples for each title and save each
    sample to its own file under ``--save_path``.

    Bug fixes versus the original:
      * the option is declared ``--titles`` but the code read ``args.title``,
        which raised AttributeError before any generation happened;
      * output files were named ``str(i * j)``, so samples collided (for the
        first title every sample mapped to file "0") and overwrote each
        other — files are now named ``"<title_idx>-<sample_idx>"``;
      * the text post-processing loops reused ``i``, clobbering the title
        index used to build the filename for subsequent samples.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0', type=str, required=False, help='设置使用哪些显卡')
    parser.add_argument('--length', default=-1, type=int, required=False, help='生成长度')
    parser.add_argument('--temperature', default=1, type=float, required=False, help='生成温度,越高越随机')
    parser.add_argument('--topk', default=8, type=int, required=False, help='生成的时候最高几选一')
    parser.add_argument('--topp', default=0, type=float, required=False, help='生成的时候积累概率最高多少')
    parser.add_argument('--model_config', default='config/model_config.json', type=str, required=False, help='模型参数路径')
    parser.add_argument('--tokenizer_path', default='cache/bud_vocab.txt', type=str, required=False, help='词表路径')
    parser.add_argument('--model_path', default='model_bud/', type=str, required=False, help='模型路径')
    parser.add_argument('--save_path', default='generated/', type=str, required=False, help='存放生成的文件的路径')
    parser.add_argument('--articles_per_title', default=5, type=int, required=False, help='每个标题生成多少篇文章')
    parser.add_argument('--titles', default='萧炎', type=str, required=False, help='标题列表,是一个字符串,用空格分开')
    parser.add_argument('--titles_file', default='', type=str, required=False,
                        help='标题列表文件,文件中每行一个标题。如果这个选项有值则titles无效')
    parser.add_argument('--no_wordpiece', action='store_true', help='不做word piece切词')
    parser.add_argument('--segment', action='store_true', help='中文以词为单位')
    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    # Pick the tokenizer implementation according to the CLI flags.
    if args.no_wordpiece:
        from tokenizations import tokenization_bert_without_wordpiece as tokenization_bert
    elif args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs this process may use
    length = args.length
    temperature = args.temperature
    topk = args.topk
    topp = args.topp

    # BUG FIX: option is --titles, so read args.titles (args.title raised
    # AttributeError). A titles file, when given, overrides the list.
    titles = args.titles.split()  # one generation title per whitespace-separated token
    if args.titles_file:
        with open(args.titles_file, 'r') as f:
            titles = [line.strip('\n') for line in f.readlines()]
    articles_per_title = args.articles_per_title  # how many articles per title
    save_path = args.save_path  # where the generated files go

    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    model_config = pytorch_transformers.GPT2Config.from_json_file(
        args.model_config)
    model = GPT2LMHeadModel.from_pretrained(args.model_path)
    model.to(device)
    model.eval()

    if not os.path.exists(save_path):
        os.mkdir(save_path)
    if length == -1:
        length = model.config.n_ctx // 2  # default: half the context window
    elif length > model.config.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" %
                         model.config.n_ctx)

    for i, title in enumerate(titles):
        for j in range(articles_per_title):
            # BUG FIX: str(i * j) collided and overwrote earlier samples;
            # name files "<title_idx>-<sample_idx>" so each is unique.
            with open(save_path + '{}-{}'.format(i, j), 'w') as f:
                context_tokens = tokenizer.convert_tokens_to_ids(
                    tokenizer.tokenize(title))
                generated = 0
                out = sample_sequence(model=model, length=length,
                                      context=context_tokens,
                                      temperature=temperature, top_k=topk,
                                      top_p=topp, device=device)
                out = out.tolist()
                generated += 1
                text = tokenizer.convert_ids_to_tokens(out[0])
                # BUG FIX: use a dedicated index (k) instead of reusing i,
                # which clobbered the title index above.
                for k, item in enumerate(text[:-1]):  # ensure spaces around English words
                    if is_word(item) and is_word(text[k + 1]):
                        text[k] = item + ' '
                for k, item in enumerate(text):
                    if item == '[MASK]':
                        text[k] = ''
                    if item == '[CLS]' or item == '[SEP]':
                        text[k] = '\n'
                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
                text = ''.join(text).replace('##', '').strip()
                # text = ''.join(text.split('\n')[:-1])
                print(text)
                f.write(text)
                print("=" * 80)
def main():
    """Repeatedly generate samples from a fixed prefix and print them.

    Parses CLI options, loads the tokenizer/model, then loops forever:
    each pass draws ``nsamples`` (in ``batch_size`` groups) continuations
    of ``--prefix`` via ``sample_sequence`` and prints the cleaned text.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='生成设备')
    parser.add_argument('--length', default=-1, type=int, required=False, help='生成长度')
    parser.add_argument('--batch_size', default=1, type=int, required=False, help='生成的batch size')
    parser.add_argument('--nsamples', default=10, type=int, required=False, help='生成几个样本')
    parser.add_argument('--temperature', default=1, type=float, required=False, help='生成温度')
    parser.add_argument('--topk', default=8, type=int, required=False, help='最高几选一')
    parser.add_argument('--topp', default=0, type=float, required=False, help='最高积累概率')
    parser.add_argument('--model_config', default='config/model_config_small.json', type=str, required=False, help='模型参数')
    parser.add_argument('--tokenizer_path', default='cache/vocab_small.txt', type=str, required=False, help='词表路径')
    parser.add_argument('--model_path', default='model/final_model', type=str, required=False, help='模型路径')
    parser.add_argument('--prefix', default='萧炎', type=str, required=False, help='生成文章的开头')
    parser.add_argument('--no_wordpiece', action='store_true', help='不做word piece切词')
    parser.add_argument('--segment', action='store_true', help='中文以词为单位')
    args = parser.parse_args()
    print('args:\n' + args.__repr__())
    # Pick the tokenizer implementation according to the CLI flags.
    if args.no_wordpiece:
        from tokenizations import tokenization_bert_without_wordpiece as tokenization_bert
    elif args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs this process may use
    length = args.length
    batch_size = args.batch_size
    nsamples = args.nsamples
    temperature = args.temperature
    topk = args.topk
    topp = args.topp
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    model = GPT2LMHeadModel.from_pretrained(args.model_path)
    model.to(device)
    model.eval()
    if length == -1:
        length = model.config.n_ctx // 2  # default: half the context window
    elif length > model.config.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" %
                         model.config.n_ctx)
    # NOTE(review): this loop has no break — the script generates forever
    # until interrupted. Confirm that is intended.
    while True:
        raw_text = args.prefix
        context_tokens = tokenizer.convert_tokens_to_ids(
            tokenizer.tokenize(raw_text))
        generated = 0
        for _ in range(nsamples // batch_size):
            out = sample_sequence(model=model, length=length,
                                  context=context_tokens,
                                  temperature=temperature, top_k=topk,
                                  top_p=topp, device=device)
            out = out.tolist()
            for i in range(batch_size):
                generated += 1
                # NOTE(review): out[0] is decoded for every i, so all
                # batch_size iterations print the same sequence — verify
                # whether out[i] was intended.
                text = tokenizer.convert_ids_to_tokens(out[0])
                for i, item in enumerate(text[:-1]):  # ensure spaces around English words
                    if is_word(item) and is_word(text[i + 1]):
                        text[i] = item + ' '
                for i, item in enumerate(text):
                    if item == '[MASK]':
                        text[i] = ''
                    if item == '[CLS]' or item == '[SEP]':
                        text[i] = '\n'
                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
                text = ''.join(text).replace('##', '').strip()
                print(text)
        print("=" * 80)
def main():
    """Generate ``nsamples`` continuations of ``--prefix`` and optionally
    save them to ``<save_samples_path>/samples.txt``.

    Uses the project's ``generate`` helper (fast-pattern capable, with
    repetition penalty) rather than ``sample_sequence``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='生成设备')
    parser.add_argument('--length', default=-1, type=int, required=False, help='生成长度')
    parser.add_argument('--batch_size', default=1, type=int, required=False, help='生成的batch size')
    parser.add_argument('--nsamples', default=10, type=int, required=False, help='生成几个样本')
    parser.add_argument('--temperature', default=1, type=float, required=False, help='生成温度')
    parser.add_argument('--topk', default=8, type=int, required=False, help='最高几选一')
    parser.add_argument('--topp', default=0, type=float, required=False, help='最高积累概率')
    parser.add_argument('--model_config', default='config/model_config_small.json', type=str, required=False, help='模型参数')
    parser.add_argument('--tokenizer_path', default='cache/vocab_small.txt', type=str, required=False, help='词表路径')
    parser.add_argument('--model_path', default='model/final_model', type=str, required=False, help='模型路径')
    parser.add_argument('--prefix', default='萧炎', type=str, required=False, help='生成文章的开头')
    parser.add_argument('--no_wordpiece', action='store_true', help='不做word piece切词')
    parser.add_argument('--segment', action='store_true', help='中文以词为单位')
    parser.add_argument('--fast_pattern', action='store_true', help='采用更加快的方式生成文本')
    parser.add_argument('--save_samples', action='store_true', help='保存产生的样本')
    parser.add_argument('--save_samples_path', default='.', type=str, required=False, help="保存样本的路径")
    parser.add_argument('--repetition_penalty', default=1.0, type=float, required=False)
    args = parser.parse_args()
    print('args:\n' + args.__repr__())
    # Pick the tokenizer implementation according to the CLI flags.
    if args.no_wordpiece:
        from tokenizations import tokenization_bert_without_wordpiece as tokenization_bert
    elif args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs this process may use
    length = args.length
    batch_size = args.batch_size
    nsamples = args.nsamples
    temperature = args.temperature
    topk = args.topk
    topp = args.topp
    repetition_penalty = args.repetition_penalty
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    model = GPT2LMHeadModel.from_pretrained(args.model_path)
    model.to(device)
    model.eval()
    n_ctx = model.config.n_ctx
    if length == -1:
        length = model.config.n_ctx  # default: the full context window
    if args.save_samples:
        if not os.path.exists(args.save_samples_path):
            os.makedirs(args.save_samples_path)
        samples_file = open(args.save_samples_path + '/samples.txt', 'w',
                            encoding='utf8')
    while True:
        raw_text = args.prefix
        context_tokens = tokenizer.convert_tokens_to_ids(
            tokenizer.tokenize(raw_text))
        generated = 0
        for _ in range(nsamples // batch_size):
            # NOTE(review): "repitition_penalty" is the callee's keyword
            # spelling (typo lives in generate()'s signature, not here).
            out = generate(n_ctx=n_ctx, model=model, context=context_tokens,
                           length=length, is_fast_pattern=args.fast_pattern,
                           temperature=temperature, top_k=topk, top_p=topp,
                           repitition_penalty=repetition_penalty,
                           device=device)
            for i in range(batch_size):
                generated += 1
                text = tokenizer.convert_ids_to_tokens(out)
                for i, item in enumerate(text[:-1]):  # ensure spaces around English words
                    if is_word(item) and is_word(text[i + 1]):
                        text[i] = item + ' '
                for i, item in enumerate(text):
                    if item == '[MASK]':
                        text[i] = ''
                    if item == '[CLS]' or item == '[SEP]':
                        text[i] = '\n'
                info = "=" * 40 + " SAMPLE " + str(
                    generated) + " " + "=" * 40 + "\n"
                print(info)
                text = ''.join(text).replace('##', '').strip()
                print(text)
                if args.save_samples:
                    samples_file.write(info)
                    samples_file.write(text)
                    samples_file.write('\n')
                    samples_file.write('=' * 90)
                    samples_file.write('\n' * 2)
                print("=" * 80)
        if generated == nsamples:
            # close file when finish writing.
            if args.save_samples:
                samples_file.close()
            break
def ai_kg(text='', length=100, nsamples=5):
    """Run knowledge-graph extraction: generate continuations of ``text``
    with a GPT-2 KG model, cut out the span between the '[kgs]' and
    '[kge]' markers, de-duplicate, persist the results to
    ``tmp/run_task<tid>.json``, and return the list of extracted strings.

    :param text: source passage used as the generation prefix.
    :param length: requested generation length (-1 = fit to context).
    :param nsamples: number of samples to draw.
    """
    print("运行知识提取任务")
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='生成设备')
    parser.add_argument('--length', default=length, type=int, required=False, help='生成长度')
    parser.add_argument('--batch_size', default=1, type=int, required=False, help='生成的batch size')
    parser.add_argument('--nsamples', default=nsamples, type=int, required=False, help='生成几个样本')
    parser.add_argument('--temperature', default=0.7, type=float, required=False, help='生成温度')
    parser.add_argument('--topk', default=10, type=int, required=False, help='最高几选一')
    parser.add_argument('--topp', default=0, type=float, required=False, help='最高积累概率')
    parser.add_argument('--model_config', default='config/model_config_small.json', type=str, required=False, help='模型参数')
    parser.add_argument('--tokenizer_path', default='cache/vocab_small.txt', type=str, required=False, help='词表路径')
    parser.add_argument('--model_path', default='model/kg', type=str, required=False, help='模型路径')
    parser.add_argument('--prefix', default=text, type=str, required=False, help='生成文章的开头')
    parser.add_argument('--remove_prefix', default=True, required=False, help='生成文章的开头')
    parser.add_argument('--no_wordpiece', action='store_true', help='不做word piece切词')
    parser.add_argument('--segment', action='store_true', help='中文以词为单位')
    parser.add_argument('--fast_pattern', default=True, action='store_true', help='采用更加快的方式生成文本')
    parser.add_argument('--save_samples', action='store_true', help='保存产生的样本')
    parser.add_argument('--save_samples_path', default='.', type=str, required=False, help="保存样本的路径")
    parser.add_argument('--tid', default='0', type=str, required=False, help='保存生成内容')
    args = parser.parse_args()
    print('args:\n' + args.__repr__())
    # Pick the tokenizer implementation according to the CLI flags.
    if args.no_wordpiece:
        from tokenizations import tokenization_bert_without_wordpiece as tokenization_bert
    elif args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs this process may use
    length = args.length
    batch_size = args.batch_size
    nsamples = args.nsamples
    temperature = args.temperature
    topk = args.topk
    topp = args.topp
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    model = GPT2LMHeadModel.from_pretrained(args.model_path)
    model.to(device)
    model.eval()
    # Fit the prefix + requested length into the context window; when the
    # input is too long, trim the prefix from the left.
    if length == -1:
        length = model.config.n_ctx - len(args.prefix)
    elif length > model.config.n_ctx - len(args.prefix):
        # raise ValueError("Can't get samples longer than window size: %s" % model.config.n_ctx)
        # raise ValueError("Can't get samples longer than window size: %s" % model.config.n_ctx)
        print("输入内容过长自动裁切,方便生成足够数据")
        args.prefix = args.prefix[-(model.config.n_ctx - args.length):]
    if args.save_samples:
        if not os.path.exists(args.save_samples_path):
            os.makedirs(args.save_samples_path)
        samples_file = open(args.save_samples_path + '/samples.txt', 'w',
                            encoding='utf8')
    while True:
        # raw_text = args.prefix
        raw_text = tkit.Text().clear(args.prefix + '')
        context_tokens = tokenizer.convert_tokens_to_ids(
            tokenizer.tokenize(raw_text))
        generated = 0
        all_text = []
        for _ in range(nsamples // batch_size):
            out = generate(model=model, context=context_tokens, length=length,
                           is_fast_pattern=args.fast_pattern,
                           temperature=temperature, top_k=topk, top_p=topp,
                           device=device)
            for i in range(batch_size):
                generated += 1
                text = tokenizer.convert_ids_to_tokens(out)
                for i, item in enumerate(text[:-1]):  # ensure spaces around English words
                    if is_word(item) and is_word(text[i + 1]):
                        text[i] = item + ' '
                kgs = []
                for i, item in enumerate(text):
                    # print(text[i])
                    if item == '[MASK]':
                        text[i] = ''
                    if item == '[CLS]' or item == '[SEP]':
                        # flush marker
                        text[i] = '\n'
                info = "=" * 40 + " SAMPLE " + str(
                    generated) + " " + "=" * 40 + "\n"
                # print(info)
                text = ''.join(text).replace('##', '').strip()
                # print(text)
                # text = ''.join(text).replace('##', '').strip()
                # Extract the knowledge span between the [kgs]/[kge] markers.
                kg_start = '[kgs]'
                kg_end = '[kge]'
                # NOTE(review): str.index raises ValueError when a marker is
                # missing (it never returns -1), so the >= 0 guard below can
                # never see a "not found" case — str.find may have been meant.
                kg_start_n = text.index(kg_start)
                kg_end_n = text.index(kg_end)
                if kg_start_n >= 0 and kg_end_n >= 0:
                    text = text[(kg_start_n + 5):kg_end_n]
                    text = text.replace('[/kg]', '||').replace('[kg]', '').replace('[kge]', '').strip()
                print(text)
                # if args.remove_prefix:
                #     # remove_prefix_length =len(context_tokens)
                #     # print(remove_prefix_length)
                #     # text=text[remove_prefix_length:]
                #     # prefix_clean =tkit.Text().clear(args.prefix)
                #     # print('raw_text',raw_text)
                #     text=text.replace(raw_text,'')
                # keep only unseen extractions
                if text in all_text:
                    pass
                else:
                    all_text.append(text)
                if args.save_samples:
                    samples_file.write(info)
                    samples_file.write(text)
                    samples_file.write('\n')
                    samples_file.write('=' * 90)
                    samples_file.write('\n' * 2)
                print("=" * 80)
        del model
        gc.collect()
        # NOTE(review): deleting keys from locals() does not free the actual
        # local variables in CPython — this cleanup is effectively a no-op
        # and may even raise if the dict changes size while iterating.
        for x in locals().keys():
            # print("清理函数内存", locals()[x])
            del locals()[x]
        gc.collect()
        # Persist the generated data.
        tkit.File().mkdir('tmp')
        data_path = "tmp/run_task" + args.tid + ".json"
        print('保存生成', data_path)
        tjson = tkit.Json(file_path=data_path)
        tjson.save([{'prefix': args.prefix, 'data': all_text}])
        return all_text
        # NOTE(review): everything below is unreachable — the function
        # returns above, so samples_file is never closed here.
        if generated == nsamples:
            # close file when finish writing.
            if args.save_samples:
                samples_file.close()
            break
def main():
    """Train a GPT-2 LM on pre-tokenized corpus pieces.

    Pipeline: parse CLI args → pick tokenizer → load/initialize the model
    → optionally build tokenized files from raw data → compute total
    optimizer steps → set up AdamW + warmup-linear schedule (optionally
    apex fp16 and DataParallel) → train over shuffled pieces with a
    sliding window of ``n_ctx`` tokens and stride ``stride``, saving a
    checkpoint per epoch and a final model.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='设置使用哪些显卡')
    parser.add_argument('--model_config', default='config/model_config_small.json', type=str, required=False, help='选择模型参数')
    parser.add_argument('--tokenizer_path', default='cache/vocab_small.txt', type=str, required=False, help='选择词库')
    parser.add_argument('--raw_data_path', default='data/train.json', type=str, required=False, help='原始训练语料')
    parser.add_argument('--tokenized_data_path', default='data/tokenized/', type=str, required=False, help='tokenized语料存放位置')
    parser.add_argument('--raw', action='store_true', help='是否先做tokenize')
    parser.add_argument('--epochs', default=5, type=int, required=False, help='训练循环')
    parser.add_argument('--batch_size', default=8, type=int, required=False, help='训练batch size')
    parser.add_argument('--lr', default=1.5e-4, type=float, required=False, help='学习率')
    parser.add_argument('--warmup_steps', default=2000, type=int, required=False, help='warm up步数')
    parser.add_argument('--log_step', default=1, type=int, required=False, help='多少步汇报一次loss')
    parser.add_argument('--stride', default=768, type=int, required=False, help='训练时取训练数据的窗口步长')
    # NOTE(review): type=str for a numeric knob — works because it is only
    # compared/divided after later conversion paths? Verify; other scripts
    # use int here.
    parser.add_argument('--gradient_accumulation', default=1, type=str, required=False, help='梯度积累')
    parser.add_argument('--fp16', action='store_true', help='混合精度')
    parser.add_argument('--fp16_opt_level', default='O1', type=str, required=False)
    parser.add_argument('--max_grad_norm', default=1.0, type=float, required=False)
    parser.add_argument('--num_pieces', default=100, type=int, required=False, help='将训练语料分成多少份')
    parser.add_argument('--min_length', default=128, type=int, required=False, help='最短收录文章长度')
    parser.add_argument('--output_dir', default='model/', type=str, required=False, help='模型输出路径')
    parser.add_argument('--pretrained_model', default='', type=str, required=False, help='模型训练起点路径')
    parser.add_argument('--writer_dir', default='tensorboard_summary/', type=str,
                        required=False, help='Tensorboard路径')
    parser.add_argument('--no_wordpiece', action='store_true', help='不做word piece切词')
    parser.add_argument('--segment', action='store_true', help='中文以词为单位')
    args = parser.parse_args()
    print('args:\n' + args.__repr__())
    # Pick the tokenizer implementation according to the CLI flags.
    if args.no_wordpiece:
        from tokenizations import tokenization_bert_without_wordpiece as tokenization_bert
    elif args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs this process may use
    model_config = pytorch_transformers.modeling_gpt2.GPT2Config.from_json_file(
        args.model_config)
    print('config:\n' + model_config.to_json_string())
    n_ctx = model_config.n_ctx
    full_tokenizer = tokenization_bert.BertTokenizer(
        vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = n_ctx
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)
    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw  # whether to (re)build the tokenized dataset from scratch
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    fp16 = args.fp16  # do not enable on GPUs without half-precision support
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    min_length = args.min_length
    output_dir = args.output_dir
    tb_writer = SummaryWriter(log_dir=args.writer_dir)
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    if raw:
        print('building files')
        build_files(data_path=raw_data_path,
                    tokenized_data_path=tokenized_data_path,
                    num_pieces=num_pieces,
                    full_tokenizer=full_tokenizer,
                    min_length=min_length)
        print('files built')
    # Start from scratch or from a pretrained checkpoint.
    if not args.pretrained_model:
        model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel(
            config=model_config)
    else:
        model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(
            args.pretrained_model)
    model.train()
    model.to(device)
    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print('number of parameters: {}'.format(num_parameters))
    multi_gpu = False
    full_len = 0
    print('calculating total steps')
    # Total token count drives the scheduler's t_total.
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / stride * epochs / batch_size / gradient_accumulation)
    print('total steps = {}'.format(total_steps))
    optimizer = pytorch_transformers.AdamW(model.parameters(), lr=lr,
                                           correct_bias=True)
    scheduler = pytorch_transformers.WarmupLinearSchedule(
        optimizer, warmup_steps=warmup_steps, t_total=total_steps)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model, optimizer,
                                          opt_level=fp16_opt_level)
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model)
        multi_gpu = True
    print('starting training')
    overall_step = 0
    running_loss = 0
    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        # Visit the corpus pieces in a random order each epoch.
        x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
        random.shuffle(x)
        piece_num = 0
        for i in x:
            with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i),
                      'r') as f:
                line = f.read().strip()
            tokens = line.split()
            tokens = [int(token) for token in tokens]
            # Slide a window of n_ctx tokens with the given stride.
            start_point = 0
            samples = []
            while start_point < len(tokens) - n_ctx:
                samples.append(tokens[start_point:start_point + n_ctx])
                start_point += stride
            start_point -= stride
            last = tokens[start_point + n_ctx:]
            # NOTE(review): extend([...list...]) appends ONE nested list, and
            # convert_tokens_to_ids(['[PAD]']) already returns a list — this
            # does not pad to n_ctx as intended; `last` is also never used
            # afterwards, so the tail of each piece is silently dropped.
            last.extend([
                full_tokenizer.convert_tokens_to_ids(['[PAD]']) *
                (n_ctx - len(last))
            ])
            random.shuffle(samples)
            for step in range(len(samples) // batch_size):  # drop last
                # prepare data
                batch = samples[step * batch_size:(step + 1) * batch_size]
                batch_labels = []
                batch_inputs = []
                for ids in batch:
                    int_ids_for_labels = [int(x) for x in ids]
                    int_ids_for_inputs = [int(x) for x in ids]
                    batch_labels.append(int_ids_for_labels)
                    batch_inputs.append(int_ids_for_inputs)
                batch_labels = torch.tensor(batch_labels).long().to(device)
                batch_inputs = torch.tensor(batch_inputs).long().to(device)
                # forward pass (labels == inputs: standard LM objective)
                outputs = model.forward(input_ids=batch_inputs,
                                        labels=batch_labels)
                loss, logits = outputs[:2]
                # get loss
                if multi_gpu:
                    loss = loss.mean()
                if gradient_accumulation > 1:
                    loss = loss / gradient_accumulation
                # loss backward
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   max_grad_norm)
                # optimizer step
                if (step + 1) % gradient_accumulation == 0:
                    running_loss += loss.item()
                    optimizer.step()
                    optimizer.zero_grad()
                    scheduler.step()
                    overall_step += 1
                    if (overall_step + 1) % log_step == 0:
                        tb_writer.add_scalar('loss', loss.item(), overall_step)
                if (overall_step + 1) % log_step == 0:
                    print(
                        'now time: {}:{}. Step {} of piece {} of epoch {}, loss {}'
                        .format(
                            datetime.now().hour,
                            datetime.now().minute,
                            (step + 1) // gradient_accumulation, piece_num,
                            epoch + 1,
                            running_loss * gradient_accumulation / log_step))
                    running_loss = 0
            piece_num += 1
        print('saving model for epoch {}'.format(epoch + 1))
        if not os.path.exists(output_dir + 'model_epoch{}'.format(epoch + 1)):
            os.mkdir(output_dir + 'model_epoch{}'.format(epoch + 1))
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(output_dir +
                                      'model_epoch{}'.format(epoch + 1))
        # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1))
        # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1))
        print('epoch {} finished'.format(epoch + 1))
        then = datetime.now()
        print('time: {}'.format(then))
        print('time for one epoch: {}'.format(then - now))
    print('training finished')
    if not os.path.exists(output_dir + 'final_model'):
        os.mkdir(output_dir + 'final_model')
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir + 'final_model')
def main():
    """Compute SRF-style scoring matrices for a dialog dataset.

    Loads src/tgt sentence pairs, samples ``sss`` short target sentences,
    then uses the GPT-2-based ``tool`` scorer to fill three probability
    matrices — p(t), p(t|s) and p(s|t) — normalizes them in log space,
    combines them into an SRF matrix, and pickles all results under
    ``./data/<folder>/``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='生成设备')
    parser.add_argument('--length', default=-1, type=int, required=False, help='生成长度')
    parser.add_argument('--batch_size', default=128, type=int, required=False, help='生成的batch size')
    parser.add_argument('--nsamples', default=10, type=int, required=False, help='生成几个样本')
    parser.add_argument('--temperature', default=1, type=float, required=False, help='生成温度')
    parser.add_argument('--topk', default=8, type=int, required=False, help='最高几选一')
    parser.add_argument('--topp', default=0, type=float, required=False, help='最高积累概率')
    parser.add_argument('--model_config', default='model/final_model/config.json', type=str, required=False, help='模型参数')
    # use the vocab.txt
    parser.add_argument('--tokenizer_path', default='cache/vocab.txt', type=str, required=False, help='词表路径')
    parser.add_argument('--model_path', default='model/final_model', type=str, required=False, help='模型路径')
    parser.add_argument('--prefix', default='<s>', type=str, required=False, help='生成文章的开头')
    parser.add_argument('--no_wordpiece', action='store_true', help='不做word piece切词')
    parser.add_argument('--segment', action='store_true', help='中文以词为单位')
    parser.add_argument('--fast_pattern', action='store_true', help='采用更加快的方式生成文本')
    parser.add_argument('--save_samples', action='store_true', help='保存产生的样本')
    parser.add_argument('--save_samples_path', default='.', type=str, required=False, help="保存样本的路径")
    parser.add_argument('--repetition_penalty', default=1.0, type=float, required=False)
    args = parser.parse_args()
    print('args:\n' + args.__repr__())
    # Pick the tokenizer implementation according to the CLI flags.
    if args.no_wordpiece:
        from tokenizations import tokenization_bert_without_wordpiece as tokenization_bert
    elif args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert
    # os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    length = args.length
    batch_size = args.batch_size
    nsamples = args.nsamples
    temperature = args.temperature
    topk = args.topk
    topp = args.topp
    repetition_penalty = args.repetition_penalty
    # NOTE(review): model.cuda() is called unconditionally below, so this
    # script requires a GPU even though `device` is computed here.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    model = GPT2LMHeadModel.from_pretrained(args.model_path)
    model.cuda()
    model.eval()
    t = tool(model, tokenizer, maxlen=15)
    folder, sss = 'xiaohuangji', 256
    # load the dialog data
    src, tgt = load_txt(f'./data/{folder}/src-train.txt'), load_txt(
        f'./data/{folder}/tgt-train.txt')
    # sample `sss` sentences from the target dataset that are very short:
    # sentences longer than 8 chars get -inf weight, i.e. zero probability
    # after the softmax below.
    tgtlength = [-len(i) if len(i) <= 8 else -np.inf for i in tgt]
    pp = torch.softmax(torch.tensor(tgtlength, dtype=torch.float),
                       dim=0).numpy()
    sidx = np.random.choice(list(range(len(tgt))), sss, p=pp)
    ptgt = []
    for i in sidx:
        ptgt.append(tgt[i])
    tgt = ptgt
    sl, tl = [len(i) for i in src], [len(i) for i in tgt]
    print(
        f'[!] sl avg length: {round(np.mean(sl), 4)}, tl avg length: {round(np.mean(tl), 4)}'
    )
    # compute the weight matrices; shapes follow the sampled sizes
    pt_matrix = np.zeros([len(tgt)])  # [m]
    pts_matrix = np.zeros([len(src), len(tgt)])  # [n, m]
    pst_matrix = np.zeros([len(tgt), len(src)])  # [m, n]
    print(f'[!] pt matrix shape: {pt_matrix.shape}')
    print(f'[!] pts matrix shape: {pts_matrix.shape}')
    print(f'[!] pst matrix shape: {pst_matrix.shape}')
    # p(t): unconditional probability of each sampled target sentence
    for i in tqdm(range(0, len(tgt), batch_size)):
        batch = tgt[i:i + batch_size]
        pt_matrix[i:i + batch_size] = t.p_t_s(
            batch, context=None, batch_size=len(batch)).cpu().numpy()
    # p(t|s): all samples in a batch share the same source context
    for i in tqdm(range(len(src))):
        for j in range(0, len(tgt), batch_size):
            batch, context = tgt[j:j + batch_size], src[i]
            pts_matrix[i, j:j + batch_size] = t.p_t_s(
                batch, context=context, batch_size=len(batch)).cpu().numpy()
    # p(s|t): the reverse conditional probability
    for i in tqdm(range(len(tgt))):
        for j in range(0, len(src), batch_size):
            batch, context = src[j:j + batch_size], tgt[i]
            pst_matrix[i, j:j + batch_size] = t.p_t_s(
                batch, context=context, batch_size=len(batch)).cpu().numpy()
    # Persist the raw matrices before normalization.
    with open(f'./data/{folder}/PT.pkl', 'wb') as f:
        pickle.dump(pt_matrix, f)
    with open(f'./data/{folder}/PTS.pkl', 'wb') as f:
        pickle.dump(pts_matrix, f)
    with open(f'./data/{folder}/PST.pkl', 'wb') as f:
        pickle.dump(pst_matrix, f)
    # Normalize in log space (pst is transposed to [n, m] to align axes).
    pt_matrix = normalization(np.log(pt_matrix).reshape(1, -1)).reshape(-1)
    pts_matrix = normalization(np.log(pts_matrix))
    pst_matrix = normalization(np.log(pst_matrix).T)
    # ignore the zero
    pt_matrix += 1e-20
    pts_matrix += 1e-20
    pst_matrix += 1e-20
    # SRF combination (harmonic-mean-like ratio of the three scores).
    SRF = 2 * pst_matrix / (pt_matrix + pts_matrix)
    # fix the `sss` degenerate cases: a sampled target must not be matched
    # with its own source row
    for i in range(len(sidx)):
        SRF[sidx[i], i] = -np.inf  # ban it
    with open(f'./data/{folder}/SRF.pkl', 'wb') as f:
        pickle.dump([sidx, SRF], f)
    print(f'[!] save file into ./data/{folder}/SRF_matrix, shape: {SRF.shape}')
def main(): parser = argparse.ArgumentParser() parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='设置使用哪些显卡') parser.add_argument('--model_config', default='config/model_config_small.json', type=str, required=False, help='选择模型参数') parser.add_argument('--tokenizer_path', default='cache/vocab_small.txt', type=str, required=False, help='选择词库') parser.add_argument('--raw_data_path', default='data/eval.json', type=str, required=False, help='原始语料') parser.add_argument('--tokenized_data_path', default='data/tokenized_eval/', type=str, required=False, help='tokenized语料存放位置') parser.add_argument('--raw', action='store_true', help='是否先做tokenize') parser.add_argument('--batch_size', default=8, type=int, required=False, help='batch size') parser.add_argument('--log_step', default=1, type=int, required=False, help='多少步汇报一次') parser.add_argument('--stride', default=768, type=int, required=False, help='取数据的窗口步长') parser.add_argument('--num_pieces', default=100, type=int, required=False, help='将训练语料分成多少份') parser.add_argument('--min_length', default=128, type=int, required=False, help='最短收录文章长度') parser.add_argument('--pretrained_model', default='', type=str, required=False, help='模型起点路径') parser.add_argument('--no_wordpiece', action='store_true', help='不做word piece切词') parser.add_argument('--output_dir', default='eval_result/', type=str, required=False, help='结果输出路径') args = parser.parse_args() print('args:\n' + args.__repr__()) if args.no_wordpiece: from tokenizations import tokenization_bert_without_wordpiece as tokenization_bert else: from tokenizations import tokenization_bert os.environ["CUDA_VISIBLE_DEVICES"] = args.device # 此处设置程序使用哪些显卡 model_config = transformers.modeling_gpt2.GPT2Config.from_json_file( args.model_config) print('config:\n' + model_config.to_json_string()) n_ctx = model_config.n_ctx full_tokenizer = tokenization_bert.BertTokenizer( vocab_file=args.tokenizer_path) full_tokenizer.max_len = n_ctx device = 'cuda' if 
torch.cuda.is_available() else 'cpu' print('using device:', device) raw_data_path = args.raw_data_path tokenized_data_path = args.tokenized_data_path raw = args.raw # 选择是否从零开始构建数据集 batch_size = args.batch_size log_step = args.log_step stride = args.stride num_pieces = args.num_pieces min_length = args.min_length output_dir = args.output_dir if not os.path.exists(output_dir): os.mkdir(output_dir) if raw: print('building files') build_files(data_path=raw_data_path, tokenized_data_path=tokenized_data_path, num_pieces=num_pieces, full_tokenizer=full_tokenizer, min_length=min_length) print('files built') if not args.pretrained_model: print('you need to specify a trained model.') exit(1) else: model = transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained( args.pretrained_model) model.eval() model.to(device) num_parameters = 0 parameters = model.parameters() for parameter in parameters: num_parameters += parameter.numel() print('number of parameters: {}'.format(num_parameters)) multi_gpu = False full_len = 0 print('calculating total steps') for i in tqdm(range(num_pieces)): with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f: full_len += len([int(item) for item in f.read().strip().split()]) if torch.cuda.device_count() > 1: print("Let's use", torch.cuda.device_count(), "GPUs!") model = DataParallel(model) multi_gpu = True print('starting training') overall_step = 0 total_loss = 0 total_steps = 0 # eval now = datetime.now() print('time: {}'.format(now)) piece_num = 0 for i in range(num_pieces): with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f: line = f.read().strip() tokens = line.split() tokens = [int(token) for token in tokens] start_point = 0 samples = [] while start_point < len(tokens) - n_ctx: samples.append(tokens[start_point:start_point + n_ctx]) start_point += stride start_point -= stride last = tokens[start_point + n_ctx:] last.extend([ full_tokenizer.convert_tokens_to_ids(['[PAD]']) * (n_ctx - len(last)) 
]) random.shuffle(samples) for step in range(len(samples) // batch_size): # drop last # prepare data batch = samples[step * batch_size:(step + 1) * batch_size] batch_labels = [] batch_inputs = [] for ids in batch: int_ids_for_labels = [int(x) for x in ids] int_ids_for_inputs = [int(x) for x in ids] batch_labels.append(int_ids_for_labels) batch_inputs.append(int_ids_for_inputs) batch_labels = torch.tensor(batch_labels).long().to(device) batch_inputs = torch.tensor(batch_inputs).long().to(device) # forward pass outputs = model.forward(input_ids=batch_inputs, labels=batch_labels) loss, logits = outputs[:2] # get loss if multi_gpu: loss = loss.mean() total_loss += loss total_steps += 1 if (overall_step + 1) % log_step == 0: print('now time: {}:{}. Step {} of piece {}, ppl {}'.format( datetime.now().hour, datetime.now().minute, (step + 1), piece_num, torch.exp(loss))) piece_num += 1 if not os.path.exists(args.output_dir): os.mkdir(args.output_dir) else: with open(args.output_dir + 'result.txt', 'w') as f: f.write(np.exp(total_loss / total_steps))