def __init__(self):
    try:
        self.device = 'cuda' if config_model.use_cuda else 'cpu'
        LOGGER.info('using device: {}'.format(self.device))
        if self.device == 'cuda':
            os.environ["CUDA_VISIBLE_DEVICES"] = config_model.device_nums
        self.tokenizer = BertTokenizer(config_model.vocab_path)
        # dialogue model
        self.dialogue_model = GPT2LMHeadModel.from_pretrained(config_model.dialogue_model_path)
        self.dialogue_model.to(self.device)
        self.dialogue_model.eval()
        # mmi model
        self.mmi_model = GPT2LMHeadModel.from_pretrained(config_model.mmi_model_path)
        self.mmi_model.to(self.device)
        self.mmi_model.eval()
        self.max_sequence_len = config_model.max_len
        self.batch_size = config_model.batch_size
        self.repetition_penalty = config_model.repetition_penalty
        self.temperature = config_model.temperature
        self.debug = config_model.debug
        self.topk = config_model.topk
        self.topp = config_model.topp
    except Exception as e:
        LOGGER.error("FAIL INIT: {}".format(str(e)))
        traceback.print_exc()
        sys.exit(-1)
def create_model(pre_trained=False, mmi=False):
    if pre_trained:
        if mmi:
            model = GPT2LMHeadModel.from_pretrained(config.MMI_MODEL_PATH)
        else:
            model = GPT2LMHeadModel.from_pretrained(config.DIALOGUE_MODEL_PATH)
    else:
        model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(
            config.CONFIG_JSON_FILE)
        model = GPT2LMHeadModel(config=model_config)
        # model.resize_token_embeddings(vocab_size)
    n_ctx = model.config.to_dict().get("n_ctx")
    return model, n_ctx
def main():
    args = set_interact_args()
    logger = create_logger(args)
    # use the GPU when the user asks for it and CUDA is available
    args.cuda = torch.cuda.is_available() and not args.no_cuda
    device = 'cuda' if args.cuda else 'cpu'
    logger.info('using device:{}'.format(device))
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    tokenizer = BertTokenizer(vocab_file=args.voca_path)
    # dialogue model
    dialogue_model = GPT2LMHeadModel.from_pretrained(args.dialogue_model_path)
    dialogue_model.to(device)
    dialogue_model.eval()
    # mutual-information (MMI) model
    mmi_model = GPT2LMHeadModel.from_pretrained(args.mmi_model_path)
    mmi_model.to(device)
    mmi_model.eval()
    if args.save_samples_path:
        if not os.path.exists(args.save_samples_path):
            os.makedirs(args.save_samples_path)
        samples_file = open(args.save_samples_path + '/mmi_samples.txt', 'a', encoding='utf8')
        samples_file.write("chat log {}:\n".format(datetime.now()))
    # chat history; each utterance is stored as a list of token ids
    history = []
    print('Start chatting with the chatbot; press CTRL + Z to exit')

    @app.route('/message', methods=['POST'])
    def reply():
        # time.sleep(2)
        input_msg = request.form.get('msg', None)
        fromGroup = request.form.get('group', None)
        fromQQ = request.form.get('qq', None)
        if not input_msg:
            return RepeaterResult(-1).toJSON()
        print(fromQQ + "(group " + fromGroup + "): ", input_msg)
        # note: a hard-coded test utterance is passed instead of input_msg
        output = generate_reply("这是测试话", args, device, dialogue_model, history,
                                mmi_model, samples_file, tokenizer)
        output = ''.join(map(str, output))
        return RepeaterResult(0, output, "0").toJSON()

    # =================== serve forever ====================
    app.run(port=7777, debug=True)
def gpt2LMHeadModel(*args, **kwargs):
    """
    gpt2LMHeadModel is the OpenAI GPT-2 Transformer model with the tied
    (pre-trained) language modeling head on top.

    Example:
        # Load the tokenizer
        >>> import torch
        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')

        # Prepare tokenized input
        >>> text_1 = "Who was Jim Henson ?"
        >>> text_2 = "Jim Henson was a puppeteer"
        >>> indexed_tokens_1 = tokenizer.encode(text_1)
        >>> indexed_tokens_2 = tokenizer.encode(text_2)
        >>> tokens_tensor_1 = torch.tensor([indexed_tokens_1])
        >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])

        # Load gpt2LMHeadModel
        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'gpt2LMHeadModel', 'gpt2')
        >>> model.eval()

        # Predict hidden states features for each layer
        # `past` can be used to reuse precomputed hidden states in subsequent predictions
        >>> with torch.no_grad():
                predictions_1, past = model(tokens_tensor_1)
                predictions_2, past = model(tokens_tensor_2, past=past)

        # Get the predicted last token
        >>> predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
        >>> predicted_token = tokenizer.decode([predicted_index])
        >>> assert predicted_token == ' who'
    """
    model = GPT2LMHeadModel.from_pretrained(*args, **kwargs)
    return model
def __init__(self, discrim: str, seed=0, **kwargs):
    # Set random seed
    torch.manual_seed(seed)
    np.random.seed(seed)

    pretrained_model = DISCRIMINATOR_MODELS_PARAMS[discrim]["pretrained_model"]
    print("discrim = {}, pretrained_model set "
          "to discriminator's = {}".format(discrim, pretrained_model))

    # load pretrained model
    model = GPT2LMHeadModel.from_pretrained(pretrained_model,
                                            output_hidden_states=True)
    model.eval()

    # load tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model)

    # Freeze GPT-2 weights
    for param in model.parameters():
        param.requires_grad = False

    super().__init__(model=model, tokenizer=tokenizer, **kwargs)

    # Additional setup after creating model and tokenizer
    self.discrim = discrim
    classifier = get_classifier(self.discrim, self.device)
    self.classifier = classifier
def main():
    args = set_interact_args()
    logger = create_logger(args)
    # use the GPU when the user asks for it and CUDA is available
    args.cuda = torch.cuda.is_available() and not args.no_cuda
    # args.cuda = False
    device = 'cuda' if args.cuda else 'cpu'
    logger.info('using device:{}'.format(device))
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    tokenizer = BertTokenizer(vocab_file=args.voca_path)
    model = GPT2LMHeadModel.from_pretrained(args.dialogue_model_path)
    model.to(device)
    model.eval()
    print('***********************Summary model start************************')
    while True:
        try:
            text = input()
            for i in range(5):
                if len(text):
                    text = text[:1000]
                input_ids = [tokenizer.cls_token_id]  # every input starts with [CLS]
                input_ids.extend(tokenizer.encode(text))
                input_ids.append(tokenizer.sep_token_id)
                curr_input_tensor = torch.tensor(input_ids).long().to(device)
                generated = []
                # generate at most max_len tokens
                for _ in range(args.max_len):
                    outputs = model(input_ids=curr_input_tensor)
                    next_token_logits = outputs[0][-1, :]
                    # apply a repetition penalty to every token already generated,
                    # lowering its probability of being sampled again
                    for id in set(generated):
                        next_token_logits[id] /= args.repetition_penalty
                    next_token_logits = next_token_logits / args.temperature
                    # set the probability of [UNK] to -inf so the model can never predict it
                    next_token_logits[tokenizer.convert_tokens_to_ids('[UNK]')] = -float('Inf')
                    filtered_logits = top_k_top_p_filtering(next_token_logits,
                                                            top_k=args.topk,
                                                            top_p=args.topp)
                    # torch.multinomial samples num_samples elements without replacement;
                    # tokens with higher weight are more likely to be drawn; it returns indices
                    next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1),
                                                   num_samples=1)
                    if next_token == tokenizer.sep_token_id:  # [SEP] marks the end of the response
                        break
                    generated.append(next_token.item())
                    curr_input_tensor = torch.cat(
                        (curr_input_tensor, next_token), dim=0)
                text = tokenizer.convert_ids_to_tokens(generated)
                print("summary:" + "".join(text))
        except KeyboardInterrupt:
            break
def get_model(name):
    tokenizer = GPT2Tokenizer.from_pretrained(name)
    model = GPT2LMHeadModel.from_pretrained(name, output_hidden_states=True)
    for param in model.parameters():
        param.requires_grad = False
    model.to(device)
    model.eval()
    return model, tokenizer
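# --- Hedged usage sketch (added; not part of the original sources) ---
# Minimal example of calling get_model() above and scoring a prompt with the
# frozen LM. The model name "gpt2" and the prompt text are illustrative
# assumptions; `device` is the same module-level global that get_model() uses.
#
#   import torch
#   model, tokenizer = get_model("gpt2")
#   input_ids = torch.tensor([tokenizer.encode("Hello, world")]).to(device)
#   with torch.no_grad():
#       lm_loss = model(input_ids, labels=input_ids)[0]  # first output is the LM loss
#   print("language-model loss:", lm_loss.item())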
def __init__(self, device='cuda', history_len=1, batch_size=5, max_len=25,
             penalty=1.0, temperature=1):
    super(ConversationHandler, self).__init__()
    self.device = device
    self.max_len = max_len  # max length of each utterance
    self.history_len = history_len
    self.config = ConfigParser.config_dict['conversation']
    self.tokenizer = BertTokenizer(vocab_file=self.config['voca_path'])
    self.hanlp = HanlpWrapper()  # not going to be inited twice since it's a Singleton
    self.model = GPT2LMHeadModel.from_pretrained(self.config['dialogue_model'])
    self.mmi_model = GPT2LMHeadModel.from_pretrained(self.config['mmi_model'])
    # move both models to the specified device
    self.model.to(device)
    self.model.eval()
    self.mmi_model.to(device)
    self.mmi_model.eval()
    self.history = []  # for future multi-conversation usage
    self.batch_size = batch_size  # how many responses are generated for the MMI filter
    self.penalty = penalty
    self.temperature = temperature
def create_model(hparams, vocab_size):
    if hparams.pretrained_model:
        # a pretrained GPT-2 model was specified, load it
        model = GPT2LMHeadModel.from_pretrained(hparams.pretrained_model)
    else:
        # no pretrained model was specified, initialise the model from a config file
        model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(
            hparams.model_config)
        model = GPT2LMHeadModel(config=model_config)
    # resize the GPT-2 vocab embeddings to match the tokenizer's vocabulary
    model.resize_token_embeddings(vocab_size)
    logger.info('model config:\n{}'.format(model.config.to_json_string()))
    return model, model.config.to_dict().get("n_ctx")
def _get_dialogue_model(self):
    logging.info("Start getting dialogue model.")
    args = self.args
    dialogue_model = GPT2LMHeadModel.from_pretrained(
        args.dialogue_model_path)
    dialogue_model.to(self.device)
    dialogue_model.eval()
    logging.info("Finish reading dialogue model.")
    return dialogue_model
def _get_mmi_model(self):
    args = self.args
    try:
        logging.info("Start getting mmi model.")
        mmi_model = GPT2LMHeadModel.from_pretrained(args.mmi_model_path)
        mmi_model.to(self.device)
        mmi_model.eval()
        logging.info("Finish getting mmi model.")
        return mmi_model
    except:
        logging.info(
            'Cannot find mmi model in directory, we will choose response randomly.'
        )
        return None
def build_model(self):
    """
    :param args:
    :param vocab_size: vocabulary size
    :return:
    """
    if self.args.pretrained_model:
        # a pretrained GPT-2 model was specified, load it
        self.model = GPT2LMHeadModel.from_pretrained(
            self.args.pretrained_model)
    else:
        # no pretrained model was specified, initialise the model from a config file
        model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(
            self.args.model_config)
        self.model = GPT2LMHeadModel(config=model_config)
    # resize the GPT-2 vocab embeddings to match the tokenizer's vocabulary
    self.model.resize_token_embeddings(self.vocab_size)
    if self.use_cuda:
        self.model.to(self.device)
    self.logger.info('model config:\n{}'.format(
        self.model.config.to_json_string()))
    self.n_ctx = self.model.config.to_dict().get("n_ctx")
    # create the model output directory
    if self.args.is_model_output and not os.path.exists(
            self.args.dialogue_model_output_path):
        os.mkdir(self.args.dialogue_model_output_path)
    # log the number of model parameters
    num_parameters = 0
    parameters = self.model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    self.logger.info(
        'number of model parameters: {}'.format(num_parameters))
    # whether to train on multiple GPUs in parallel
    if self.args.use_multi_gpu:
        if self.args.use_cuda and torch.cuda.device_count() > 1:
            self.logger.info("Let's use GPUs to train")
            self.model = DataParallel(
                self.model,
                device_ids=[int(i) for i in self.args.device.split(',')])
        else:
            self.args.use_multi_gpu = False
def main():
    model_args, training_args = parse_args()
    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)
    # create the model
    model_config = GPT2Config.from_json_file(model_args.model_config_file)
    if not model_args.pretrained_model_path:
        model = GPT2LMHeadModel(config=model_config)
    else:
        model = GPT2LMHeadModel.from_pretrained(
            model_args.pretrained_model_path)
    # count the parameters
    num_parameters = 0
    for parameter in model.parameters():
        num_parameters += parameter.numel()
    logger.info('number of parameters: {}'.format(num_parameters))
    full_tokenizer = get_tokenizer(vocab_file=model_args.vocab_file)
    # training dataset
    train_dataset = GPT2Dataset(model_config.n_ctx,
                                stride=model_args.stride,
                                tokenized_file_path=model_args.data_dir,
                                tokenizer=full_tokenizer)
    trainer = MyTrainer(model=model,
                        args=training_args,
                        train_dataset=train_dataset)
    # start training
    trainer.train(model_path=model_args.pretrained_model_path)
    trainer.save_model()
def create_model(args, vocab_size):
    """
    :param args:
    :param vocab_size: vocabulary size
    :return:
    """
    if args.pretrained_model:
        # a pretrained GPT-2 model was specified, load it
        model = GPT2LMHeadModel.from_pretrained(args.pretrained_model)
    else:
        # no pretrained model was specified, initialise the model from a config file
        model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(args.model_config)
        model = GPT2LMHeadModel(config=model_config)
    # resize the GPT-2 vocab embeddings to match the tokenizer's vocabulary
    model.resize_token_embeddings(vocab_size)
    logger.info('model config:\n{}'.format(model.config.to_json_string()))
    # read n_ctx from the model's own config so the pretrained branch works too
    return model, model.config.to_dict().get("n_ctx")
def load_lang_model(self):
    print(f"Loading language model {self.pretrained_model}")
    # load pretrained model
    self.lang_model = GPT2LMHeadModel.from_pretrained(
        self.pretrained_model, output_hidden_states=True
    )
    self.lang_model.to(self.device)
    self.lang_model.eval()
    # load tokenizer
    self.tokenizer = GPT2Tokenizer.from_pretrained(self.pretrained_model)
    # Freeze GPT-2 weights
    for param in self.lang_model.parameters():
        param.requires_grad = False
def create_model(vocab_size):
    '''
    Create the model: load a pretrained GPT-2 if one is configured, otherwise
    build a fresh model from the config file and train it from scratch.
    :return: the model and its maximum input length (n_ctx)
    '''
    if Config.pretrained_model:
        # a pretrained model is configured, load it
        model = GPT2LMHeadModel.from_pretrained(Config.pretrained_model)
    else:
        # no pretrained model, create one from the config and train from scratch
        model_config = modeling_gpt2.GPT2Config.from_json_file(
            Config.gpt2_config)
        model = GPT2LMHeadModel(config=model_config)
    # resize the GPT-2 vocab embeddings to match the tokenizer's vocabulary
    model.resize_token_embeddings(vocab_size)
    return model, model.config.to_dict().get("n_ctx")  # maximum input length
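# --- Hedged usage sketch (added; not part of the original sources) ---
# How create_model() above is typically called: build the tokenizer first so the
# embedding matrix can be resized to its vocabulary. The attribute name
# Config.vocab_path is an assumption for illustration.
#
#   tokenizer = BertTokenizer(vocab_file=Config.vocab_path)
#   model, n_ctx = create_model(len(tokenizer))
#   model.train()  # n_ctx bounds the total input length per training sample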
def initialize(np, torch):
    from transformers import GPT2Tokenizer
    from transformers.modeling_gpt2 import GPT2LMHeadModel
    import PPLM.run_pplm as PPLM

    torch.manual_seed(0)
    np.random.seed(0)
    model = GPT2LMHeadModel.from_pretrained(
        "gpt2-medium",
        output_hidden_states=True
    )
    model.to("cpu")
    model.eval()
    for param in model.parameters():
        param.requires_grad = False
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
    return tokenizer, model, PPLM
def sumarize(content):
    tokenizer = BertTokenizer.from_pretrained('./vocab')
    model = GPT2LMHeadModel.from_pretrained('./model.pt')
    model.to(device)
    model.eval()
    for i in range(3):
        # preprocess the article body and truncate it if it is too long
        content_tokens = tokenizer.tokenize(content)
        if len(content_tokens) > max_len - 3 - generate_max_len:
            content_tokens = content_tokens[:max_len - 3 - generate_max_len]
        # convert the tokens to ids in the format the model expects
        content_tokens = ["[CLS]"] + content_tokens + ["[SEP]"]
        input_ids = tokenizer.convert_tokens_to_ids(content_tokens)
        # turn input_ids into a tensor
        curr_input_tensor = torch.tensor(input_ids).long().to(device)
        generated = []
        # generate at most generate_max_len tokens
        for _ in range(generate_max_len):
            outputs = model(input_ids=curr_input_tensor)
            next_token_logits = outputs[0][-1, :]  # size: [vocab size]
            # apply a repetition penalty to every token already generated,
            # lowering its probability of being sampled again
            for id_ in set(generated):
                next_token_logits[id_] /= repetition_penalty
            # set the probability of [UNK] to -inf so the model can never predict it
            next_token_logits[tokenizer.convert_tokens_to_ids('[UNK]')] = -float('Inf')
            filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=topk, top_p=topp)
            # torch.multinomial samples num_samples elements without replacement;
            # tokens with higher weight are more likely to be drawn; it returns indices
            next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)
            if next_token.item() == tokenizer.sep_token_id:  # [SEP] marks the end of the summary
                break
            generated.append(next_token.item())
            curr_input_tensor = torch.cat((curr_input_tensor, next_token), dim=0)
        text = tokenizer.convert_ids_to_tokens(generated)
        print("summary:" + "".join(text))
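# --- Hedged usage sketch (added; not part of the original sources) ---
# sumarize() above loads the model inside the function and prints three sampled
# summaries; it assumes module-level globals such as device, max_len,
# generate_max_len, repetition_penalty, topk and topp have already been set.
#
#   if __name__ == "__main__":
#       sumarize("full news article text goes here")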
def main():
    args = set_args()
    # select which GPUs to use for training
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    global logger
    logger = create_logger(args)
    logger.info('using device:{}'.format(args.device))
    logger.info('Initializing tokenizer ...')
    tokenizer = BertTokenizer.from_pretrained(args.vocab_path)
    vocab_size = len(tokenizer)
    logger.info('Loading pretrained model ...')
    model = GPT2LMHeadModel.from_pretrained(args.vocab_path)
    n_ctx = model.config.to_dict().get('n_ctx')  # the maximum sequence length the model accepts
    model.resize_token_embeddings(vocab_size)  # resize the pretrained model's vocab
    model.to(args.device)
    logger.info('Loading data for training and evaluation ...')
    dataset = get_dataset(args.raw_data_path, tokenizer, n_ctx,
                          args.token_data_path, args.train_mmi)
    train(model, dataset['test'], args)
def run_pplm_example(
        pretrained_model="gpt2-medium", cond_text="", uncond=False,
        num_samples=1, bag_of_words=None, discrim=None, discrim_weights=None,
        discrim_meta=None, class_label=-1, length=100, stepsize=0.02,
        temperature=1.0, top_k=10, sample=False, num_iterations=3,
        grad_length=10000, horizon_length=1, window_length=0, decay=False,
        gamma=1.5, gm_scale=0.9, kl_scale=0.01, seed=0, no_cuda=False,
        colorama=False, repetition_penalty=1.0,
):
    # set random seed
    torch.manual_seed(seed)
    np.random.seed(seed)

    # set the device
    device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu"

    if discrim == "generic":
        set_generic_model_params(discrim_weights, discrim_meta)

    if discrim is not None:
        pretrained_model = DISCRIMINATOR_MODELS_PARAMS[discrim]["pretrained_model"]
        print("discrim = {}, pretrained_model set "
              "to discriminator's = {}".format(discrim, pretrained_model))

    # load pretrained model
    model = GPT2LMHeadModel.from_pretrained(pretrained_model,
                                            output_hidden_states=True)
    model.to(device)
    model.eval()

    # load tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model)

    # Freeze GPT-2 weights
    for param in model.parameters():
        param.requires_grad = False

    # figure out conditioning text
    if uncond:
        tokenized_cond_text = tokenizer.encode([tokenizer.bos_token])
    else:
        raw_text = cond_text
        while not raw_text:
            print("Did you forget to add `--cond_text`? ")
            raw_text = input("Model prompt >>> ")
        tokenized_cond_text = tokenizer.encode(tokenizer.bos_token + raw_text)

    print("= Prefix of sentence =")
    print(tokenizer.decode(tokenized_cond_text))
    print()

    # generate unperturbed and perturbed texts
    # full_text_generation returns:
    # unpert_gen_tok_text, pert_gen_tok_texts, discrim_losses, losses_in_time
    unpert_gen_tok_text, pert_gen_tok_texts, _, _ = full_text_generation(
        model=model, tokenizer=tokenizer, context=tokenized_cond_text,
        device=device, num_samples=num_samples, bag_of_words=bag_of_words,
        discrim=discrim, class_label=class_label, length=length,
        stepsize=stepsize, temperature=temperature, top_k=top_k, sample=sample,
        num_iterations=num_iterations, grad_length=grad_length,
        horizon_length=horizon_length, window_length=window_length,
        decay=decay, gamma=gamma, gm_scale=gm_scale, kl_scale=kl_scale,
        repetition_penalty=repetition_penalty,
    )

    # untokenize unperturbed text
    unpert_gen_text = tokenizer.decode(unpert_gen_tok_text.tolist()[0])

    print("=" * 80)
    print("= Unperturbed generated text =")
    print(unpert_gen_text)
    print()

    generated_texts = []

    bow_word_ids = set()
    if bag_of_words and colorama:
        bow_indices = get_bag_of_words_indices(bag_of_words.split(";"),
                                               tokenizer)
        for single_bow_list in bow_indices:
            # filter out all words in the list composed of more than 1 token
            filtered = list(filter(lambda x: len(x) <= 1, single_bow_list))
            # w[0] because we are sure w has only 1 item because of the previous filter
            bow_word_ids.update(w[0] for w in filtered)

    # iterate through the perturbed texts
    for i, pert_gen_tok_text in enumerate(pert_gen_tok_texts):
        try:
            # untokenize the perturbed text
            if colorama:
                import colorama
                pert_gen_text = ""
                for word_id in pert_gen_tok_text.tolist()[0]:
                    if word_id in bow_word_ids:
                        pert_gen_text += "{}{}{}".format(
                            colorama.Fore.RED,
                            tokenizer.decode([word_id]),
                            colorama.Style.RESET_ALL,
                        )
                    else:
                        pert_gen_text += tokenizer.decode([word_id])
            else:
                pert_gen_text = tokenizer.decode(pert_gen_tok_text.tolist()[0])

            print("= Perturbed generated text {} =".format(i + 1))
            print(pert_gen_text)
            print()
        except Exception as exc:
            print("Ignoring error while generating perturbed text:", exc)

        # keep the prefix, perturbed seq, original seq for each index
        generated_texts.append(
            (tokenized_cond_text, pert_gen_tok_text, unpert_gen_tok_text))

    return
def generate_with_bow_feedback(
        pretrained_model="gpt2-medium", cond_text="", num_samples=1,
        length=100, stepsize=0.02, temperature=1.0, top_k=10, sample=True,
        num_iterations=3, grad_length=10000, horizon_length=1,
        window_length=0, decay=False, gamma=1.5, gm_scale=0.9, kl_scale=0.01,
        seed=0, no_cuda=False, colorama=False, verbosity='regular',
        strategy='base', cache_dir=None,
):
    if strategy == 'exp' and num_samples > 1:
        raise NotImplementedError(
            "num_samples > 1 is not yet implemented for 'exp' strategy.")

    # set random seed
    torch.manual_seed(seed)
    np.random.seed(seed)
    # set verbosity
    verbosity_level = VERBOSITY_LEVELS.get(verbosity.lower(), REGULAR)
    # set the device
    device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu"

    # load pretrained model
    if cache_dir:
        model = GPT2LMHeadModel.from_pretrained(
            pretrained_model, cache_dir=cache_dir, output_hidden_states=True
        )
    else:
        model = GPT2LMHeadModel.from_pretrained(
            pretrained_model, output_hidden_states=True
        )
    model.to(device)
    model.eval()

    # load tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model)

    # Freeze GPT-2 weights
    for param in model.parameters():
        param.requires_grad = False

    # figure out conditioning text
    raw_text = cond_text
    while not raw_text:
        print("Did you forget to add `--cond_text`? ")
        raw_text = input("Model prompt >>> ")
    tokenized_cond_text = tokenizer.encode(
        tokenizer.bos_token + raw_text, add_special_tokens=False
    )

    logger.info("= Prefix of sentence =")
    logger.info(tokenizer.decode(tokenized_cond_text))
    logger.info("\n")

    # generate unperturbed and perturbed texts
    # full_text_generation returns:
    # unpert_gen_tok_text, pert_gen_tok_texts, losses_in_time
    if strategy == 'base':
        pert_gen_tok_texts, _ = full_text_generation(
            model=model, cond_text=cond_text, tokenizer=tokenizer,
            context=tokenized_cond_text, device=device,
            num_samples=num_samples, length=length, stepsize=stepsize,
            temperature=temperature, top_k=top_k, sample=sample,
            num_iterations=num_iterations, grad_length=grad_length,
            horizon_length=horizon_length, window_length=window_length,
            decay=decay, gamma=gamma, gm_scale=gm_scale, kl_scale=kl_scale,
            verbosity_level=verbosity_level, generate_unpert=False
        )
    elif strategy == 'exp':
        prev_length = len(tokenized_cond_text)
        current_length = len(tokenized_cond_text) + 1
        current_cond_text = cond_text
        current_tokenized_cond_text = tokenized_cond_text
        while current_length < length:
            pert_gen_tok_texts, _ = full_text_generation(
                model=model, cond_text=current_cond_text, tokenizer=tokenizer,
                context=current_tokenized_cond_text, device=device,
                num_samples=1, length=current_length - prev_length,
                stepsize=stepsize, temperature=temperature, top_k=top_k,
                sample=sample, num_iterations=num_iterations,
                grad_length=grad_length, horizon_length=horizon_length,
                window_length=window_length, decay=decay, gamma=gamma,
                gm_scale=gm_scale, kl_scale=kl_scale,
                verbosity_level=verbosity_level, generate_unpert=False,
            )
            prev_length = current_length
            current_length *= 2
            current_length = min(current_length, length)
            current_cond_text = tokenizer.decode(pert_gen_tok_texts[0].tolist()[0])
            current_tokenized_cond_text = tokenizer.encode(
                current_cond_text, add_special_tokens=False
            )

    generated_texts = []

    bow_word_ids = set()
    if colorama:
        bow_indices = get_bag_of_words_indices(cond_text, tokenizer)
        for single_bow_list in bow_indices:
            # filter out all words in the list composed of more than 1 token
            filtered = list(filter(lambda x: len(x) <= 1, single_bow_list))
            # w[0] because we are sure w has only 1 item because of the previous filter
            bow_word_ids.update(w[0] for w in filtered)

    # iterate through the perturbed texts
    for i, pert_gen_tok_text in enumerate(pert_gen_tok_texts):
        try:
            # untokenize the perturbed text
            if colorama:
                import colorama
                pert_gen_text = ''
                for word_id in pert_gen_tok_text.tolist()[0]:
                    if word_id in bow_word_ids:
                        pert_gen_text += '{}{}{}'.format(
                            colorama.Fore.RED,
                            tokenizer.decode([word_id]),
                            colorama.Style.RESET_ALL
                        )
                    else:
                        pert_gen_text += tokenizer.decode([word_id])
            else:
                pert_gen_text = tokenizer.decode(pert_gen_tok_text.tolist()[0])
            logger.info("= Perturbed generated text {} =".format(i + 1))
            logger.info(pert_gen_text)
            logger.info("\n")
        except:
            pass
        generated_texts.append(
            tokenizer.decode(pert_gen_tok_text.tolist()[0])
        )
    return generated_texts
def run_pplm_example(
        pretrained_model="gpt2-medium", cond_text="", uncond=False,
        num_samples=1, bag_of_words=None, discrim=None, discrim_weights=None,
        discrim_meta=None, class_label=-1, length=100, stepsize=0.02,
        temperature=1.0, top_k=10, sample=True, num_iterations=3,
        grad_length=10000, horizon_length=1, window_length=0, decay=False,
        gamma=1.5, gm_scale=0.9, kl_scale=0.01, seed=0, no_cuda=False,
        colorama=False, verbosity='regular', file=None,
        sample_method=PERTURBED, vad_loss_params=None, vad_threshold=0.01,
):
    # set random seed
    torch.manual_seed(seed)
    np.random.seed(seed)
    # set verbosity
    verbosity_level = VERBOSITY_LEVELS.get(verbosity.lower(), REGULAR)
    # set generation method
    generation_method = GENERATION_METHODS.get(sample_method, PERTURBED)
    # set the device
    device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu"

    if discrim == 'generic':
        set_generic_model_params(discrim_weights, discrim_meta)

    if discrim is not None:
        discriminator_pretrained_model = DISCRIMINATOR_MODELS_PARAMS[discrim][
            "pretrained_model"]
        if pretrained_model != discriminator_pretrained_model:
            pretrained_model = discriminator_pretrained_model
            if verbosity_level >= REGULAR:
                print("discrim = {}, pretrained_model set "
                      "to discriminator's = {}".format(discrim, pretrained_model))

    import logging
    logging.basicConfig(level=logging.INFO)

    # load pretrained model
    model = GPT2LMHeadModel.from_pretrained(
        pretrained_model,
        output_hidden_states=True  # passed to the model's __init__ method
    )
    model.to(device)
    model.eval()

    # load tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model)

    # Freeze GPT-2 weights
    for param in model.parameters():
        param.requires_grad = False

    # figure out conditioning text
    if uncond:
        tokenized_cond_text = tokenizer.encode([tokenizer.bos_token],
                                               add_special_tokens=False)
    else:
        raw_text = cond_text
        while not raw_text:
            print("Did you forget to add `--cond_text`? ")
            raw_text = input("Model prompt >>> ")
        tokenized_cond_text = tokenizer.encode(tokenizer.bos_token + raw_text,
                                               add_special_tokens=False)

    print("= Prefix of sentence =")
    print(tokenizer.decode(tokenized_cond_text))
    print()

    # generate unperturbed and perturbed texts
    # full_text_generation returns:
    # unpert_gen_tok_text, pert_gen_tok_texts, discrim_losses, losses_in_time
    # generate one GPT-2 sample and multiple PPLM samples
    unpert_gen_tok_text, pert_gen_tok_texts, _, _, num_changes_list = full_text_generation(
        model=model, tokenizer=tokenizer, context=tokenized_cond_text,
        device=device, num_samples=num_samples, bag_of_words=bag_of_words,
        discrim=discrim, class_label=class_label, length=length,
        stepsize=stepsize, temperature=temperature, top_k=top_k, sample=sample,
        num_iterations=num_iterations, grad_length=grad_length,
        horizon_length=horizon_length, window_length=window_length,
        decay=decay, gamma=gamma, gm_scale=gm_scale, kl_scale=kl_scale,
        verbosity_level=verbosity_level, file=file,
        generation_method=generation_method,
        vad_loss_params=vad_loss_params, vad_threshold=vad_threshold,
    )

    # untokenize unperturbed text
    unpert_gen_text = tokenizer.decode(unpert_gen_tok_text.tolist()[0])

    if verbosity_level >= REGULAR:
        print("=" * 80)
        print("= Unperturbed generated text =")
        print(unpert_gen_text)
        print()

    generated_texts = []

    bow_word_ids = set()
    if bag_of_words and colorama:
        bow_indices = get_bag_of_words_indices(bag_of_words.split(";"),
                                               tokenizer)
        for single_bow_list in bow_indices:
            # filter out all words in the list composed of more than 1 token
            filtered = list(filter(lambda x: len(x) <= 1, single_bow_list))
            # w[0] because we are sure w has only 1 item because of the previous filter
            bow_word_ids.update(w[0] for w in filtered)

    # iterate through the perturbed texts
    for i, pert_gen_tok_text in enumerate(pert_gen_tok_texts):
        try:
            # untokenize the perturbed text
            if colorama:
                import colorama
                pert_gen_text = ''
                for word_id in pert_gen_tok_text.tolist()[0]:
                    if word_id in bow_word_ids:
                        pert_gen_text += '{}{}{}'.format(
                            colorama.Fore.RED,
                            tokenizer.decode([word_id]),
                            colorama.Style.RESET_ALL)
                    else:
                        pert_gen_text += tokenizer.decode([word_id])
            else:
                pert_gen_text = tokenizer.decode(pert_gen_tok_text.tolist()[0])
            print("= Perturbed generated text {} =".format(i + 1))
            print(pert_gen_text)
            print()
            # log sample
            if verbosity_level >= QUIET:
                if verbosity_level >= REGULAR:
                    pert_gen_text += '【{} words changed】'.format(
                        num_changes_list[i])
                file.write(pert_gen_text)
        except:
            pass
        # keep the prefix, perturbed seq, original seq for each index
        generated_texts.append(
            (tokenized_cond_text, pert_gen_tok_text, unpert_gen_tok_text))

    # log average changes
    changes_mean = stat.mean(num_changes_list)
    if verbosity_level >= QUIET:
        print('========{} words changed(mean)========'.format(changes_mean))
    if verbosity_level >= REGULAR:
        file.write('\n========{} words changed(mean)========'.format(
            changes_mean))
    return changes_mean
def main():
    global logger
    args = set_interact_args()
    logger = create_logger(args)
    # use the GPU when the user asks for it and CUDA is available
    args.cuda = torch.cuda.is_available() and not args.no_cuda
    device = 'cuda' if args.cuda else 'cpu'
    logger.info('using device:{}'.format(device))
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    tokenizer = BertTokenizer(vocab_file=args.voca_path)
    model = GPT2LMHeadModel.from_pretrained(args.dialogue_model_path)
    model.to(device)
    model.eval()
    n_ctx = model.config.to_dict().get("n_ctx")
    print(f'dialogue model path load: {args.dialogue_model_path}')
    # generate the test file
    # read the file and open the writable file
    corpus = []
    print(f'========== n_ctx of the model and datasets: {n_ctx} ==========')
    with open(args.test_data_path) as f:
        for line in f.readlines():
            line = line.lower()
            line = line.strip().replace('<user0>', '').replace('<user1>', '').replace(
                '__eou__', '[SEP]')
            corpus.append(line)
    fw = open(args.save_samples_path, 'w')
    for line in tqdm(corpus):
        input_ids = [tokenizer.cls_token_id] + tokenizer.encode(line) + [tokenizer.sep_token_id]
        if len(input_ids) > n_ctx:
            curr_input_tensor = torch.tensor(
                [tokenizer.cls_token_id] + input_ids[-(n_ctx - 1):]).long().to(device)
        else:
            curr_input_tensor = torch.tensor(input_ids).long().to(device)
        generated = []
        for _ in range(args.max_len):
            outputs = model(input_ids=curr_input_tensor)
            next_token_logits = outputs[0][-1, :]
            # for id in set(generated):
            #     next_token_logits[id] /= args.repetition_penalty
            next_token_logits = next_token_logits / args.temperature
            next_token_logits[tokenizer.convert_tokens_to_ids('[UNK]')] = -float('Inf')
            filtered_logits = top_k_top_p_filtering(next_token_logits,
                                                    top_k=args.topk,
                                                    top_p=args.topp)
            # sample the next token from the top-k/top-p filtered distribution
            next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1),
                                           num_samples=1)
            if next_token == tokenizer.sep_token_id:  # [SEP] marks the end of the response
                break
            generated.append(next_token.item())
            curr_input_tensor = torch.cat((curr_input_tensor, next_token), dim=0)
        text = tokenizer.convert_ids_to_tokens(generated)
        # ipdb.set_trace()
        text = ' '.join(text)
        fw.write(f'{text}\n')
        fw.flush()
    fw.close()
from flask import Flask, render_template, url_for, request
from flask_bootstrap import Bootstrap
from transformers.modeling_gpt2 import GPT2LMHeadModel

# This downloads GPT-2 Medium, it takes a little while
_ = GPT2LMHeadModel.from_pretrained("gpt2-medium")

from run_pplm import run_pplm_example

app = Flask(__name__)
Bootstrap(app)


# add a rule for the index page.
@app.route('/')
def index():
    return render_template('index.html')


@app.route('/get_data', methods=['POST'])
def get_data():
    if request.method == 'POST':
        text = request.form['nlg']
        drop = request.form['personality']
        x = run_pplm_example(cond_text=text, num_samples=1, bag_of_words=drop,
                             length=50, stepsize=0.03, sample=True,
                             num_iterations=3, window_length=5, gamma=1.5,
                             gm_scale=0.95, kl_scale=0.01, verbosity='regular')
        return render_template('result.html', prediction=[text, type(x)])


# earlier debugging version, kept commented out
'''def get_data():
    print("I am here!")
    if request.method == 'POST':
        text = request.form['nlg']
        print(text)
'''
def main():
    args = set_interact_args()
    logger = create_logger(args)
    # use the GPU when the user asks for it and CUDA is available
    args.cuda = torch.cuda.is_available() and not args.no_cuda
    device = 'cuda' if args.cuda else 'cpu'
    logger.info('using device:{}'.format(device))
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    tokenizer = BertTokenizer(vocab_file=args.voca_path)
    model = GPT2LMHeadModel.from_pretrained(args.dialogue_model_path)
    model.to(device)
    model.eval()
    if args.save_samples_path:
        if not os.path.exists(args.save_samples_path):
            os.makedirs(args.save_samples_path)
        samples_file = open(args.save_samples_path + '/samples.txt', 'w', encoding='utf8')
        samples_file.write("chat log {}:\n".format(datetime.now()))
    # chat history; each utterance is stored as a list of token ids
    with open("data/test_Middle.txt", "rb") as f:
        input_data = f.read().decode("utf-8")
    # if "\r\n" in input_data:
    #     input_data = input_data.split("\r\n\r\n")
    # else:
    input_data = input_data.split("\n\n")
    pred_token = []
    target_token = []
    for dialogs in tqdm(input_data):
        history = []
        # if "\r\n" in dialogs:
        #     utterances = dialogs.split("\r\n")
        # else:
        utterances = dialogs.split("\n")
        total = int(len(utterances) / 2)
        for index in range(total):
            utterance = utterances[2 * index]
            text = utterance
            # print("user: " + text)
            if args.save_samples_path:
                samples_file.write("user:{}\n".format(text))
            history.append(tokenizer.encode(text))
            input_ids = [tokenizer.cls_token_id]  # every input starts with [CLS]
            for history_id, history_utr in enumerate(
                    history[-args.max_history_len:]):
                input_ids.extend(history_utr)
                input_ids.append(tokenizer.sep_token_id)
            curr_input_tensor = torch.tensor(input_ids).long().to(device)
            generated = []
            # generate at most max_len tokens
            for _ in range(args.max_len):
                over = len(curr_input_tensor) - 300
                if over > 0:
                    curr_input_tensor = curr_input_tensor[over:]
                outputs = model(input_ids=curr_input_tensor)
                next_token_logits = outputs[0][-1, :]
                # apply a repetition penalty to every token already generated,
                # lowering its probability of being sampled again
                for id in set(generated):
                    next_token_logits[id] /= args.repetition_penalty
                next_token_logits = next_token_logits / args.temperature
                # set the probability of [UNK] to -inf so the model can never predict it
                next_token_logits[tokenizer.convert_tokens_to_ids('[UNK]')] = -float('Inf')
                filtered_logits = top_k_top_p_filtering(next_token_logits,
                                                        top_k=args.topk,
                                                        top_p=args.topp)
                # torch.multinomial samples num_samples elements without replacement;
                # tokens with higher weight are more likely to be drawn; it returns indices
                next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1),
                                               num_samples=1)
                if next_token == tokenizer.sep_token_id:  # [SEP] marks the end of the response
                    break
                generated.append(next_token.item())
                curr_input_tensor = torch.cat((curr_input_tensor, next_token), dim=0)
            # his_text = tokenizer.convert_ids_to_tokens(curr_input_tensor.tolist())
            # print("his_text:{}".format(his_text))
            text = tokenizer.convert_ids_to_tokens(generated)
            pred_token.append(text)
            target_utt = utterances[2 * index + 1]
            target_token.append(target_utt)
            history.append(tokenizer.encode(target_utt))
            # print("chatbot:" + "".join(text))
            # print("target: " + target_utt)
            if args.save_samples_path:
                samples_file.write("chatbot:{}\n".format("".join(text)))
                samples_file.write("target:{}\n".format(target_utt))
                samples_file.write("\n")
            # print("\n")
    if args.save_samples_path:
        samples_file.close()
    ave_len = 0
    pred = []
    for index in range(len(pred_token)):
        pred.append(" ".join(pred_token[index]))
        ave_len += len(pred_token[index])
    # target_token = target_token[0:2]
    bleu_2, bleu_4, meteor, nist_2, nist_4 = get_metrics(pred_token, target_token)
    entropy, dist = cal_entropy(pred)
    ave_len /= len(pred_token)
    print("Bleu_2: ", bleu_2)
    print("Bleu_4: ", bleu_4)
    print("Meteor: ", meteor)
    print("Nist_2: ", nist_2)
    print("Nist_4: ", nist_4)
    print("Dist_1: ", dist[0])
    print("Dist_2: ", dist[1])
    print("Entropy_4: ", entropy[3])
    print("Length: ", ave_len)
def run_pplm_example(
        pretrained_model="gpt2-medium", cond_text="", uncond=False,
        num_samples=1, bag_of_words=None, discrim=None, discrim_weights=None,
        discrim_meta=None, class_label=-1, length=100, stepsize=0.02,
        temperature=1.0, top_k=10, sample=True, num_iterations=3,
        grad_length=10000, horizon_length=1, window_length=0, decay=False,
        gamma=1.5, gm_scale=0.9, kl_scale=0.01, seed=0, no_cuda=False,
        colorama=False, verbosity='regular'):
    # set random seed
    torch.manual_seed(seed)
    np.random.seed(seed)
    # set verbosity
    verbosity_level = VERBOSITY_LEVELS.get(verbosity.lower(), REGULAR)
    # set the device
    device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu"

    if discrim == 'generic':
        set_generic_model_params(discrim_weights, discrim_meta)

    if discrim is not None:
        discriminator_pretrained_model = DISCRIMINATOR_MODELS_PARAMS[discrim][
            "pretrained_model"]
        if pretrained_model != discriminator_pretrained_model:
            pretrained_model = discriminator_pretrained_model
            if verbosity_level >= REGULAR:
                print("discrim = {}, pretrained_model set "
                      "to discriminator's = {}".format(discrim, pretrained_model))

    # load pretrained model
    model = GPT2LMHeadModel.from_pretrained(pretrained_model,
                                            output_hidden_states=True)
    model.to(device)
    model.eval()

    # load tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model)

    # Freeze GPT-2 weights
    for param in model.parameters():
        param.requires_grad = False

    # figure out conditioning text
    if uncond:
        tokenized_cond_text = tokenizer.encode([tokenizer.bos_token],
                                               add_special_tokens=False)
    else:
        raw_text = cond_text
        while not raw_text:
            print("Did you forget to add `--cond_text`? ")
            raw_text = input("Model prompt >>> ")
        tokenized_cond_text = tokenizer.encode(tokenizer.bos_token + raw_text,
                                               add_special_tokens=False)

    print("= Prefix of sentence =")
    print(tokenizer.decode(tokenized_cond_text))
    print()

    # generate unperturbed and perturbed texts
    # full_text_generation returns:
    # unpert_gen_tok_text, pert_gen_tok_texts, discrim_losses, losses_in_time
    unpert_gen_tok_text, pert_gen_tok_texts, _, _ = full_text_generation(
        model=model, tokenizer=tokenizer, context=tokenized_cond_text,
        device=device, num_samples=num_samples, bag_of_words=bag_of_words,
        discrim=discrim, class_label=class_label, length=length,
        stepsize=stepsize, temperature=temperature, top_k=top_k, sample=sample,
        num_iterations=num_iterations, grad_length=grad_length,
        horizon_length=horizon_length, window_length=window_length,
        decay=decay, gamma=gamma, gm_scale=gm_scale, kl_scale=kl_scale,
        verbosity_level=verbosity_level)

    # untokenize unperturbed text
    unpert_gen_text = tokenizer.decode(unpert_gen_tok_text.tolist()[0])

    if verbosity_level >= REGULAR:
        print("=" * 80)
        print("= Unperturbed generated text =")
        print(unpert_gen_text)
        print()

    generated_texts = []

    bow_word_ids = set()
    if bag_of_words and colorama:
        bow_indices = get_bag_of_words_indices(bag_of_words.split(";"),
                                               tokenizer)
        for single_bow_list in bow_indices:
            # filter out all words in the list composed of more than 1 token
            filtered = list(filter(lambda x: len(x) <= 1, single_bow_list))
            # w[0] because we are sure w has only 1 item because of the previous filter
            bow_word_ids.update(w[0] for w in filtered)

    # iterate through the perturbed texts
    for i, pert_gen_tok_text in enumerate(pert_gen_tok_texts):
        try:
            # untokenize the perturbed text
            if colorama:
                import colorama
                pert_gen_text = ''
                for word_id in pert_gen_tok_text.tolist()[0]:
                    if word_id in bow_word_ids:
                        pert_gen_text += '{}{}{}'.format(
                            colorama.Fore.RED,
                            tokenizer.decode([word_id]),
                            colorama.Style.RESET_ALL)
                    else:
                        pert_gen_text += tokenizer.decode([word_id])
            else:
                pert_gen_text = tokenizer.decode(pert_gen_tok_text.tolist()[0])
            print("= Perturbed generated text {} =".format(i + 1))
            print(pert_gen_text)
            print()
            # saves output to text file: 'samples.txt'
            load_words_in_text_file(pert_gen_text)
        except:
            pass
        # keep the prefix, perturbed seq, original seq for each index
        generated_texts.append(
            (tokenized_cond_text, pert_gen_tok_text, unpert_gen_tok_text))

    # add inputs to input.txt
    input_text = tokenizer.decode(tokenized_cond_text)
    words = input_text.split("<|endoftext|>")
    if '<|endoftext|>' in words:
        words.remove('<|endoftext|>')
    s = ""
    input_text = s.join(words)
    # 0: no emotion, 1: anger, 2: disgust, 3: fear, 4: happiness, 5: sadness, 6: surprise
    emotion = get_emotion(tokenizer.decode(class_label))
    f = open("input.txt", "w+")
    f.write(input_text)
    f.write('\n')
    f.write(emotion)
    f.close()
    return
def main():
    args = set_interact_args()
    logger = create_logger(args)
    # use the GPU when the user asks for it and CUDA is available
    args.cuda = torch.cuda.is_available() and not args.no_cuda
    device = 'cuda' if args.cuda else 'cpu'
    logger.info('using device:{}'.format(device))
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    tokenizer = BertTokenizer(vocab_file=args.voca_path)
    model = GPT2LMHeadModel.from_pretrained(args.dialogue_model_path)
    model.to(device)
    model.eval()
    if args.save_samples_path:
        if not os.path.exists(args.save_samples_path):
            os.makedirs(args.save_samples_path)
        samples_file = open(args.save_samples_path + '/samples.txt', 'a', encoding='utf8')
        samples_file.write("chat log {}:\n".format(datetime.now()))
    # chat history; each utterance is stored as a list of token ids
    history = []
    print('Start chatting with the chatbot; press CTRL + Z to exit')
    while True:
        try:
            text = input("user:")
            if args.save_samples_path:
                samples_file.write("user:{}\n".format(text))
            history.append(tokenizer.encode(text))
            input_ids = [tokenizer.cls_token_id]  # every input starts with [CLS]
            for history_id, history_utr in enumerate(
                    history[-args.max_history_len:]):
                input_ids.extend(history_utr)
                input_ids.append(tokenizer.sep_token_id)
            curr_input_tensor = torch.tensor(input_ids).long().to(device)
            generated = []
            # generate at most max_len tokens
            for _ in range(args.max_len):
                outputs = model(input_ids=curr_input_tensor)
                next_token_logits = outputs[0][-1, :]
                # apply a repetition penalty to every token already generated,
                # lowering its probability of being sampled again
                for id in set(generated):
                    next_token_logits[id] /= args.repetition_penalty
                next_token_logits = next_token_logits / args.temperature
                # set the probability of [UNK] to -inf so the model can never predict it
                next_token_logits[tokenizer.convert_tokens_to_ids('[UNK]')] = -float('Inf')
                filtered_logits = top_k_top_p_filtering(next_token_logits,
                                                        top_k=args.topk,
                                                        top_p=args.topp)
                # torch.multinomial samples num_samples elements without replacement;
                # tokens with higher weight are more likely to be drawn; it returns indices
                next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1),
                                               num_samples=1)
                if next_token == tokenizer.sep_token_id:  # [SEP] marks the end of the response
                    break
                generated.append(next_token.item())
                curr_input_tensor = torch.cat((curr_input_tensor, next_token), dim=0)
            # his_text = tokenizer.convert_ids_to_tokens(curr_input_tensor.tolist())
            # print("his_text:{}".format(his_text))
            history.append(generated)
            text = tokenizer.convert_ids_to_tokens(generated)
            print("chatbot:" + "".join(text))
            if args.save_samples_path:
                samples_file.write("chatbot:{}\n".format("".join(text)))
        except KeyboardInterrupt:
            if args.save_samples_path:
                samples_file.close()
            break
def run_pplm_example_file(
        pretrained_model="gpt2-medium", file_path="", num_samples=1,
        bag_of_words=None, discrim=None, discrim_weights=None,
        discrim_meta=None, class_label=-1, length=100, stepsize=0.02,
        temperature=1.0, top_k=10, sample=False, num_iterations=3,
        grad_length=10000, horizon_length=1, window_length=0, decay=False,
        gamma=1.5, gm_scale=0.9, kl_scale=0.01, seed=0, no_cuda=False,
        colorama=False,
):
    # set random seed
    torch.manual_seed(seed)
    np.random.seed(seed)
    # set the device
    device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu"

    if discrim == "generic":
        set_generic_model_params(discrim_weights, discrim_meta)

    if discrim is not None:
        pretrained_model = DISCRIMINATOR_MODELS_PARAMS[discrim]["pretrained_model"]
        print("discrim = {}, pretrained_model set "
              "to discriminator's = {}".format(discrim, pretrained_model))

    # load pretrained model
    model = GPT2LMHeadModel.from_pretrained(pretrained_model,
                                            output_hidden_states=True)
    model.to(device)
    model.eval()

    # load tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model)

    # Freeze GPT-2 weights
    for param in model.parameters():
        param.requires_grad = False

    # figure out conditioning text
    # with open(file_path, 'r') as f1:
    #     list_text = f1.readlines()
    list_text = pd.read_csv(file_path, header=None)[0].values.tolist()
    result = defaultdict(dict)

    # generate unperturbed and perturbed texts
    # full_text_generation returns:
    # unpert_gen_tok_text, pert_gen_tok_texts, discrim_losses, losses_in_time
    for index, cond_text in enumerate(list_text):
        tokenized_cond_text = tokenizer.encode(tokenizer.bos_token + cond_text)
        unpert_gen_tok_text, pert_gen_tok_texts, _, _ = full_text_generation(
            model=model, tokenizer=tokenizer, context=tokenized_cond_text,
            device=device, num_samples=num_samples, bag_of_words=bag_of_words,
            discrim=discrim, class_label=class_label, length=length,
            stepsize=stepsize, temperature=temperature, top_k=top_k,
            sample=sample, num_iterations=num_iterations,
            grad_length=grad_length, horizon_length=horizon_length,
            window_length=window_length, decay=decay, gamma=gamma,
            gm_scale=gm_scale, kl_scale=kl_scale,
        )
        # untokenize unperturbed text
        unpert_gen_text = tokenizer.decode(unpert_gen_tok_text.tolist()[0])
        result[(index, cond_text)]["unpert"] = unpert_gen_text
        result[(index, cond_text)]["pert"] = []

        bow_word_ids = set()
        if bag_of_words and colorama:
            bow_indices = get_bag_of_words_indices(bag_of_words.split(";"),
                                                   tokenizer)
            for single_bow_list in bow_indices:
                # filter out all words in the list composed of more than 1 token
                filtered = list(filter(lambda x: len(x) <= 1, single_bow_list))
                # w[0] because we are sure w has only 1 item because of the previous filter
                bow_word_ids.update(w[0] for w in filtered)

        # iterate through the perturbed texts
        for i, pert_gen_tok_text in enumerate(pert_gen_tok_texts):
            try:
                # untokenize the perturbed text
                if colorama:
                    import colorama
                    pert_gen_text = ""
                    for word_id in pert_gen_tok_text.tolist()[0]:
                        if word_id in bow_word_ids:
                            pert_gen_text += "{}{}{}".format(
                                colorama.Fore.RED,
                                tokenizer.decode([word_id]),
                                colorama.Style.RESET_ALL)
                        else:
                            pert_gen_text += tokenizer.decode([word_id])
                else:
                    pert_gen_text = tokenizer.decode(
                        pert_gen_tok_text.tolist()[0])
                # print("= Perturbed generated text {} =".format(i + 1))
                # print(pert_gen_text)
                result[(index, cond_text)]['pert'].append(pert_gen_text)
            except Exception as exc:
                print("Ignoring error while generating perturbed text:", exc)
            # keep the prefix, perturbed seq, original seq for each index
            # generated_texts.append((tokenized_cond_text, pert_gen_tok_text, unpert_gen_tok_text))
            # result[(index, cond_text)] = (cond_text, pert_gen_tok_text)

    with open("./data/result_{}_{}.pkl".format(discrim, class_label), 'wb') as f1:
        pickle.dump(result, f1)
    return
def main():
    args = set_interact_args()
    logger = create_logger(args)
    # use the GPU when the user asks for it and CUDA is available
    args.cuda = torch.cuda.is_available() and not args.no_cuda
    device = 'cuda' if args.cuda else 'cpu'
    logger.info('using device:{}'.format(device))
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    tokenizer = BertTokenizer(vocab_file=args.voca_path)
    # dialogue model
    dialogue_model = GPT2LMHeadModel.from_pretrained(args.dialogue_model_path)
    dialogue_model.to(device)
    dialogue_model.eval()
    # mutual-information (MMI) model
    mmi_model = GPT2LMHeadModel.from_pretrained(args.mmi_model_path)
    mmi_model.to(device)
    mmi_model.eval()
    if args.save_samples_path:
        if not os.path.exists(args.save_samples_path):
            os.makedirs(args.save_samples_path)
        samples_file = open(args.save_samples_path + '/mmi_samples.txt', 'a', encoding='utf8')
        samples_file.write("chat log {}:\n".format(datetime.now()))
    # chat history; each utterance is stored as a list of token ids
    history = []
    print("Start chatting with Liam's rebellious chatbot Tyrion; press CTRL + Z to exit")
    import readline
    while True:
        try:
            text = input("Liam:")
            if args.save_samples_path:
                samples_file.write("Liam:{}\n".format(text))
            history.append(tokenizer.encode(text))
            input_ids = [tokenizer.cls_token_id]  # every input starts with [CLS]
            for history_id, history_utr in enumerate(
                    history[-args.max_history_len:]):
                input_ids.extend(history_utr)
                input_ids.append(tokenizer.sep_token_id)
            # used to generate responses in a batch; shape (batch_size, token_len)
            input_ids = [copy.deepcopy(input_ids) for _ in range(args.batch_size)]
            curr_input_tensors = torch.tensor(input_ids).long().to(device)
            # 2-D list of shape (max generated length, batch_size);
            # generated[i][j] is the id of the i-th token of the j-th response
            generated = []
            # marks which responses are finished: when response i generates
            # sep_token_id, i is added to finish_set
            finish_set = set()
            # generate at most max_len tokens
            for _ in range(args.max_len):
                outputs = dialogue_model(input_ids=curr_input_tensors)
                next_token_logits = outputs[0][:, -1, :]
                # apply a repetition penalty to every token already generated,
                # lowering its probability of being sampled again
                for index in range(args.batch_size):
                    for token_id in set([token_ids[index] for token_ids in generated]):
                        next_token_logits[index][token_id] /= args.repetition_penalty
                next_token_logits = next_token_logits / args.temperature
                # set the probability of [UNK] to -inf so the model can never predict it
                for next_token_logit in next_token_logits:
                    next_token_logit[tokenizer.convert_tokens_to_ids('[UNK]')] = -float('Inf')
                filtered_logits = top_k_top_p_filtering(next_token_logits,
                                                        top_k=args.topk,
                                                        top_p=args.topp)
                # torch.multinomial samples num_samples elements without replacement;
                # tokens with higher weight are more likely to be drawn; it returns indices
                next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1),
                                               num_samples=1)
                # mark every response that has generated [SEP]
                for index, token_id in enumerate(next_token[:, 0]):
                    if token_id == tokenizer.sep_token_id:
                        finish_set.add(index)
                # check whether every response has generated [SEP]
                finish_flag = True
                for index in range(args.batch_size):
                    if index not in finish_set:  # batch generation not finished yet
                        finish_flag = False
                        break
                if finish_flag:
                    break
                generated.append([token.item() for token in next_token[:, 0]])
                # append the newly generated tokens to the running input
                curr_input_tensors = torch.cat((curr_input_tensors, next_token), dim=-1)
            candidate_responses = []  # all generated candidate responses
            for batch_index in range(args.batch_size):
                response = []
                for token_index in range(len(generated)):
                    if generated[token_index][batch_index] != tokenizer.sep_token_id:
                        response.append(generated[token_index][batch_index])
                    else:
                        break
                candidate_responses.append(response)
            # input for the MMI model
            if args.debug:
                print("candidate response:")
                samples_file.write("candidate response:\n")
            min_loss = float('Inf')
            best_response = ""
            for response in candidate_responses:
                mmi_input_id = [tokenizer.cls_token_id]  # every input starts with [CLS]
                mmi_input_id.extend(response)
                mmi_input_id.append(tokenizer.sep_token_id)
                for history_utr in reversed(history[-args.max_history_len:]):
                    mmi_input_id.extend(history_utr)
                    mmi_input_id.append(tokenizer.sep_token_id)
                mmi_input_tensor = torch.tensor(mmi_input_id).long().to(device)
                out = mmi_model(input_ids=mmi_input_tensor,
                                labels=mmi_input_tensor)
                loss = out[0].item()
                if args.debug:
                    text = tokenizer.convert_ids_to_tokens(response)
                    print("{} loss:{}".format("".join(text), loss))
                    samples_file.write("{} loss:{}\n".format("".join(text), loss))
                if loss < min_loss:
                    best_response = response
                    min_loss = loss
            history.append(best_response)
            text = tokenizer.convert_ids_to_tokens(best_response)
            print("Tyrion:" + "".join(text))
            if args.save_samples_path:
                samples_file.write("Tyrion:{}\n".format("".join(text)))
        except KeyboardInterrupt:
            if args.save_samples_path:
                samples_file.close()
            break
def run_pplm_example(
        pretrained_model="gpt2-medium", cond_text="", uncond=False,
        num_samples=1, bag_of_words=None, discrim=None, discrim_weights=None,
        discrim_meta=None, class_label=-1, length=100, stepsize=0.02,
        temperature=1.0, top_k=10, sample=True, num_iterations=3,
        grad_length=10000, horizon_length=1, window_length=0, decay=False,
        gamma=1.5, gm_scale=0.9, kl_scale=0.01, seed=0, no_cuda=False,
        colorama=False, verbosity='regular'
):
    # set the random seed
    torch.manual_seed(seed)
    np.random.seed(seed)
    # set verbosity ('regular' maps to 1 here)
    verbosity_level = VERBOSITY_LEVELS.get(verbosity.lower(), REGULAR)
    # set the device (GPU or CPU)
    device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu"

    # configure the discriminator model; in the example discrim = 'sentiment'
    if discrim == 'generic':
        set_generic_model_params(discrim_weights, discrim_meta)

    if discrim is not None:
        discriminator_pretrained_model = DISCRIMINATOR_MODELS_PARAMS[discrim][
            "pretrained_model"
        ]
        if pretrained_model != discriminator_pretrained_model:
            pretrained_model = discriminator_pretrained_model
            if verbosity_level >= REGULAR:
                print("discrim = {}, pretrained_model set "
                      "to discriminator's = {}".format(discrim, pretrained_model))

    # load pretrained model; in the BoW case this is the stock GPT-2 model
    model = GPT2LMHeadModel.from_pretrained(
        pretrained_model,
        output_hidden_states=True
    )
    model.to(device)
    model.eval()

    # load tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model)

    # Freeze GPT-2 weights
    for param in model.parameters():
        param.requires_grad = False

    # figure out conditioning text; uncond defaults to False
    if uncond:
        tokenized_cond_text = tokenizer.encode(
            [tokenizer.bos_token], add_special_tokens=False
        )
    else:
        # in the BoW example, cond_text="The potato"
        raw_text = cond_text
        # if cond_text is empty, prompt for it interactively
        while not raw_text:
            print("Did you forget to add `--cond_text`? ")
            raw_text = input("Model prompt >>> ")
        # then prepend the BOS token and tokenize
        tokenized_cond_text = tokenizer.encode(
            tokenizer.bos_token + raw_text, add_special_tokens=False
        )

    # print the prefix
    print("= Prefix of sentence =")
    print(tokenizer.decode(tokenized_cond_text))
    print()

    # generate unperturbed and perturbed texts
    # full_text_generation returns:
    # unpert_gen_tok_text, pert_gen_tok_texts, discrim_losses, losses_in_time
    unpert_gen_tok_text, pert_gen_tok_texts, _, _ = full_text_generation(
        model=model, tokenizer=tokenizer, context=tokenized_cond_text,
        device=device, num_samples=num_samples, bag_of_words=bag_of_words,
        discrim=discrim, class_label=class_label, length=length,
        stepsize=stepsize, temperature=temperature, top_k=top_k, sample=sample,
        num_iterations=num_iterations, grad_length=grad_length,
        horizon_length=horizon_length, window_length=window_length,
        decay=decay, gamma=gamma, gm_scale=gm_scale, kl_scale=kl_scale,
        verbosity_level=verbosity_level
    )

    # untokenize unperturbed text
    unpert_gen_text = tokenizer.decode(unpert_gen_tok_text.tolist()[0])

    if verbosity_level >= REGULAR:
        print("=" * 80)
        print("= Unperturbed generated text =")
        print(unpert_gen_text)
        print()

    generated_texts = []

    bow_word_ids = set()
    if bag_of_words and colorama:
        bow_indices = get_bag_of_words_indices(bag_of_words.split(";"),
                                               tokenizer)
        for single_bow_list in bow_indices:
            # filter out all words in the list composed of more than 1 token
            filtered = list(filter(lambda x: len(x) <= 1, single_bow_list))
            # w[0] because we are sure w has only 1 item because of the previous filter
            bow_word_ids.update(w[0] for w in filtered)

    # iterate through the perturbed texts
    for i, pert_gen_tok_text in enumerate(pert_gen_tok_texts):
        try:
            # untokenize the perturbed text
            if colorama:
                import colorama
                pert_gen_text = ''
                for word_id in pert_gen_tok_text.tolist()[0]:
                    if word_id in bow_word_ids:
                        pert_gen_text += '{}{}{}'.format(
                            colorama.Fore.RED,
                            tokenizer.decode([word_id]),
                            colorama.Style.RESET_ALL
                        )
                    else:
                        pert_gen_text += tokenizer.decode([word_id])
            else:
                pert_gen_text = tokenizer.decode(pert_gen_tok_text.tolist()[0])
            print("= Perturbed generated text {} =".format(i + 1))
            print(pert_gen_text)
            print()
        except:
            pass
        # keep the prefix, perturbed seq, original seq for each index
        generated_texts.append(
            (tokenized_cond_text, pert_gen_tok_text, unpert_gen_tok_text)
        )
    return