def get_test_predictions(self, test_filename, save_filename):
    test_dataset = DialogueDataset(
        test_filename, self.config.sentence_len, self.vocab, False)
    test_data_loader = torch.utils.data.DataLoader(
        test_dataset, self.config.val_batch_size, shuffle=True)

    with open(test_filename, 'r') as f:
        data = json.load(f)

    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    phase_metrics = dict()
    epoch_loss = list()
    results = {"accuracy": list(), "precision": list(),
               "recall": list(), "F1": list()}
    average_epoch_loss = None
    for i, batch in enumerate(tqdm(test_data_loader, mininterval=2,
                                   desc='test', leave=False)):
        # prepare data
        src_seq, src_pos, src_seg, tgt = map(
            lambda x: x.to(self.device), batch[:4])
        ids = batch[4]
        start_end_idx = batch[5]

        # forward
        pred = self.model(src_seq, src_pos, src_seg, tgt)
        loss = F.cross_entropy(self.prepare_pred(pred).view(-1, 2),
                               tgt.view(-1))

        average_loss = float(loss)
        epoch_loss.append(average_loss)
        average_epoch_loss = np.mean(epoch_loss)

        # record predictions and per-batch metrics
        output = torch.argmax(self.prepare_pred(pred), 3)
        record_predictions(output, data, ids, start_end_idx)
        get_results(tgt.view(-1).cpu(), output.view(-1).cpu(), results)

    phase_metrics["avg_results"] = {key: np.mean(value)
                                    for key, value in results.items()}
    phase_metrics["loss"] = average_epoch_loss
    phase_metrics["time_taken"] = time.perf_counter() - start

    string = ' {} loss: {:.3f} '.format('test', average_epoch_loss)
    print(string, end='\n')

    data["results"] = phase_metrics
    with open(save_filename, 'w') as f:
        json.dump(data, f)
    return phase_metrics
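# get_results() is called above but not defined in this file. Below is a
# minimal sketch of what it is assumed to do, based on the `results` dict it
# fills (per-batch accuracy / precision / recall / F1). The scikit-learn-based
# implementation is an assumption, not the original helper; CPU tensors are
# accepted because scikit-learn converts them to NumPy arrays.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def get_results(targets, predictions, results):
    # append this batch's scores to the running lists in `results`
    results["accuracy"].append(accuracy_score(targets, predictions))
    results["precision"].append(precision_score(targets, predictions))
    results["recall"].append(recall_score(targets, predictions))
    results["F1"].append(f1_score(targets, predictions))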
def __init__(self, args):
    # set up output directory
    self.output_dir = os.path.join(args.experiment_dir, args.run_name)
    if not os.path.exists(args.experiment_dir):
        os.mkdir(args.experiment_dir)
    if not os.path.exists(self.output_dir):
        os.mkdir(self.output_dir)
    if not os.path.exists(os.path.join(args.experiment_dir, "runs/")):
        os.mkdir(os.path.join(args.experiment_dir, "runs/"))

    # initialize tensorboard writer
    self.runs_dir = os.path.join(args.experiment_dir, "runs/", args.run_name)
    self.writer = SummaryWriter(self.runs_dir)

    # initialize global steps
    self.train_gs = 0
    self.val_gs = 0

    # initialize model config
    self.config = ModelConfig(args)

    # check if there is a model to load
    if args.old_model_dir is not None:
        self.use_old_model = True
        self.load_dir = args.old_model_dir
        self.config.load_from_file(
            os.path.join(self.load_dir, "config.json"))

        # create vocab
        self.vocab = Vocab()
        self.vocab.load_from_dict(os.path.join(self.load_dir, "vocab.json"))
        self.update_vocab = False
        self.config.min_count = 1
    else:
        self.use_old_model = False
        self.vocab = None
        self.update_vocab = True

    # create data sets
    self.dataset_filename = args.dataset_filename

    # train
    self.train_dataset = DialogueDataset(
        os.path.join(self.dataset_filename, "train_data.json"),
        self.config.sentence_len,
        self.vocab,
        self.update_vocab)
    self.data_loader_train = torch.utils.data.DataLoader(
        self.train_dataset, self.config.train_batch_size, shuffle=True)
    self.config.train_len = len(self.train_dataset)
    self.vocab = self.train_dataset.vocab

    # eval
    self.val_dataset = DialogueDataset(
        os.path.join(self.dataset_filename, "val_data.json"),
        self.config.sentence_len,
        self.vocab,
        self.update_vocab)
    self.data_loader_val = torch.utils.data.DataLoader(
        self.val_dataset, self.config.val_batch_size, shuffle=True)
    self.config.val_len = len(self.val_dataset)

    # update, and save vocab
    self.vocab = self.val_dataset.vocab
    self.train_dataset.vocab = self.vocab
    if self.config.min_count > 1:
        self.config.old_vocab_size = len(self.vocab)
        self.vocab.prune_vocab(self.config.min_count)
    self.vocab.save_to_dict(os.path.join(self.output_dir, "vocab.json"))
    self.vocab_size = len(self.vocab)
    self.config.vocab_size = self.vocab_size

    # load pretrained embeddings only when a directory for them is given
    if self.config.pretrained_embeddings_dir is not None:
        pretrained_embeddings = get_pretrained_embeddings(
            self.config.pretrained_embeddings_dir, self.vocab)
    else:
        pretrained_embeddings = None

    # print and save the config file
    self.config.print_config(self.writer)
    self.config.save_config(os.path.join(self.output_dir, "config.json"))

    # set device
    self.device = torch.device('cuda')

    # create model
    self.model = Transformer(
        self.config.vocab_size,
        self.config.label_len,
        self.config.sentence_len,
        d_word_vec=self.config.embedding_dim,
        d_model=self.config.model_dim,
        d_inner=self.config.inner_dim,
        n_layers=self.config.num_layers,
        n_head=self.config.num_heads,
        d_k=self.config.dim_k,
        d_v=self.config.dim_v,
        dropout=self.config.dropout,
        pretrained_embeddings=pretrained_embeddings
    ).to(self.device)

    # create optimizer
    self.optimizer = torch.optim.Adam(
        filter(lambda x: x.requires_grad, self.model.parameters()),
        betas=(0.9, 0.98), eps=1e-09)

    # load old model, optimizer if there is one
    if self.use_old_model:
        self.model, self.optimizer = load_checkpoint(
            os.path.join(self.load_dir, "model.bin"),
            self.model, self.optimizer, self.device)

    # create a scheduled optimizer object
    self.optimizer = ScheduledOptim(
        self.optimizer, self.config.model_dim, self.config.warmup_steps)
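# ScheduledOptim is constructed above but defined elsewhere. A minimal sketch,
# assuming it follows the standard "Attention Is All You Need" warmup schedule
# (lr = d_model^-0.5 * min(step^-0.5, step * warmup^-1.5)), which matches the
# (optimizer, model_dim, warmup_steps) constructor used above. The method
# names step_and_update_lr / zero_grad are assumptions.
class ScheduledOptim:
    def __init__(self, optimizer, d_model, n_warmup_steps):
        self._optimizer = optimizer
        self.d_model = d_model
        self.n_warmup_steps = n_warmup_steps
        self.n_steps = 0

    def step_and_update_lr(self):
        # scale the learning rate for the current step, then update parameters
        self.n_steps += 1
        lr = (self.d_model ** -0.5) * min(
            self.n_steps ** -0.5,
            self.n_steps * self.n_warmup_steps ** -1.5)
        for group in self._optimizer.param_groups:
            group['lr'] = lr
        self._optimizer.step()

    def zero_grad(self):
        self._optimizer.zero_grad()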
def run():
    logger.info("using device: {}".format(config.DEVICE))

    train_data = process_raw_data()
    train_list, test_list = train_test_split(train_data,
                                             test_size=0.2,
                                             random_state=34)

    # load the GPT-2 model
    model, n_ctx = create_model(True)
    model.to(config.DEVICE)

    # use multiple GPUs in parallel if available;
    # config.DEVICE_NUM selects which cards to train on
    multi_gpu = False
    if torch.cuda.is_available() and torch.cuda.device_count() > 1:
        logger.info("Using more than one GPU to train...")
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = config.DEVICE_NUM
        model = DataParallel(
            model,
            device_ids=[int(i) for i in config.DEVICE_NUM.split(",")])
        multi_gpu = True

    # log the number of model parameters
    num_parameters = sum(
        parameter.numel() for parameter in model.parameters())
    logger.info("number of model parameters: {}".format(num_parameters))

    # load the data
    logger.info("loading training data")
    train_dataset = DialogueDataset(train_list, n_ctx)
    batch_num = len(train_dataset) // config.BATCH_SIZE
    test_dataset = DialogueDataset(test_list, n_ctx)
    test_batch_num = len(test_dataset) // config.BATCH_SIZE

    train_data_loader = DataLoader(train_dataset,
                                   batch_size=config.BATCH_SIZE,
                                   shuffle=True,
                                   num_workers=4,
                                   collate_fn=collate_fn)
    test_data_loader = DataLoader(test_dataset,
                                  batch_size=config.BATCH_SIZE,
                                  shuffle=True,
                                  num_workers=1,
                                  collate_fn=collate_fn)

    # total_steps: total number of optimizer updates over all epochs
    # (samples * epochs / batch size / gradient-accumulation steps)
    total_steps = int(
        len(train_dataset) * config.EPOCHS / config.BATCH_SIZE /
        config.GRADIENT_ACCUMULATION)
    logger.info('total training steps = {}'.format(total_steps))

    # set up the optimizer with a linear warmup schedule
    optimizer = AdamW(model.parameters(), lr=config.LEARNING_RATE,
                      correct_bias=True)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=config.WARM_STEPS,
        num_training_steps=total_steps)

    logger.info("start training...")
    best_loss = 100
    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        train_fn(model, train_data_loader, optimizer, scheduler, epoch,
                 batch_num, multi_gpu)
        loss, accuracy = eval_fn(model, test_data_loader, test_batch_num,
                                 multi_gpu)
        # keep the checkpoint whenever either metric improves
        if loss < best_loss or accuracy > best_accuracy:
            logger.info('saving model for epoch {}, best loss: {}'.format(
                epoch + 1, loss))
            model_to_save = model.module if hasattr(model, 'module') else model
            model_to_save.save_pretrained(config.MODEL_PATH)
            best_loss = loss
            best_accuracy = accuracy
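# collate_fn is passed to the DataLoaders above but is not defined in this
# file. A minimal sketch, assuming the common pattern for GPT-2 dialogue
# training: each dataset item is a list of token ids, and the batch is padded
# to the length of its longest example so it can be stacked into one tensor.
# The pad id (0) and the single-tensor return value are assumptions.
import torch

def collate_fn(batch):
    max_len = max(len(ids) for ids in batch)
    padded = [list(ids) + [0] * (max_len - len(ids)) for ids in batch]
    return torch.tensor(padded, dtype=torch.long)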
class ChatBot:
    def __init__(self, args):
        # get the dir with pre-trained model
        load_dir = os.path.join(args.experiment_dir, args.old_model_dir)

        # initialize, and load vocab
        self.vocab = Vocab()
        vocab_filename = os.path.join(load_dir, "vocab.json")
        self.vocab.load_from_dict(vocab_filename)

        # load configuration
        with open(os.path.join(load_dir, "config.json"), "r") as f:
            config = json.load(f)
        args.response_len = config["response_len"]
        args.history_len = config["history_len"]

        # initialize an empty dataset, used to get input features
        self.dataset = DialogueDataset(None,
                                       history_len=config["history_len"],
                                       response_len=config["response_len"],
                                       vocab=self.vocab,
                                       update_vocab=False)

        # set device
        self.device = torch.device(args.device)

        # initialize model
        model = Transformer(
            config["vocab_size"],
            config["vocab_size"],
            config["history_len"],
            config["response_len"],
            d_word_vec=config["embedding_dim"],
            d_model=config["model_dim"],
            d_inner=config["inner_dim"],
            n_layers=config["num_layers"],
            n_head=config["num_heads"],
            d_k=config["dim_k"],
            d_v=config["dim_v"],
            dropout=config["dropout"],
            pretrained_embeddings=None
        ).to(self.device)

        # load checkpoint
        checkpoint = torch.load(os.path.join(load_dir, args.old_model_name),
                                map_location=self.device)
        model.load_state_dict(checkpoint['model'])

        # create chatbot
        self.chatbot = Chatbot(args, model)
        self.args = args

    def run(self):
        logging.basicConfig(
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            level=logging.INFO)

        greeting_text = "Hello! I am the Hawkbot! Let me tell you about myself" \
                        "... please don't hurt my feelings!" \
                        " If you would like to reset " \
                        "the conversation, please type '/reset'. "

        # initialize history dictionary for each chat id
        history = dict()

        def greeting(bot, update):
            # reset history, or create new history for chat id
            if update.message.chat_id in history:
                id = "{}_history".format(update.message.chat_id)
                if id in history:
                    history[id].append(history[update.message.chat_id])
                else:
                    history[id] = [history[update.message.chat_id]]
                history[update.message.chat_id].clear()
            else:
                history[update.message.chat_id] = list()

            # send a message
            bot.send_message(update.message.chat_id, greeting_text)

        def respond(bot, update):
            # initialize history for chat if it doesn't exist
            if update.message.chat_id not in history:
                greeting(bot, update)
            else:
                # get message, and add to history
                message = update.message.text
                history[update.message.chat_id].append(message)

                # get response, and add to history
                response = self._print_response(
                    history[update.message.chat_id])
                history[update.message.chat_id].append(response)

                # send response to the user
                bot.send_message(update.message.chat_id,
                                 clean_response(response))

            # persist the conversation history and run arguments
            with open(self.args.save_filename, 'w') as f:
                json.dump({
                    "history": history,
                    "args": vars(self.args)
                }, f, indent=4)

        # queries sent to: https://api.telegram.org/bot<token>/METHOD_NAME
        TOKEN = self.args.token
        bot = TelegramBot(TOKEN)
        bot.add_handler(MessageHandler(Filters.text, respond))
        bot.add_handler(CommandHandler('reset', greeting))

    # print the response for the latest input
    def _print_response(self, history):
        # generate responses
        responses, scores = self._generate_responses(history)

        # choose response
        if self.args.choose_best:
            response = responses[0][0]
        else:
            # pick a random result from the n_best
            idx = random.randint(
                0, min(self.args.n_best, self.args.beam_size) - 1)
            response = responses[0][idx]

        # uncomment this line to see all the scores
        # print("scores in log prob: {}\n".format(scores[0]))

        # create output string
        output = ""
        for idx in response[:-1]:
            token = self.vocab.id2token[idx]
            output += "{} ".format(token)
        print(f'{history[-1]} -> {output}')
        return output

    def _generate_responses(self, history):
        # get input features for the dialogue history
        h_seq, h_pos, h_seg = self.dataset.get_input_features(history)

        # get response from model
        response = self.chatbot.translate_batch(h_seq, h_pos, h_seg)
        return response
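# A possible entry point for the Telegram bot above. The argument names mirror
# the fields the class reads from `args` (experiment_dir, old_model_dir,
# old_model_name, device, token, save_filename, choose_best, n_best,
# beam_size); the defaults shown here are assumptions, not values from the
# original project.
import argparse

def main():
    parser = argparse.ArgumentParser(description="Run the Hawkbot Telegram chatbot")
    parser.add_argument("--experiment_dir", required=True)
    parser.add_argument("--old_model_dir", required=True)
    parser.add_argument("--old_model_name", default="model.bin")
    parser.add_argument("--device", default="cpu")
    parser.add_argument("--token", required=True, help="Telegram bot token")
    parser.add_argument("--save_filename", default="chat_history.json")
    parser.add_argument("--choose_best", action="store_true")
    parser.add_argument("--n_best", type=int, default=5)
    parser.add_argument("--beam_size", type=int, default=5)
    args = parser.parse_args()

    ChatBot(args).run()

if __name__ == "__main__":
    main()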
def run():
    logger.info("using device: {}".format(config.DEVICE))

    if config.TRAIN_MMI:
        train_data = process_mmi_raw_data()
    else:
        train_data = process_raw_data()
    train_list, test_list = train_test_split(train_data,
                                             test_size=0.2,
                                             random_state=34)

    # load the GPT-2 model
    model, n_ctx = create_model(mmi=config.TRAIN_MMI)
    model.to(config.DEVICE)

    # use multiple GPUs in parallel if available;
    # config.DEVICE_NUM selects which cards to train on
    multi_gpu = False
    if torch.cuda.is_available() and torch.cuda.device_count() > 1:
        logger.info("Using multiple GPUs to train...")
        model = DataParallel(
            model,
            device_ids=[int(i) for i in config.DEVICE_NUM.split(",")])
        multi_gpu = True
    else:
        logger.info("Using a single device to train...")

    # log the number of model parameters
    num_parameters = sum(
        parameter.numel() for parameter in model.parameters())
    logger.info("number of model parameters: {}".format(num_parameters))

    # load the data
    logger.info("loading training data")
    train_dataset = DialogueDataset(train_list, n_ctx)
    batch_num = len(train_dataset) // config.BATCH_SIZE
    test_dataset = DialogueDataset(test_list, n_ctx)
    test_batch_num = len(test_dataset) // config.BATCH_SIZE

    train_data_loader = DataLoader(train_dataset,
                                   batch_size=config.BATCH_SIZE,
                                   shuffle=True,
                                   num_workers=config.TRAIN_NUM_WORKERS,
                                   collate_fn=collate_fn)
    test_data_loader = DataLoader(test_dataset,
                                  batch_size=config.BATCH_SIZE,
                                  shuffle=True,
                                  num_workers=config.TEST_NUM_WORKERS,
                                  collate_fn=collate_fn)

    # total_steps: total number of optimizer updates over all epochs
    # (samples * epochs / batch size / gradient-accumulation steps)
    total_steps = int(
        len(train_dataset) * config.EPOCHS / config.BATCH_SIZE /
        config.GRADIENT_ACCUMULATION)
    logger.info('total training steps = {}'.format(total_steps))

    # set up the optimizer with a linear warmup schedule
    optimizer = AdamW(model.parameters(), lr=config.LEARNING_RATE,
                      correct_bias=True)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=config.WARM_STEPS,
        num_training_steps=total_steps)

    logger.info("start training...")
    best_accuracy = 0
    best_loss = 100
    for epoch in range(config.EPOCHS):
        epoch_start_time = datetime.now()
        train_fn(model, train_data_loader, optimizer, scheduler, epoch,
                 batch_num, multi_gpu)
        logger.info("time for epoch {}: {}".format(
            epoch + 1, datetime.now() - epoch_start_time))

        loss, accuracy = eval_fn(model, test_data_loader, test_batch_num,
                                 multi_gpu)
        # keep the checkpoint whenever either metric improves
        if accuracy > best_accuracy or loss < best_loss:
            logger.info(
                'saving model for epoch {}, best accuracy is {}'.format(
                    epoch + 1, accuracy))
            if config.TRAIN_MMI:
                # currently training the MMI model
                model_path = config.MMI_MODEL_PATH
            else:
                model_path = config.DIALOGUE_MODEL_PATH
            if not os.path.exists(model_path):
                os.mkdir(model_path)
            model_to_save = model.module if hasattr(model, 'module') else model
            model_to_save.save_pretrained(model_path)
            best_accuracy = accuracy
            best_loss = loss