def get_baseline_model():
    """Build the baseline LSTM-CNN model pair.

    No hyper-parameter overrides are passed, so the models are created with
    whatever defaults ``QAModel.get_lstm_cnn_model`` declares.

    Returns:
        tuple: ``(train_model, prediction_model)`` exactly as returned by
        ``QAModel.get_lstm_cnn_model``.
    """
    embedding_file, vocab_size = get_default_inputs_for_model()
    qa_model = QAModel()
    train_model, prediction_model = qa_model.get_lstm_cnn_model(
        embedding_file, vocab_size)

    logger.info('Default created: Baseline')
    # Adjacent literals are used instead of a backslash continuation inside
    # the string: the continuation pulled the next line's indentation into
    # the logged message.
    # NOTE(review): the values below are hard-coded in the message and are
    # assumed to mirror the defaults of get_lstm_cnn_model -- confirm.
    logger.info('enc_timesteps = 30, dec_timesteps = 30, hidden_dim = 50, '
                'filters = 500, kernel_sizes = [2, 3, 5, 7]')
    return train_model, prediction_model
class Bot:
    """Chat front-end that answers questions about a context via ``QAModel``.

    Messages are plain dicts with the keys ``"sender"``, ``"datetime"`` and
    ``"text"`` (see :meth:`message`).
    """

    # Model outputs that are treated as "no answer found".
    _EMPTY_ANSWERS = frozenset({"", " ", "[CLS]"})
    # Reply used when the model call itself fails.
    _FAILURE_TEXT = "I'm struggling to find an answer."

    def __init__(self):
        self.qa_model = QAModel()

    def reply(self, context, log):
        """Return the bot's next message for a conversation.

        Args:
            context (str): text the answers are extracted from.
            log (list): previous messages, oldest first; only the last entry
                is inspected.

        Returns:
            dict: a message dict as built by :meth:`message`. An empty *log*
            yields the greeting from :meth:`prolog`.
        """
        # Kept as assertions so callers keep seeing AssertionError.
        assert isinstance(context, str)
        assert isinstance(log, list)
        if not log:
            return self.message(self.prolog())
        return self.message(self.conversate(log[-1], context))

    def prolog(self):
        """Return the greeting sent at the start of a conversation."""
        return "Hello! I'm the buisness question answering bot. ask me any question in regard to the buisness."

    def ans(self, question, context):
        """Answer a plain-text *question* about *context*.

        Returns:
            str: the normalised answer, or an apology when the model fails
            or produces an empty answer.
        """
        try:
            answer = self.qa_model.forward(question, context)
        except Exception:
            # Best-effort by design, but KeyboardInterrupt / SystemExit are
            # no longer swallowed as they were by the bare ``except:``.
            return self._FAILURE_TEXT
        return self._present(answer)

    def conversate(self, question, context):
        """Answer the message dict *question* (needs ``'sender'`` and ``'text'``).

        Returns:
            str: a prompt to ask something when the last message came from
            the bot itself, otherwise the same result :meth:`ans` gives for
            ``question['text']``.
        """
        if question['sender'] == 'bot':
            return "Ask me a question!"
        try:
            # The 'text' lookup stays inside the try so a malformed message
            # gets the same apology as a model failure.
            answer = self.qa_model.forward(question['text'], context)
        except Exception:
            return self._FAILURE_TEXT
        return self._present(answer)

    def _present(self, answer):
        """Map a raw model answer to the text shown to the user."""
        if answer in self._EMPTY_ANSWERS:
            return "I couldn't find an answer. try to ask differently."
        return self.normalize(answer)

    def normalize(self, answer):
        """Tidy spacing, capitalise the first character and append a period.

        An empty *answer* is returned unchanged instead of raising
        ``IndexError``.
        """
        if not answer:
            return answer
        # Both patterns are literal text, so plain str.replace is enough.
        answer = answer.replace(' ,', ',').replace(' ’ ', '’')
        return answer[:1].upper() + answer[1:] + '.'

    def message(self, text):
        """Wrap *text* in a bot message dict stamped with the current time."""
        return {"sender": "bot", "datetime": str(dt.now()), "text": text}
def get_small_model():
    """Build the small LSTM-CNN model pair (hidden_dim=10, filters=20).

    Returns:
        tuple: ``(small_train_model, small_prediction_model)`` exactly as
        returned by ``QAModel.get_lstm_cnn_model``.
    """
    embedding_file, vocab_size = get_default_inputs_for_model()
    enc_timesteps = 30
    dec_timesteps = 30
    hidden_dim = 10
    filters = 20

    qa_model = QAModel()
    small_train_model, small_prediction_model = qa_model.get_lstm_cnn_model(
        embedding_file,
        vocab_size,
        enc_timesteps=enc_timesteps,
        dec_timesteps=dec_timesteps,
        filters=filters,
        hidden_dim=hidden_dim)

    logger.info('Model created: Small')
    # Adjacent f-strings replace the backslash continuation inside the
    # literal, which pulled source indentation into the logged message.
    # NOTE(review): kernel_sizes is not passed to the model here; the logged
    # value is assumed to match the default of get_lstm_cnn_model -- confirm.
    logger.info(f'enc_timesteps = {enc_timesteps}, '
                f'dec_timesteps = {dec_timesteps}, '
                f'hidden_dim = {hidden_dim}, filters = {filters}, '
                f'kernel_sizes = [2, 3, 5, 7]')
    return small_train_model, small_prediction_model
def load_model(self, state_path):
    """
    Initialises the model and loads saved state into the instance of the model.

    Parameters
    ----------
    state_path (str) - path pointing to the saved state. The file is expected
        to hold the state dict itself, because the loaded object is passed
        straight to ``load_state_dict``.

    Returns
    -------
    Model (torch.nn.Module) - the ``QAModel`` instance in evaluation mode.
    """
    logging.info(f"Loading trained state from {state_path}")
    device = torch.device(self.device)
    dbm = DistilBertModel.from_pretrained('distilbert-base-uncased', return_dict=True)
    dbm.to(device)
    model = QAModel(transformer_model=dbm, device=device)
    # map_location remaps the stored tensors onto the configured device, so
    # a state saved on a GPU can also be restored on a CPU-only host.
    state_dict = torch.load(state_path, map_location=device)
    model.load_state_dict(state_dict)
    model.eval()  # Switch to evaluation mode
    return model
def get_larger_model():
    """Build the larger LSTM-CNN model pair (hidden_dim=200, filters=500).

    Returns:
        tuple: ``(larger_train_model, larger_prediction_model)`` exactly as
        returned by ``QAModel.get_lstm_cnn_model``.
    """
    enc_timesteps = 30
    dec_timesteps = 30
    hidden_dim = 200
    filters = 500
    embedding_file, vocab_size = get_default_inputs_for_model()

    qa_model = QAModel()
    larger_train_model, larger_prediction_model = qa_model.get_lstm_cnn_model(
        embedding_file,
        vocab_size,
        enc_timesteps=enc_timesteps,
        dec_timesteps=dec_timesteps,
        filters=filters,
        hidden_dim=hidden_dim)

    logger.info('Model created: Larger')
    # Adjacent f-strings replace the backslash continuation inside the
    # literal, which pulled source indentation into the logged message.
    # NOTE(review): kernel_sizes is not passed to the model here; the logged
    # value is assumed to match the default of get_lstm_cnn_model -- confirm.
    logger.info(f'enc_timesteps = {enc_timesteps}, '
                f'dec_timesteps = {dec_timesteps}, '
                f'hidden_dim = {hidden_dim}, filters = {filters}, '
                f'kernel_sizes = [2, 3, 5, 7]')
    return larger_train_model, larger_prediction_model
def load_model(state_path, device="cpu"):
    """Build a DistilBERT-backed ``QAModel`` and restore it from a checkpoint.

    Parameters
    ----------
    state_path - path of the checkpoint file handed to ``torch.load``; the
        loaded object must provide a ``'model_state_dict'`` entry.
    device - device specifier accepted by ``torch.device`` (default "cpu").

    Returns
    -------
    The ``QAModel`` instance, switched to evaluation mode.
    """
    logging.info(f"Loading trained state from {state_path}")

    target = torch.device(device)
    backbone = DistilBertModel.from_pretrained('distilbert-base-uncased',
                                               return_dict=True)
    backbone.to(target)

    model = QAModel(transformer_model=backbone, device=target)
    # Stored tensors are remapped onto the requested device while loading.
    saved = torch.load(state_path, map_location=target)
    model.load_state_dict(saved['model_state_dict'])
    model.eval()
    return model
def __init__(self, model_file, word_embeddings_cache_file, stopwords_file, word2dfs_file):
    """Load the model and the lookup resources it is used with.

    Args:
        model_file: model file name passed to ``QAModel.load`` (with an
            empty first argument).
        word_embeddings_cache_file: passed to
            ``self._preload_cached_embeddings``; its return value is stored
            as ``self.vec_dim``.
        stopwords_file: text file with one stop word per line.
        word2dfs_file: pickle file loaded into ``self.word2dfs`` when it
            exists.
    """
    # Fixed seeds for torch and numpy.
    torch.manual_seed(1234)
    np.random.seed(1234)

    self.model = QAModel.load('', model_file)

    self.vec_dim = self._preload_cached_embeddings(word_embeddings_cache_file)
    # Random vector drawn once from U(-0.25, 0.25).
    # presumably used for out-of-vocabulary terms -- confirm against callers
    self.unk_term_vec = np.random.uniform(-0.25, 0.25, self.vec_dim)

    # The context manager closes the file; the original left the handle open.
    with open(stopwords_file) as stopwords_in:
        self.stoplist = {line.strip() for line in stopwords_in}

    # NOTE(review): self.word2dfs is left undefined when the file is missing;
    # confirm that every reader of the attribute copes with that.
    if os.path.isfile(word2dfs_file):
        # SECURITY: pickle.load can execute arbitrary code; only point this
        # at trusted files.
        with open(word2dfs_file, "rb") as w2dfin:
            self.word2dfs = pickle.load(w2dfin)
help='Indexed test JSON file') parser.add_argument('-id2c', '--id2char', type=str, default=None, help='id2char JSON file') args = parser.parse_args() if args.id2char is None: char_vocab_size = 44 else: char_vocab_size = len(load_data(args.id2char)) + 2 learning_rate = 0.001 args.char_vocab_size = char_vocab_size args.embed_mat = numpy.load(args.embed_mat_path) G = QAModel(args) model = G.create_model_graph() print "Compiling model.." # print "Learning rate:", learning_rate opt = Adam(lr=learning_rate, clipnorm=5.0) model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy']) model.load_weights(args.weightpath) print "Model loaded..." ################ Evaluating on Validation data ################ print "Validation data loading" textdatapath = args.dev_json processed_data = args.tok_dev_json
def main(mode='test', question=None, answers=None):
    """
    This function is used to train, predict or test

    Args:
        mode (str): train/predict/test
        question (str): this contains the question
        answers (list): this contains list of answers in string format

    Returns:
        index (integer): index of the most likely answer. Only the 'test'
        mode returns a value; the other modes return None.
    """
    # get the train and predict model
    vocabulary = Vocabulary("./data/vocab_all.txt")
    embedding_file = "./data/word2vec_100_dim.embeddings"
    qa_model = QAModel()
    train_model, predict_model = qa_model.get_bilstm_model(
        embedding_file, len(vocabulary))

    epoch = 1
    if mode == 'train':
        for i in range(epoch):
            print('Training epoch', i)

            # load training data
            qa_data = QAData()
            questions, good_answers, bad_answers = qa_data.get_training_data()

            # train the model; the targets are all zeros
            Y = np.zeros(shape=(questions.shape[0], ))
            train_model.fit([questions, good_answers, bad_answers],
                            Y,
                            epochs=1,
                            batch_size=64,
                            validation_split=0.1,
                            verbose=1)

            # Save under the 1-based number of the epoch just finished. The
            # original always used the epoch *total*, so every epoch would
            # overwrite the same file once the total is raised above 1 (for
            # the current total of 1 the file names are unchanged).
            tag = str(i + 1)
            train_model.save_weights('model/train_weights_epoch_' + tag + '.h5',
                                     overwrite=True)
            predict_model.save_weights('model/predict_weights_epoch_' + tag + '.h5',
                                       overwrite=True)

    elif mode == 'predict':
        # load the evaluation data (the handle is closed by the with-block)
        # SECURITY: pickle.load can execute arbitrary code; trusted files only.
        with open("./data/dev.pkl", 'rb') as dev_file:
            data = pickle.load(dev_file)
        random.shuffle(data)

        # load weights from trained model
        qa_data = QAData()
        predict_model.load_weights('model/lstm_predict_weights_epoch_1.h5')

        c = 0
        c1 = 0
        for i, d in enumerate(data):
            print(i, len(data))

            # pad the data and get it in desired format
            indices, answers, question = qa_data.process_data(d)

            # get the similarity score
            sims = predict_model.predict([question, answers])

            # assumes the first n_good rows of sims belong to d['good']
            # -- TODO confirm against QAData.process_data
            n_good = len(d['good'])
            max_r = np.argmax(sims)
            max_n = np.argmax(sims[:n_good])
            r = rankdata(sims, method='max')

            # counts the samples whose overall best score is among the first n_good
            c += 1 if max_r == max_n else 0
            c1 += 1 / float(r[max_r] - r[max_n] + 1)

        precision = c / float(len(data))
        mrr = c1 / float(len(data))
        print("Precision", precision)
        print("MRR", mrr)

    elif mode == 'test':
        # question and answers come from params
        qa_data = QAData()
        answers, question = qa_data.process_test_data(question, answers)

        # load weights from the trained model
        predict_model.load_weights('model/lstm_predict_weights_epoch_1.h5')

        # get similarity score
        sims = predict_model.predict([question, answers])
        max_r = np.argmax(sims)
        return max_r
if os.path.isfile(baseexp + '/results.txt'): mode = 'a' else: mode = 'w' with open(baseexp + '/results.txt', mode) as fp: fp.write("######RESULTS######\n") # Initializations for tracking the best model prev_best_em = 0.0 prev_best_f1 = 0.0 prev_best_epoch = args.initial_epoch - 1 logging.info('=' * 100) for epoch in range(args.initial_epoch, args.num_epoch): logging.info("Epoch: %s", str(epoch)) G = QAModel(args) model = G.create_model_graph() logging.info("Compiling model..") # print "Learning rate:", args.learning_rate model = G.compile_model(model) logging.info("Model compiled..") exp = baseexp + "/epoch" + str(epoch) if not os.path.isdir(exp): os.makedirs(exp) if args.pretrained_weightpath is not None and epoch == args.initial_epoch: logging.info("Loading a pretrained weight") model.load_weights(args.pretrained_weightpath) logging.info("Evaluating the pretrained model")
@author: tarun """ from data import QAData, Vocabulary from model import QAModel import pickle import numpy as np import random from keras.models import Model import matplotlib.pyplot as plt vocabulary = Vocabulary("./data/vocab_all.txt") embedding_file = "./data/word2vec_100_dim.embeddings" qa_model = QAModel() train_model, predict_model = qa_model.get_lstm_cnn_model(embedding_file, len(vocabulary)) # layer_outputs = [predict_model.layers[0].output, predict_model.layers[1].output, predict_model.layers[2].layers[0].output, predict_model.layers[2].layers[0].output, # predict_model.layers[2].layers[1].output, predict_model.layers[2].layers[2].get_output_at(0), # predict_model.layers[2].layers[2].get_output_at(1), predict_model.layers[2].layers[3].get_output_at(0), # predict_model.layers[2].layers[3].get_output_at(1), predict_model.layers[2].layers[4].get_output_at(0), # predict_model.layers[2].layers[4].get_output_at(1), predict_model.layers[2].layers[5].output, # predict_model.layers[2].layers[6].output, predict_model.layers[2].layers[7].get_output_at(0), # predict_model.layers[2].layers[7].get_output_at(1), predict_model.layers[2].layers[8].get_output_at(0), # predict_model.layers[2].layers[8].get_output_at(1), predict_model.layers[2].layers[9].get_output_at(0), # predict_model.layers[2].layers[9].get_output_at(1), predict_model.layers[2].layers[10].get_output_at(0), # predict_model.layers[2].layers[10].get_output_at(1), predict_model.layers[2].layers[11].output, # predict_model.layers[2].layers[11].output, predict_model.layers[2].layers[13].get_output_at(0), # predict_model.layers[2].layers[13].get_output_at(1), predict_model.layers[2].layers[14].output]
def main(mode='test'):
    """Train the LSTM-CNN model or evaluate saved weights on the dev set.

    Args:
        mode (str): 'train' fits the training model with early stopping and
            checkpointing and then writes weight files; 'predict' evaluates
            each listed weight file on ``./data/dev.pkl`` and prints
            precision and MRR. Any other value (including the default
            'test') only builds the models.
    """
    # get the train and predict model
    vocabulary = Vocabulary("./data/vocab_all.txt")
    embedding_file = "./data/word2vec_100_dim.embeddings"
    qa_model = QAModel()
    train_model, predict_model = qa_model.get_lstm_cnn_model(embedding_file, len(vocabulary))

    epo = 100
    if mode == 'train':
        # load training data
        qa_data = QAData()
        questions, good_answers, bad_answers = qa_data.get_training_data()

        callbacks = [
            EarlyStopping(monitor='val_loss', patience=20),
            ModelCheckpoint(filepath='best_model.h5', monitor='val_loss', save_best_only=True),
        ]

        # train the model; the targets are all zeros
        Y = np.zeros(shape=(questions.shape[0],))
        train_model.fit([questions, good_answers, bad_answers], Y, epochs=epo, batch_size=64,
                        validation_split=0.1, verbose=1, callbacks=callbacks)

        # save the trained model
        model = keras.models.load_model('best_model.h5')
        model.save_weights('model/best_weights_epoch_' + str(epo) + '.h5', overwrite=True)
        predict_model.save_weights('model/predict_weights_epoch_' + str(epo) + '.h5', overwrite=True)

    elif mode == 'predict':
        # load the evaluation data (the handle is closed by the with-block)
        # SECURITY: pickle.load can execute arbitrary code; trusted files only.
        with open("./data/dev.pkl", 'rb') as dev_file:
            data = pickle.load(dev_file)
        random.shuffle(data)

        # load weights from trained model
        qa_data = QAData()
        # NOTE(review): the checkpoint callback above writes 'best_model.h5'
        # in the working directory, while this list reads
        # 'model/best_model.h5' -- confirm the intended location.
        model_filenames = ['model/best_model.h5', 'model/predict_weights_epoch_' + str(epo) + '.h5']

        for model_name in model_filenames:
            predict_model.load_weights(model_name)

            c = 0
            c1 = 0
            for i, d in enumerate(data):
                if i % 100 == 0:
                    print(i, len(data))

                # pad the data and get it in desired format
                indices, answers, question = qa_data.process_data(d)

                # get the similarity score
                sims = predict_model.predict([question, answers])

                # assumes the first n_good rows of sims belong to d['good']
                # -- TODO confirm against QAData.process_data
                n_good = len(d['good'])
                max_r = np.argmax(sims)
                max_n = np.argmax(sims[:n_good])
                r = rankdata(sims, method='max')

                c += 1 if max_r == max_n else 0
                c1 += 1 / float(r[max_r] - r[max_n] + 1)

            precision = c / float(len(data))
            mrr = c1 / float(len(data))
            print(f'Results for: model: {model_name}')
            print("Precision", precision)
            print("MRR", mrr)
def __init__(self):
    """Create a ``QAModel`` with its default arguments and keep it on ``self.qa_model``."""
    self.qa_model = QAModel()
def _prepare_qa_batch(context, question, ans_start, ans_end, device):
    """Tokenize one batch and build its position mask and span targets on *device*."""
    pt = tokenizer(context, question, return_tensors='pt')
    for key in ('input_ids', 'token_type_ids', 'attention_mask'):
        pt[key] = pt[key].to(device)

    # Positions 466..511 are excluded from the span scores.
    # NOTE(review): 512 / 466 look like the padded sequence length and the
    # context budget -- confirm against the tokenizer configuration.
    mask = torch.zeros(len(context), 512).bool()
    mask[:, 466:] = True
    mask = mask.to(device)

    # One (start, end) pair per sample.
    target = torch.cat((ans_start.unsqueeze(1), ans_end.unsqueeze(1)), dim=1).to(device)
    return pt, mask, target


def _masked_span_logits(model, pt, mask):
    """Run *model* on *pt* and set the masked positions of both channels to -inf."""
    output = model(pt)
    output[:, :, 0].masked_fill_(mask, float('-inf'))
    output[:, :, 1].masked_fill_(mask, float('-inf'))
    return output


def main():
    """Train ``QAModel`` for a fixed number of epochs, saving a checkpoint per epoch.

    Each epoch runs one pass over the training list and one over the dev
    list, prints the average losses and writes ``./model/<epoch+1>.pt`` with
    the keys 'epoch', 'model_state_dict', 'optimizer_state_dict' and 'loss'.
    """
    import os  # local import: only needed to create the checkpoint directory

    torch.manual_seed(94)
    batch_size = 4

    train_data_list = load_data_list('./train_data_list')
    train_set = QADataset(train_data_list)
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)

    dev_data_list = load_data_list('./dev_data_list')
    dev_set = QADataset(dev_data_list)
    dev_loader = DataLoader(dev_set, batch_size=batch_size, shuffle=True)

    model = QAModel()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    EPOCHS = 6
    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=5e-5)
    loss = 0

    # torch.save does not create missing directories.
    os.makedirs('./model', exist_ok=True)

    for epoch in range(EPOCHS):
        print(f'epoch: {epoch}')

        # ---- training pass ----
        cum_loss = 0
        ite = 0
        model.train()
        for _, context, question, _, ans_start, ans_end in tqdm(train_loader):
            ite += 1
            pt, mask, target = _prepare_qa_batch(context, question, ans_start, ans_end, device)
            optimizer.zero_grad()
            output = _masked_span_logits(model, pt, mask)
            loss = loss_fn(output, target)
            cum_loss += float(loss)
            loss.backward()
            optimizer.step()
        print(cum_loss)

        # ---- dev pass (no gradients) ----
        model.eval()
        dev_loss = 0
        dev_ite = 0
        for _, context, question, _, ans_start, ans_end in tqdm(dev_loader):
            dev_ite += 1
            with torch.no_grad():
                pt, mask, target = _prepare_qa_batch(context, question, ans_start, ans_end, device)
                output = _masked_span_logits(model, pt, mask)
                loss = loss_fn(output, target)
            dev_loss += float(loss)

        # An empty loader now reports nan instead of raising ZeroDivisionError.
        avg_train_loss = cum_loss / ite if ite else float('nan')
        avg_dev_loss = dev_loss / dev_ite if dev_ite else float('nan')
        print('avg_train_loss: {}, avg_dev_loss: {}'.format(avg_train_loss, avg_dev_loss))

        SAVED_MDL_PATH = './model/' + str(epoch + 1) + '.pt'
        torch.save(
            {
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                # loss of the last batch processed in this epoch
                'loss': loss
            }, SAVED_MDL_PATH)
        print('model {} saved'.format(SAVED_MDL_PATH))
def predict(MDL_PATH, DATA_PATH):
    """Predict an answer span per question and write them to ``prediction.json``.

    Args:
        MDL_PATH: checkpoint file; must contain a ``'model_state_dict'`` entry.
        DATA_PATH: path passed to ``load_data_list`` to obtain the samples.

    The output file maps each question id to the decoded answer text, or to
    an empty string when the best span starts at position 0 or no candidate
    span exists.
    """
    batch_size = 4
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = QAModel()
    model.to(device)
    # map_location lets a checkpoint saved on a GPU load on a CPU-only host.
    # The optimizer state in the checkpoint is not needed for inference and
    # is no longer restored.
    checkpoint = torch.load(MDL_PATH, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()

    dev_data_list = load_data_list(DATA_PATH)
    dev_set = QADataset(dev_data_list)
    dev_loader = DataLoader(dev_set, batch_size=batch_size, shuffle=False)

    print('run prediction')
    dic = {}
    for question_id, context, question, _, _, _ in tqdm(dev_loader):
        # Positions 466..511 are excluded from the span scores.
        # NOTE(review): 512 / 466 look like the padded sequence length and
        # the context budget -- confirm against the tokenizer configuration.
        mask = torch.zeros(len(context), 512).bool()
        mask[:, 466:] = True
        mask = mask.to(device)

        with torch.no_grad():
            pt = tokenizer(context, question, return_tensors='pt')
            for key in ('input_ids', 'token_type_ids', 'attention_mask'):
                pt[key] = pt[key].to(device)
            output = model(pt)  # shape (batch_size, 512, 2)
            output[:, :, 0].masked_fill_(mask, float('-inf'))
            output[:, :, 1].masked_fill_(mask, float('-inf'))

        for batch_idx, sample in enumerate(output):  # sample: shape (512, 2)
            # Top-30 start / end positions, converted to plain Python
            # numbers once so the pairing loop below does no tensor work.
            start_scores, start_pos = (t.tolist() for t in torch.topk(sample[:, 0], k=30))
            end_scores, end_pos = (t.tolist() for t in torch.topk(sample[:, 1], k=30))

            candidates = []
            for s_score, s in zip(start_scores, start_pos):
                for e_score, e in zip(end_scores, end_pos):
                    # Accept the (0, 0) span, or a forward span of at most
                    # 30 positions.
                    if (s == 0 and e == 0) or (s < e and e - s <= 30):
                        candidates.append((s_score + e_score, (s, e)))

            if not candidates:
                # The original indexed results[0] and raised IndexError here.
                dic[question_id[batch_idx]] = ""
                continue

            # Same pick as sort() + reverse() + [0]: highest score, ties
            # broken by the larger (start, end) pair.
            _, (s, e) = max(candidates)
            if s == 0:
                # presumably position 0 stands for "unanswerable" -- confirm
                dic[question_id[batch_idx]] = ""
            else:
                ids = pt['input_ids'][batch_idx][s:e]
                dic[question_id[batch_idx]] = tokenizer.decode(ids).replace(" ", "")

    with open('prediction.json', 'w') as fp:
        json.dump(dic, fp)
torch.set_num_threads(args.num_threads) train_set, dev_set, test_set = 'train-all', 'raw-dev', 'raw-test' if args.train: train_set, dev_set, test_set = 'train', 'clean-dev', 'clean-test' # cache word embeddings cache_file = os.path.splitext(args.word_vectors_file)[0] + '.cache' utils.cache_word_embeddings(args.word_vectors_file, cache_file) vocab_size, vec_dim = utils.load_embedding_dimensions(cache_file) # instantiate model net = QAModel(vec_dim, args.filter_width, args.num_conv_filters, args.no_ext_feats, cuda=args.cuda) # initialize the trainer trainer = Trainer(net, args.eta, args.mom, args.no_loss_reg, vec_dim, args.cuda) logger.info("Loading input data...") # load input data trainer.load_input_data(args.dataset_folder, cache_file, train_set, dev_set, test_set) logger.info("Setting up external features...") # setup external features # TODO: remember to update args.* in testing loop below if args.paper_ext_feats: logger.info("--paper-ext-feats")
torch.manual_seed(1234) np.random.seed(1234) train_set, dev_set, test_set = 'train', 'clean-dev', 'clean-test' if args.train_all: train_set, dev_set, test_set = 'train-all', 'raw-dev', 'raw-test' # cache word embeddings cache_file = os.path.splitext(args.word_vectors_file)[0] + '.cache' utils.cache_word_embeddings(args.word_vectors_file, cache_file) vocab_size, vec_dim = utils.load_embedding_dimensions(cache_file) # instantiate model net = QAModel(vec_dim, args.filter_width, args.num_conv_filters, args.no_ext_feats) #filter width is 5 QAModel.save(net, args.dataset_folder, args.model_fname) torch.set_num_threads(args.num_threads) trainer = Trainer(net, args.eta, args.mom, args.no_loss_reg, vec_dim) logger.info("Loading input data...") trainer.load_input_data(args.dataset_folder, cache_file, train_set, dev_set, test_set) best_map = 0.0 best_model = 0 for i in range(args.epochs): logger.info('------------- Training epoch {} --------------'.format(i+1)) train_accuracy = trainer.train(train_set, args.batch_size, args.debugSingleBatch) if args.debugSingleBatch: sys.exit(0)