    tokenizer = load_tokenizer(args.tokenizer_path)
    num_vocab = tokenizer.get_vocab_size()

    logger.info("build model")
    model = setup_model(num_vocab, args.emb_size, args.hid_size, args.num_class)

    logger.info("load model")
    model = load_model(model, args.model_path)

    x = encode_input(tokenizer, args.q1, args.q2)

    logger.info("predict the label")
    y_pred = model(x)
    y_pred = y_pred.argmax().unsqueeze(dim=0)
    print(decode_label(y_pred.detach().numpy()))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-path", help="model path", required=True)
    parser.add_argument("--tokenizer-path", help="tokenizer path saved from train", required=True)
    parser.add_argument("--q1", help="question 1", required=True)
    parser.add_argument("--q2", help="question 2", required=True)
    parser.add_argument(
        "--emb-size", help="embedding size for embedding layer", default=512, type=int)
    parser.add_argument(
        "--hid-size", help="hidden size in lstm", default=512, type=int)
    parser.add_argument(
        "--num-class", help="number of class target", default=2, type=int)
    args = parser.parse_args()

    logger = log(path="logs/", file="lstm.log")
    main(args)
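The decode_label helper is defined elsewhere in the project; as a rough sketch of what it does for this binary task (the label names below are illustrative, not necessarily the ones used in training):

import numpy as np

# Hypothetical sketch of decode_label, assuming class index 1 means "duplicate".
LABELS = {0: "not duplicate", 1: "duplicate"}

def decode_label(pred):
    # pred arrives as a 1-element numpy array holding the predicted class index
    return LABELS[int(np.asarray(pred).ravel()[0])]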
    w2v_model.train(
        sentences=sent,
        total_examples=w2v_model.corpus_count,
        epochs=20,
        report_delay=1,
        compute_loss=True,         # set compute_loss = True
        callbacks=[callback()])    # add the callback class

    logger.info("Save word embedding pickle")
    w2v = dict(zip(w2v_model.wv.index2word, w2v_model.wv.vectors))
    with open(model_path + 'w2v_embed.pkl', 'wb') as file:
        pickle.dump(w2v, file)

    logger.info("Save Word2Vec model")
    w2v_model.save(model_path + 'word2vec.model')

    # quick sanity check of the trained embeddings
    w2v_model.wv.most_similar(['dog'])


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--clean_data', type=str,
                        default='../data/clean_quora_duplicate_questions.csv')
    parser.add_argument('--model_path', type=str, default='model/')
    opt = parser.parse_args()

    logger = log(path="logs/", file="word2vec.log")
    main(opt.clean_data, opt.model_path)
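The callback class registered in train() is not shown in this snippet; a minimal sketch of what it might look like, built on gensim's CallbackAny2Vec interface (get_latest_training_loss returns a cumulative value, so this prints the running total rather than a per-epoch loss):

from gensim.models.callbacks import CallbackAny2Vec

class callback(CallbackAny2Vec):
    """Print the running training loss at the end of every epoch."""

    def __init__(self):
        self.epoch = 0

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()  # cumulative loss so far
        print(f"Epoch {self.epoch}: cumulative loss = {loss}")
        self.epoch += 1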
    if os.path.exists(model_path):
        model = load(model_path)
        return model
    else:
        raise ValueError("model not found")


def main(model_path, q1, q2):
    logger.info("load model")
    vectorizer_1, vectorizer_2, model = load_model(model_path)

    logger.info("text cleansing")
    q1 = clean_text(q1)
    q2 = clean_text(q2)

    logger.info("text transformation")
    vec_q1 = vectorizer_1.transform(np.array([q1]))
    vec_q2 = vectorizer_2.transform(np.array([q2]))
    questions = concat(vec_q1, vec_q2)

    logger.info("predict the label")
    y_pred = model.predict(questions)
    print(decode_label(y_pred))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-path", help="model path", required=True)
    parser.add_argument("--q1", help="question 1", required=True)
    parser.add_argument("--q2", help="question 2", required=True)
    args = parser.parse_args()

    logger = log(path="logs/", file="ensemble.log")
    main(args.model_path, args.q1, args.q2)
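The concat helper is assumed to stack the two transformed question vectors into a single feature row; a minimal sketch, assuming the vectorizers produce scipy sparse matrices (e.g. TF-IDF):

from scipy.sparse import hstack

def concat(vec_q1, vec_q2):
    # Put the two question vectors side by side so the classifier sees one
    # combined feature row per pair; hstack keeps the matrices sparse.
    return hstack([vec_q1, vec_q2])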
        f1 = f1_score(y_val, y_pred)
        prec = precision_score(y_val, y_pred)

        logger.info('KFold -%s Accuracy: %s', kf, accuracy)
        logger.info('KFold -%s F1: %s', kf, f1)
        logger.info('KFold -%s Precision: %s', kf, prec)

        total_acc.append(accuracy)
        total_f1.append(f1)
        total_prec.append(prec)
        logger.info('--------------------------------')

    logger.info('==================================')
    logger.info('Performance')
    logger.info('Accuracy: %s', sum(total_acc)/5)
    logger.info('F1: %s', sum(total_f1)/5)
    logger.info('Precision: %s', sum(total_prec)/5)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--clean_data', type=str,
                        default='../data/clean_quora_duplicate_questions.csv')
    parser.add_argument('--kfold_data', type=str,
                        default='../data/cross_validation_data')
    parser.add_argument('--word_embed', type=str, default='model/w2v_embed.pkl')
    opt = parser.parse_args()

    logger = log(path="logs/", file="word2vec_cosine.log")
    main(opt.clean_data, opt.kfold_data, opt.word_embed)
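The per-fold predictions scored above come from the word2vec + cosine-similarity approach; purely as an illustration of that idea (the actual averaging and threshold used in the run are defined elsewhere in the script), the decision rule can be sketched as:

import numpy as np

def cosine_similarity(u, v):
    # Cosine similarity between two averaged question embeddings.
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v) + 1e-9))

def predict_duplicate(q1_vec, q2_vec, threshold=0.8):
    # 1 = duplicate, 0 = not duplicate; the 0.8 threshold is illustrative only.
    return int(cosine_similarity(q1_vec, q2_vec) >= threshold)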
    model.eval()
    with torch.no_grad():
        logits = model(input_id,
                       token_type_ids=token_type_id,
                       attention_mask=attention_mask)
        logits = logits[0].detach().cpu().numpy()

    pred = np.argmax(logits, axis=1)
    pred_label = decode_label(pred)
    print("Result Prediction: ", pred_label)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_path', type=str, default='model/')
    parser.add_argument('--q1', type=str, default='')
    parser.add_argument('--q2', type=str, default='')
    parser.add_argument('--test_data', type=str, default='../data/test.tsv')
    parser.add_argument('--batch_size', type=int, default=32)
    opt = parser.parse_args()

    logger = log(path="logs/", file="bert_inference.log")

    device_type = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device_type)
    logger.debug(device)

    single_infer(opt.model_path, opt.q1, opt.q2)
    # main(opt.model_path, opt.test_data, opt.batch_size)
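single_infer has to turn the question pair into the three tensors consumed by the forward pass above; a minimal sketch of that encoding step, assuming a standard Hugging Face BertTokenizer (bert-base-uncased and the 128-token limit are illustrative choices, not necessarily the ones used by this project):

import torch
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
q1 = "How do I learn Python?"                     # example inputs
q2 = "What is the best way to learn Python?"
encoded = tokenizer.encode_plus(
    q1, q2,                     # the two questions form one sentence pair
    max_length=128,
    padding="max_length",
    truncation=True,
    return_tensors="pt")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_id = encoded["input_ids"].to(device)
token_type_id = encoded["token_type_ids"].to(device)
attention_mask = encoded["attention_mask"].to(device)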