def infer(save_model_path, test_id_path, test_word_path, test_label_path,
          word_dict_path=None, label_dict_path=None, save_pred_path=None,
          batch_size=64, dropout=0.5, embedding_dim=100, rnn_hidden_dim=200,
          maxlen=300):
    """Run the saved BiLSTM-CRF model over the test set and write predictions.

    Loads the word and label dictionaries, vectorizes and pads the test
    words to ``maxlen``, restores the model from ``save_model_path``,
    predicts a label index per token, and hands everything to
    ``save_preds`` for output.
    """
    # Sample ids plus forward/reverse vocab lookups from the dict files.
    test_ids = load_test_id(test_id_path)
    word_ids_dict = load_dict(word_dict_path)
    ids_word_dict = load_reverse_dict(word_dict_path)
    label_ids_dict = load_dict(label_dict_path)
    ids_label_dict = load_reverse_dict(label_dict_path)
    # Tokens -> indices, then pad every sequence to a fixed length.
    word_ids = vectorize_data(test_word_path, word_ids_dict)
    word_seq = pad_sequence(word_ids, maxlen)
    # Rebuild the architecture and load the trained weights from disk.
    model = load_model(word_ids_dict, label_ids_dict, embedding_dim,
                       rnn_hidden_dim, dropout, save_model_path)
    # argmax over the last axis converts per-label scores to label indices.
    probs = model.predict(word_seq, batch_size=batch_size, verbose=0).argmax(-1)
    assert len(probs) == len(word_seq)
    print('probs.shape:', probs.shape)
    # Raw words/labels are reloaded line-by-line for the prediction report.
    test_words = load_test_id(test_word_path)
    test_labels = load_test_id(test_label_path)
    save_preds(probs, test_ids, word_seq, ids_word_dict, label_ids_dict,
               ids_label_dict, save_pred_path, test_words, test_labels)
def train(train_word_path=None, train_label_path=None, word_dict_path=None,
          label_dict_path=None, save_model_path=None, batch_size=64,
          dropout=0.5, epoch=10, embedding_dim=100, rnn_hidden_dim=200,
          maxlen=300, cutoff_frequency=0):
    """
    Train the bilstm_crf model for grammar correction.
    """
    # Build vocab from the training words; UNK/PAD are forced into it so
    # lookup never fails at inference time. Labels get their own dict.
    build_dict(train_word_path, word_dict_path, cutoff_frequency,
               insert_extra_words=[UNK_TOKEN, PAD_TOKEN])
    build_dict(train_label_path, label_dict_path)
    # Load the dictionaries just written.
    word_ids_dict = load_dict(word_dict_path)
    label_ids_dict = load_dict(label_dict_path)
    # Convert raw text files into id sequences.
    word_ids = vectorize_data(train_word_path, word_ids_dict)
    label_ids = vectorize_data(train_label_path, label_ids_dict)
    # Diagnostic only: report the longest unpadded sequence.
    max_len = np.max([len(seq) for seq in word_ids])
    print('max_len:', max_len)
    # Pad both streams to the same fixed length.
    word_seq = pad_sequence(word_ids, maxlen=maxlen)
    label_seq = pad_sequence(label_ids, maxlen=maxlen)
    # CRF layer wants labels shaped (samples, timesteps, 1).
    label_seq = np.reshape(label_seq,
                           (label_seq.shape[0], label_seq.shape[1], 1))
    print(word_seq.shape)
    print(label_seq.shape)
    logger.info("Data loaded.")
    logger.info("Training BILSTM_CRF model...")
    model = create_model(word_ids_dict, label_ids_dict, embedding_dim,
                         rnn_hidden_dim, dropout)
    # Checkpoint/logging callbacks built by the project helper.
    callbacks_list = callback(save_model_path, logger)
    # 20% of the data is held out for validation each epoch.
    model.fit(word_seq, label_seq, batch_size=batch_size, epochs=epoch,
              validation_split=0.2, callbacks=callbacks_list)
    logger.info("Training has finished.")
def infer(save_model_path, test_id_path, test_word_path, test_label_path,
          word_dict_path=None, label_dict_path=None, save_pred_path=None,
          batch_size=64, dropout=0.5, embedding_dim=100, rnn_hidden_dim=200,
          maxlen=300):
    """Predict labels for the test set with a saved model and save them.

    NOTE(review): this re-defines ``infer`` and shadows an earlier,
    functionally identical definition in this file — consider keeping
    only one copy.
    """
    # Sample ids and both directions of the word/label vocabularies.
    test_ids = load_test_id(test_id_path)
    word_ids_dict = load_dict(word_dict_path)
    ids_word_dict = load_reverse_dict(word_dict_path)
    label_ids_dict = load_dict(label_dict_path)
    ids_label_dict = load_reverse_dict(label_dict_path)
    # Vectorize the test words and pad to the training length.
    word_seq = pad_sequence(vectorize_data(test_word_path, word_ids_dict),
                            maxlen)
    # Restore the trained model from its checkpoint file.
    model = load_model(word_ids_dict, label_ids_dict, embedding_dim,
                       rnn_hidden_dim, dropout, save_model_path)
    # Per-token label indices via argmax over the label dimension.
    probs = model.predict(word_seq, batch_size=batch_size, verbose=0).argmax(-1)
    assert len(probs) == len(word_seq)
    print('probs.shape:', probs.shape)
    # Original words/labels are reread for the human-readable output.
    test_words = load_test_id(test_word_path)
    test_labels = load_test_id(test_label_path)
    save_preds(probs, test_ids, word_seq, ids_word_dict, label_ids_dict,
               ids_label_dict, save_pred_path, test_words, test_labels)