def create_data_2_train(train_path, dev_path, test_path, char_embedd_dim):
    """Build padded train/dev/test tensors for the POS-tagging model.

    Loads CoNLL-format word/label sentences from the three paths, creates a
    label alphabet and a character alphabet (both saved under
    'pre-trained-model/pos'), and pads everything to the maximum sentence
    length / word length observed across all three splits.

    NOTE(review): this function is re-defined with the same name later in
    this file (the NER variant); this earlier definition is shadowed at
    import time — confirm which one callers actually need.

    NOTE(review): relies on module-level globals defined elsewhere in the
    file: load_conll_data, utils, LabelEncoder, word_end, unknown_embedd,
    embedding_words, embedding_vectors, embedd_dim.

    :param train_path: path to the CoNLL-format training file
    :param dev_path: path to the CoNLL-format development file
    :param test_path: path to the CoNLL-format test file
    :param char_embedd_dim: dimensionality of the character embedding table
    :return: tuple of word/char/mask/label tensors for the three splits,
        the label and character alphabets, the padding lengths, the
        character embedding table, num_labels and num_data (train size)
    """
    # Raw word and label sequences per split.
    word_sentences_train, label_sentences_train = load_conll_data(train_path)
    word_sentences_dev, label_sentences_dev = load_conll_data(dev_path)
    word_sentences_test, label_sentences_test = load_conll_data(test_path)
    # Pad every split to the single longest sentence across all splits so
    # the tensors are shape-compatible.
    max_length_train = utils.get_max_length(word_sentences_train)
    max_length_dev = utils.get_max_length(word_sentences_dev)
    max_length_test = utils.get_max_length(word_sentences_test)
    max_length = max(max_length_train, max_length_dev, max_length_test)
    # Label alphabet is built open (growing) on train, then closed for
    # dev/test so unseen labels cannot extend it.
    label_sentences_id_train, alphabet_label = utils.map_string_2_id_open(
        label_sentences_train, 'pos')
    alphabet_label.save('pre-trained-model/pos', name='alphabet_label')
    label_sentences_id_dev = utils.map_string_2_id_close(
        label_sentences_dev, alphabet_label)
    label_sentences_id_test = utils.map_string_2_id_close(
        label_sentences_test, alphabet_label)
    # Word-level embedding tensors plus per-token masks and label tensors.
    word_train, label_train, mask_train = \
        utils.construct_tensor_word(word_sentences_train,
                                    label_sentences_id_train,
                                    unknown_embedd, embedding_words,
                                    embedding_vectors, embedd_dim,
                                    max_length)
    word_dev, label_dev, mask_dev = \
        utils.construct_tensor_word(word_sentences_dev,
                                    label_sentences_id_dev,
                                    unknown_embedd, embedding_words,
                                    embedding_vectors, embedd_dim,
                                    max_length)
    word_test, label_test, mask_test = \
        utils.construct_tensor_word(word_sentences_test,
                                    label_sentences_id_test,
                                    unknown_embedd, embedding_words,
                                    embedding_vectors, embedd_dim,
                                    max_length)
    # Character alphabet: seeded with the word-end marker, populated from
    # the training words only, then closed BEFORE indexing dev/test so
    # unseen characters map to the unknown entry.
    alphabet_char = LabelEncoder('char')
    alphabet_char.get_index(word_end)
    index_sentences_train, max_char_length_train = utils.get_character_indexes(
        word_sentences_train, alphabet_char)
    alphabet_char.close()
    char_embedd_table = utils.build_char_embedd_table(char_embedd_dim,
                                                      alphabet_char)
    alphabet_char.save('pre-trained-model/pos', name='alphabet_char')
    index_sentences_dev, max_char_length_dev = utils.get_character_indexes(
        word_sentences_dev, alphabet_char)
    index_sentences_test, max_char_length_test = utils.get_character_indexes(
        word_sentences_test, alphabet_char)
    # Pad characters to the longest word across all splits.
    max_char_length = max(max_char_length_train, max_char_length_dev,
                          max_char_length_test)
    char_train = utils.construct_tensor_char(index_sentences_train,
                                             max_length, max_char_length,
                                             alphabet_char)
    char_dev = utils.construct_tensor_char(index_sentences_dev, max_length,
                                           max_char_length, alphabet_char)
    char_test = utils.construct_tensor_char(index_sentences_test, max_length,
                                            max_char_length, alphabet_char)
    # -1: presumably excludes a reserved padding/default label — confirm
    # against the LabelEncoder implementation.
    num_labels = alphabet_label.size() - 1
    num_data = word_train.shape[0]
    return word_train, word_dev, word_test, char_train, char_dev, char_test, mask_train, mask_dev, mask_test, \
        label_train, label_dev, label_test, alphabet_label, alphabet_char, max_length, max_char_length, \
        char_embedd_table, num_labels, num_data
def load_data_pos(sents, unknown_embedd, embedd_words, embedd_vectors, embedd_dim, max_sent_length, max_char_length, alphabet_char):
    """Build inference-time tensors (words, masks, chars) for POS tagging.

    Converts raw sentences into a padded word-embedding tensor with its
    mask, plus a padded character-index tensor, using the already-built
    (and closed) character alphabet.

    NOTE(review): utils.construct_tensor_word is invoked here without a
    label-id argument, unlike the training builders — assumes the helper
    supports this arity; confirm against utils.

    :param sents: list of tokenized sentences
    :param unknown_embedd: embedding vector used for out-of-vocabulary words
    :param embedd_words: vocabulary of pre-trained embeddings
    :param embedd_vectors: pre-trained embedding vectors
    :param embedd_dim: word-embedding dimensionality
    :param max_sent_length: padding length for sentences
    :param max_char_length: padding length for words (in characters)
    :param alphabet_char: trained character alphabet
    :return: (word tensor, mask tensor, character tensor)
    """
    word_tensor, mask_tensor = utils.construct_tensor_word(
        sents, unknown_embedd, embedd_words, embedd_vectors, embedd_dim,
        max_sent_length)
    char_index_seqs = utils.get_character_indexes(
        sents, alphabet_char, max_char_length)
    char_tensor = utils.construct_tensor_char(
        char_index_seqs, max_sent_length, max_char_length, alphabet_char)
    return word_tensor, mask_tensor, char_tensor
def load_data_chunk(sents, pos_sents, unknown_embedd, embedd_words, embedd_vectors, embedd_dim, max_sent_length, max_char_length, alphabet_char, alphabet_pos):
    """Build inference-time tensors for chunking: word+POS features, masks, chars.

    The word tensor is augmented along the feature axis with a one-hot
    encoding of the POS tags, mirroring how the chunking model was trained.

    :param sents: list of tokenized sentences
    :param pos_sents: POS-tag sequences aligned with ``sents``
    :param unknown_embedd: embedding vector used for out-of-vocabulary words
    :param embedd_words: vocabulary of pre-trained embeddings
    :param embedd_vectors: pre-trained embedding vectors
    :param embedd_dim: word-embedding dimensionality
    :param max_sent_length: padding length for sentences
    :param max_char_length: padding length for words (in characters)
    :param alphabet_char: trained character alphabet
    :param alphabet_pos: trained (closed) POS alphabet
    :return: (word+POS feature tensor, mask tensor, character tensor)
    """
    word_tensor, mask_tensor = utils.construct_tensor_word(
        sents, unknown_embedd, embedd_words, embedd_vectors, embedd_dim,
        max_sent_length)
    # Map POS strings through the closed alphabet, then one-hot encode and
    # append along the feature (last) axis.
    pos_id_seqs = utils.map_string_2_id_close(pos_sents, alphabet_pos)
    pos_onehot = utils.construct_tensor_onehot(
        pos_id_seqs, max_sent_length, alphabet_pos.size())
    feature_tensor = np.concatenate((word_tensor, pos_onehot), axis=2)
    char_index_seqs = utils.get_character_indexes(
        sents, alphabet_char, max_char_length)
    char_tensor = utils.construct_tensor_char(
        char_index_seqs, max_sent_length, max_char_length, alphabet_char)
    return feature_tensor, mask_tensor, char_tensor
def create_data_2_train(train_path, dev_path, test_path, char_embedd_dim):
    """Build padded train/dev/test tensors for the NER model.

    Loads CoNLL-format word/POS/chunk/label sentences from the three paths,
    creates POS, chunk, label and character alphabets (saved under
    'pre-trained-model/ner'), one-hot encodes POS and chunk tags and
    concatenates them onto the word-embedding features, and pads everything
    to the maximum sentence/word lengths observed across all splits.

    Fix: removed leftover debugging residue — the commented-out
    ``print(np.shape(...))`` wall and the unconditional
    ``np.save("word_train.npy", ...)`` / ``np.save("label_train.npy", ...)``
    / ``print("Done")`` calls that dumped large arrays into the current
    working directory on every invocation.

    NOTE(review): this re-defines ``create_data_2_train`` (the POS variant
    appears earlier in this file); only this later definition survives at
    import time — confirm which one callers actually need.

    NOTE(review): relies on module-level globals defined elsewhere in the
    file: load_conll_data, utils, LabelEncoder, word_end, unknown_embedd,
    embedding_words, embedding_vectors, embedd_dim.

    :param train_path: path to the CoNLL-format training file
    :param dev_path: path to the CoNLL-format development file
    :param test_path: path to the CoNLL-format test file
    :param char_embedd_dim: dimensionality of the character embedding table
    :return: tuple of word/char/mask/label tensors for the three splits,
        the label and character alphabets, padding lengths, the character
        embedding table, num_labels, num_data (train size) and the
        concatenated feature dimensionality
    """
    # Raw word/POS/chunk/label sequences per split.
    word_sentences_train, pos_sentences_train, chunk_sentences_train, label_sentences_train = load_conll_data(train_path)
    word_sentences_dev, pos_sentences_dev, chunk_sentences_dev, label_sentences_dev = load_conll_data(dev_path)
    word_sentences_test, pos_sentences_test, chunk_sentences_test, label_sentences_test = load_conll_data(test_path)
    # Pad every split to the single longest sentence across all splits.
    max_length_train = utils.get_max_length(word_sentences_train)
    max_length_dev = utils.get_max_length(word_sentences_dev)
    max_length_test = utils.get_max_length(word_sentences_test)
    max_length = max(max_length_train, max_length_dev, max_length_test)
    # Each alphabet is built open (growing) on train, saved, then applied
    # closed to dev/test so unseen symbols cannot extend it.
    pos_sentences_id_train, alphabet_pos = utils.map_string_2_id_open(pos_sentences_train, 'pos')
    alphabet_pos.save('pre-trained-model/ner', name='alphabet_pos')
    pos_sentences_id_dev = utils.map_string_2_id_close(pos_sentences_dev, alphabet_pos)
    pos_sentences_id_test = utils.map_string_2_id_close(pos_sentences_test, alphabet_pos)
    chunk_sentences_id_train, alphabet_chunk = utils.map_string_2_id_open(chunk_sentences_train, 'chunk')
    alphabet_chunk.save('pre-trained-model/ner', name='alphabet_chunk')
    chunk_sentences_id_dev = utils.map_string_2_id_close(chunk_sentences_dev, alphabet_chunk)
    chunk_sentences_id_test = utils.map_string_2_id_close(chunk_sentences_test, alphabet_chunk)
    label_sentences_id_train, alphabet_label = utils.map_string_2_id_open(label_sentences_train, 'ner')
    alphabet_label.save('pre-trained-model/ner', name='alphabet_label')
    label_sentences_id_dev = utils.map_string_2_id_close(label_sentences_dev, alphabet_label)
    label_sentences_id_test = utils.map_string_2_id_close(label_sentences_test, alphabet_label)
    # Word-level embedding tensors plus per-token masks and label tensors.
    word_train, label_train, mask_train = \
        utils.construct_tensor_word(word_sentences_train,
                                    label_sentences_id_train,
                                    unknown_embedd, embedding_words,
                                    embedding_vectors, embedd_dim,
                                    max_length)
    word_dev, label_dev, mask_dev = \
        utils.construct_tensor_word(word_sentences_dev,
                                    label_sentences_id_dev,
                                    unknown_embedd, embedding_words,
                                    embedding_vectors, embedd_dim,
                                    max_length)
    word_test, label_test, mask_test = \
        utils.construct_tensor_word(word_sentences_test,
                                    label_sentences_id_test,
                                    unknown_embedd, embedding_words,
                                    embedding_vectors, embedd_dim,
                                    max_length)
    # One-hot POS and chunk features, appended along the feature (last)
    # axis of the word tensors.
    pos_train = utils.construct_tensor_onehot(pos_sentences_id_train, max_length, alphabet_pos.size())
    pos_dev = utils.construct_tensor_onehot(pos_sentences_id_dev, max_length, alphabet_pos.size())
    pos_test = utils.construct_tensor_onehot(pos_sentences_id_test, max_length, alphabet_pos.size())
    chunk_train = utils.construct_tensor_onehot(chunk_sentences_id_train, max_length, alphabet_chunk.size())
    chunk_dev = utils.construct_tensor_onehot(chunk_sentences_id_dev, max_length, alphabet_chunk.size())
    chunk_test = utils.construct_tensor_onehot(chunk_sentences_id_test, max_length, alphabet_chunk.size())
    word_train = np.concatenate((word_train, pos_train), axis=2)
    word_train = np.concatenate((word_train, chunk_train), axis=2)
    word_dev = np.concatenate((word_dev, pos_dev), axis=2)
    word_dev = np.concatenate((word_dev, chunk_dev), axis=2)
    word_test = np.concatenate((word_test, pos_test), axis=2)
    word_test = np.concatenate((word_test, chunk_test), axis=2)
    # Character alphabet: seeded with the word-end marker, populated from
    # the training words only, then closed BEFORE indexing dev/test so
    # unseen characters map to the unknown entry.
    alphabet_char = LabelEncoder('char')
    alphabet_char.get_index(word_end)
    index_sentences_train, max_char_length_train = utils.get_character_indexes(word_sentences_train, alphabet_char)
    alphabet_char.close()
    char_embedd_table = utils.build_char_embedd_table(char_embedd_dim, alphabet_char)
    alphabet_char.save('pre-trained-model/ner', name='alphabet_char')
    index_sentences_dev, max_char_length_dev = utils.get_character_indexes(word_sentences_dev, alphabet_char)
    index_sentences_test, max_char_length_test = utils.get_character_indexes(word_sentences_test, alphabet_char)
    # Pad characters to the longest word across all splits.
    max_char_length = max(max_char_length_train, max_char_length_dev, max_char_length_test)
    char_train = utils.construct_tensor_char(index_sentences_train, max_length, max_char_length, alphabet_char)
    char_dev = utils.construct_tensor_char(index_sentences_dev, max_length, max_char_length, alphabet_char)
    char_test = utils.construct_tensor_char(index_sentences_test, max_length, max_char_length, alphabet_char)
    # -1: presumably excludes a reserved padding/default label — confirm
    # against the LabelEncoder implementation.
    num_labels = alphabet_label.size() - 1
    # Feature axis now holds word embedding + POS one-hot + chunk one-hot.
    num_data, _, embedd_dim_concat = word_train.shape
    return word_train, word_dev, word_test, char_train, char_dev, char_test, mask_train, mask_dev, mask_test, \
        label_train, label_dev, label_test, alphabet_label, alphabet_char, max_length, max_char_length, \
        char_embedd_table, num_labels, num_data, embedd_dim_concat