import numpy as np

import utils


def load_data_ner(sents, pos_sents, chunk_sents, unknown_embedd, embedd_words,
                  embedd_vectors, embedd_dim, max_sent_length, max_char_length,
                  alphabet_char, alphabet_pos, alphabet_chunk):
    # Word-level embedding tensor plus a mask marking real tokens vs. padding.
    words, masks = utils.construct_tensor_word(sents, unknown_embedd, embedd_words,
                                               embedd_vectors, embedd_dim, max_sent_length)
    # Map POS and chunk tag strings to ids via the frozen alphabets, then one-hot encode.
    index_poss = utils.map_string_2_id_close(pos_sents, alphabet_pos)
    poss = utils.construct_tensor_onehot(index_poss, max_sent_length, alphabet_pos.size())
    index_chunks = utils.map_string_2_id_close(chunk_sents, alphabet_chunk)
    chunks = utils.construct_tensor_onehot(index_chunks, max_sent_length,
                                           alphabet_chunk.size())
    # Concatenate the one-hot POS and chunk features onto the word embeddings
    # along the feature axis.
    words = np.concatenate((words, poss), axis=2)
    words = np.concatenate((words, chunks), axis=2)
    # Character-level indexes, padded to max_char_length per word.
    index_chars = utils.get_character_indexes(sents, alphabet_char, max_char_length)
    chars = utils.construct_tensor_char(index_chars, max_sent_length, max_char_length,
                                        alphabet_char)
    return words, masks, chars
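# --- Illustration (not part of the pipeline) ----------------------------------
# A minimal, self-contained sketch of the feature scheme load_data_ner builds:
# word embeddings concatenated with one-hot tag vectors along axis 2. The
# shapes, names, and values below are toy placeholders for the demo only; they
# are not taken from utils.
def _demo_onehot_concat():
    n_sents, max_len, emb_dim, n_pos = 2, 5, 4, 3
    rng = np.random.RandomState(0)
    word_embedd = rng.rand(n_sents, max_len, emb_dim).astype('float32')
    pos_ids = rng.randint(0, n_pos, size=(n_sents, max_len))
    # One-hot encode the ids: (n_sents, max_len) -> (n_sents, max_len, n_pos).
    pos_onehot = np.eye(n_pos, dtype='float32')[pos_ids]
    features = np.concatenate((word_embedd, pos_onehot), axis=2)
    assert features.shape == (n_sents, max_len, emb_dim + n_pos)
    return features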
def create_data_2_train(train_path, dev_path, test_path, char_embedd_dim):
    # NOTE: load_conll_data, LabelEncoder, word_end and the embedding globals
    # (unknown_embedd, embedding_words, embedding_vectors, embedd_dim) are
    # assumed to be defined elsewhere in this module.
    word_sentences_train, pos_sentences_train, label_sentences_train = \
        load_conll_data(train_path)
    word_sentences_dev, pos_sentences_dev, label_sentences_dev = \
        load_conll_data(dev_path)
    word_sentences_test, pos_sentences_test, label_sentences_test = \
        load_conll_data(test_path)

    # Pad every split to the longest sentence across train/dev/test.
    max_length_train = utils.get_max_length(word_sentences_train)
    max_length_dev = utils.get_max_length(word_sentences_dev)
    max_length_test = utils.get_max_length(word_sentences_test)
    max_length = max(max_length_train, max_length_dev, max_length_test)

    # The POS alphabet is built open on train, saved, then frozen for dev/test.
    pos_sentences_id_train, alphabet_pos = utils.map_string_2_id_open(
        pos_sentences_train, 'pos')
    alphabet_pos.save('pre-trained-model/chunk', name='alphabet_pos')
    pos_sentences_id_dev = utils.map_string_2_id_close(pos_sentences_dev, alphabet_pos)
    pos_sentences_id_test = utils.map_string_2_id_close(pos_sentences_test, alphabet_pos)

    # Same procedure for the chunk label alphabet.
    label_sentences_id_train, alphabet_label = utils.map_string_2_id_open(
        label_sentences_train, 'chunk')
    alphabet_label.save('pre-trained-model/chunk', name='alphabet_label')
    label_sentences_id_dev = utils.map_string_2_id_close(
        label_sentences_dev, alphabet_label)
    label_sentences_id_test = utils.map_string_2_id_close(
        label_sentences_test, alphabet_label)

    # Word embedding tensors plus padded label tensors and masks for each split.
    # NOTE: unlike the label-free construct_tensor_word call in load_data_ner
    # above, this call also passes label ids and unpacks three values; the utils
    # module presumably provides both variants.
    word_train, label_train, mask_train = utils.construct_tensor_word(
        word_sentences_train, label_sentences_id_train, unknown_embedd,
        embedding_words, embedding_vectors, embedd_dim, max_length)
    word_dev, label_dev, mask_dev = utils.construct_tensor_word(
        word_sentences_dev, label_sentences_id_dev, unknown_embedd,
        embedding_words, embedding_vectors, embedd_dim, max_length)
    word_test, label_test, mask_test = utils.construct_tensor_word(
        word_sentences_test, label_sentences_id_test, unknown_embedd,
        embedding_words, embedding_vectors, embedd_dim, max_length)

    # One-hot POS features, concatenated onto the word embeddings.
    pos_train = utils.construct_tensor_onehot(pos_sentences_id_train, max_length,
                                              alphabet_pos.size())
    pos_dev = utils.construct_tensor_onehot(pos_sentences_id_dev, max_length,
                                            alphabet_pos.size())
    pos_test = utils.construct_tensor_onehot(pos_sentences_id_test, max_length,
                                             alphabet_pos.size())
    word_train = np.concatenate((word_train, pos_train), axis=2)
    word_dev = np.concatenate((word_dev, pos_dev), axis=2)
    word_test = np.concatenate((word_test, pos_test), axis=2)

    # Character alphabet: built open on train (word_end serves as the padding
    # symbol), then closed before indexing dev/test so unseen characters map to
    # the unknown id.
    alphabet_char = LabelEncoder('char')
    alphabet_char.get_index(word_end)
    index_sentences_train, max_char_length_train = utils.get_character_indexes(
        word_sentences_train, alphabet_char)
    alphabet_char.close()
    char_embedd_table = utils.build_char_embedd_table(char_embedd_dim, alphabet_char)
    alphabet_char.save('pre-trained-model/chunk', name='alphabet_char')
    index_sentences_dev, max_char_length_dev = utils.get_character_indexes(
        word_sentences_dev, alphabet_char)
    index_sentences_test, max_char_length_test = utils.get_character_indexes(
        word_sentences_test, alphabet_char)
    max_char_length = max(max_char_length_train, max_char_length_dev,
                          max_char_length_test)

    char_train = utils.construct_tensor_char(index_sentences_train, max_length,
                                             max_char_length, alphabet_char)
    char_dev = utils.construct_tensor_char(index_sentences_dev, max_length,
                                           max_char_length, alphabet_char)
    char_test = utils.construct_tensor_char(index_sentences_test, max_length,
                                            max_char_length, alphabet_char)

    # The alphabet presumably reserves one default/padding entry, hence the -1.
    num_labels = alphabet_label.size() - 1
    num_data, _, embedd_dim_concat = word_train.shape
    return word_train, word_dev, word_test, char_train, char_dev, char_test, \
        mask_train, mask_dev, mask_test, label_train, label_dev, label_test, \
        alphabet_label, alphabet_char, max_length, max_char_length, \
        char_embedd_table, num_labels, num_data, embedd_dim_concat
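# --- Usage sketch (assumptions flagged) ----------------------------------------
# How create_data_2_train would typically be driven. The CoNLL file paths and
# the character embedding size below are placeholders, not values from this
# project; the unpacking order mirrors the return statement above.
if __name__ == '__main__':
    (word_train, word_dev, word_test, char_train, char_dev, char_test,
     mask_train, mask_dev, mask_test, label_train, label_dev, label_test,
     alphabet_label, alphabet_char, max_length, max_char_length,
     char_embedd_table, num_labels, num_data, embedd_dim_concat) = \
        create_data_2_train('data/train.txt', 'data/dev.txt', 'data/test.txt',
                            char_embedd_dim=30)
    # word_* : (n_sentences, max_length, embedd_dim_concat) float features
    # char_* : (n_sentences, max_length, max_char_length) int char ids
    # mask_* : (n_sentences, max_length), nonzero for real tokens, 0 for padding
    print(num_data, num_labels, embedd_dim_concat)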