Example #1
0
def load_data_ner(sents, pos_sents, chunk_sents, unknown_embedd, embedd_words, embedd_vectors, embedd_dim,
                  max_sent_length, max_char_length, alphabet_char, alphabet_pos, alphabet_chunk):
    """Assemble padded NER input tensors from tokenized sentences.

    Word-embedding features are concatenated with one-hot POS and chunk
    features along axis 2; character indexes go into a separate tensor.

    Returns:
        (words, masks, chars): concatenated word-feature tensor, padding
        mask, and character-index tensor.
    """
    # Word-embedding tensor plus its padding mask.
    words, masks = utils.construct_tensor_word(
        sents, unknown_embedd, embedd_words, embedd_vectors, embedd_dim, max_sent_length)

    # POS and chunk tags -> ids via closed alphabets, then one-hot encoded.
    pos_ids = utils.map_string_2_id_close(pos_sents, alphabet_pos)
    pos_onehot = utils.construct_tensor_onehot(pos_ids, max_sent_length, alphabet_pos.size())
    chunk_ids = utils.map_string_2_id_close(chunk_sents, alphabet_chunk)
    chunk_onehot = utils.construct_tensor_onehot(chunk_ids, max_sent_length, alphabet_chunk.size())

    # Single three-way concatenation is equivalent to two pairwise ones.
    words = np.concatenate((words, pos_onehot, chunk_onehot), axis=2)

    # Character-level indexes and their padded tensor.
    char_ids = utils.get_character_indexes(sents, alphabet_char, max_char_length)
    chars = utils.construct_tensor_char(char_ids, max_sent_length, max_char_length, alphabet_char)
    return words, masks, chars
Example #2
0
File: chunk.py  Project: vutrinhhuu/NNVLP
def create_data_2_train(train_path, dev_path, test_path, char_embedd_dim):
    """Load CoNLL chunking data and build all tensors needed for training.

    Builds POS/label/char alphabets on the training split (each is saved
    under ``pre-trained-model/chunk``), then maps all three splits through
    them and assembles padded word-embedding, one-hot-POS, label, mask and
    character-index tensors.

    NOTE(review): relies on module-level globals defined elsewhere in this
    file (``unknown_embedd``, ``embedding_words``, ``embedding_vectors``,
    ``embedd_dim``, ``word_end``) — confirm they are initialized before
    this is called.
    """
    sents_train, pos_raw_train, labels_raw_train = load_conll_data(train_path)
    sents_dev, pos_raw_dev, labels_raw_dev = load_conll_data(dev_path)
    sents_test, pos_raw_test, labels_raw_test = load_conll_data(test_path)

    # Pad every split to the longest sentence found in any split.
    max_length = max(utils.get_max_length(sents_train),
                     utils.get_max_length(sents_dev),
                     utils.get_max_length(sents_test))

    # POS alphabet: built open on train, applied closed to dev/test.
    pos_ids_train, alphabet_pos = utils.map_string_2_id_open(pos_raw_train, 'pos')
    alphabet_pos.save('pre-trained-model/chunk', name='alphabet_pos')
    pos_ids_dev = utils.map_string_2_id_close(pos_raw_dev, alphabet_pos)
    pos_ids_test = utils.map_string_2_id_close(pos_raw_test, alphabet_pos)

    # Chunk-label alphabet: same open-on-train / closed-on-eval protocol.
    label_ids_train, alphabet_label = utils.map_string_2_id_open(labels_raw_train, 'chunk')
    alphabet_label.save('pre-trained-model/chunk', name='alphabet_label')
    label_ids_dev = utils.map_string_2_id_close(labels_raw_dev, alphabet_label)
    label_ids_test = utils.map_string_2_id_close(labels_raw_test, alphabet_label)

    # Word-embedding tensors with padded labels and padding masks.
    word_train, label_train, mask_train = utils.construct_tensor_word(
        sents_train, label_ids_train, unknown_embedd, embedding_words,
        embedding_vectors, embedd_dim, max_length)
    word_dev, label_dev, mask_dev = utils.construct_tensor_word(
        sents_dev, label_ids_dev, unknown_embedd, embedding_words,
        embedding_vectors, embedd_dim, max_length)
    word_test, label_test, mask_test = utils.construct_tensor_word(
        sents_test, label_ids_test, unknown_embedd, embedding_words,
        embedding_vectors, embedd_dim, max_length)

    # One-hot POS features, appended to the word embeddings on axis 2.
    pos_train = utils.construct_tensor_onehot(pos_ids_train, max_length, alphabet_pos.size())
    pos_dev = utils.construct_tensor_onehot(pos_ids_dev, max_length, alphabet_pos.size())
    pos_test = utils.construct_tensor_onehot(pos_ids_test, max_length, alphabet_pos.size())
    word_train = np.concatenate((word_train, pos_train), axis=2)
    word_dev = np.concatenate((word_dev, pos_dev), axis=2)
    word_test = np.concatenate((word_test, pos_test), axis=2)

    # Character alphabet: seeded with the word-end marker, grown on train,
    # then frozen before indexing dev/test (unseen chars map per the
    # closed-alphabet behavior of utils.get_character_indexes).
    alphabet_char = LabelEncoder('char')
    alphabet_char.get_index(word_end)
    char_ids_train, max_chars_train = utils.get_character_indexes(sents_train, alphabet_char)
    alphabet_char.close()
    char_embedd_table = utils.build_char_embedd_table(char_embedd_dim, alphabet_char)
    alphabet_char.save('pre-trained-model/chunk', name='alphabet_char')
    char_ids_dev, max_chars_dev = utils.get_character_indexes(sents_dev, alphabet_char)
    char_ids_test, max_chars_test = utils.get_character_indexes(sents_test, alphabet_char)

    max_char_length = max(max_chars_train, max_chars_dev, max_chars_test)
    char_train = utils.construct_tensor_char(char_ids_train, max_length, max_char_length, alphabet_char)
    char_dev = utils.construct_tensor_char(char_ids_dev, max_length, max_char_length, alphabet_char)
    char_test = utils.construct_tensor_char(char_ids_test, max_length, max_char_length, alphabet_char)

    # NOTE(review): the -1 presumably drops a reserved/default alphabet
    # entry — confirm against the alphabet implementation.
    num_labels = alphabet_label.size() - 1
    num_data, _, embedd_dim_concat = word_train.shape
    return word_train, word_dev, word_test, char_train, char_dev, char_test, \
           mask_train, mask_dev, mask_test, label_train, label_dev, label_test, \
           alphabet_label, alphabet_char, max_length, max_char_length, \
           char_embedd_table, num_labels, num_data, embedd_dim_concat