def generate_dataset_not_fine_tune():
    """Build dense input tensors when word embeddings are NOT fine-tuned.

    Words are looked up in the pre-trained embedding dictionary up front,
    so X already holds embedding vectors and no trainable embedding table
    is returned (the corresponding slot in the return tuple is None).

    :return: X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev,
        X_test, Y_test, mask_test, None, label_alphabet,
        C_train, C_dev, C_test, char_embedd_table
    """
    embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict(
        embedding, embedding_path, word_alphabet, logger)
    logger.info("Dimension of embedding is %d, Caseless: %s" % (embedd_dim, caseless))

    # fill data tensor (X.shape = [#data, max_length, embedding_dim],
    # Y.shape = [#data, max_length]); one shared random vector stands in
    # for every out-of-vocabulary word.
    unknown_embedd = np.random.uniform(-0.01, 0.01, [1, embedd_dim])

    # Build (X, Y, mask) for each of the three splits with identical settings.
    split_inputs = [
        (word_sentences_train, label_index_sentences_train),
        (word_sentences_dev, label_index_sentences_dev),
        (word_sentences_test, label_index_sentences_test),
    ]
    split_tensors = [
        construct_tensor_not_fine_tune(sentences, labels, unknown_embedd,
                                       embedd_dict, embedd_dim, caseless)
        for sentences, labels in split_inputs
    ]
    (X_train, Y_train, mask_train), (X_dev, Y_dev, mask_dev), \
        (X_test, Y_test, mask_test) = split_tensors

    # Character-level tensors are only produced when character features
    # are enabled; otherwise the caller receives None placeholders.
    if use_character:
        C_train, C_dev, C_test, char_embedd_table = generate_character_data(
            word_sentences_train, word_sentences_dev, word_sentences_test,
            max_length)
    else:
        C_train = C_dev = C_test = char_embedd_table = None

    return X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, \
        X_test, Y_test, mask_test, None, label_alphabet, \
        C_train, C_dev, C_test, char_embedd_table
def generate_dataset_not_fine_tune():
    """Build dense input tensors when word embeddings are NOT fine-tuned.

    NOTE(review): this is a duplicate definition — an equivalent
    ``generate_dataset_not_fine_tune`` appears earlier in this file, and
    this later definition shadows it. Confirm which copy is intended and
    remove the other.

    :return: X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev,
        X_test, Y_test, mask_test, None (no trainable embedding table),
        label_alphabet, C_train, C_dev, C_test, char_embedd_table
    """
    embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict(embedding, embedding_path,
                                                                       word_alphabet, logger)
    logger.info("Dimension of embedding is %d, Caseless: %s" % (embedd_dim, caseless))
    # fill data tensor (X.shape = [#data, max_length, embedding_dim], Y.shape = [#data, max_length])
    # One shared random vector stands in for every out-of-vocabulary word.
    unknown_embedd = np.random.uniform(-0.01, 0.01, [1, embedd_dim])
    X_train, Y_train, mask_train = construct_tensor_not_fine_tune(word_sentences_train,
                                                                  label_index_sentences_train,
                                                                  unknown_embedd, embedd_dict,
                                                                  embedd_dim, caseless)
    X_dev, Y_dev, mask_dev = construct_tensor_not_fine_tune(word_sentences_dev,
                                                            label_index_sentences_dev,
                                                            unknown_embedd, embedd_dict,
                                                            embedd_dim, caseless)
    X_test, Y_test, mask_test = construct_tensor_not_fine_tune(word_sentences_test,
                                                               label_index_sentences_test,
                                                               unknown_embedd, embedd_dict,
                                                               embedd_dim, caseless)
    # Character-level tensors only when character features are enabled;
    # otherwise None placeholders are returned in their slots.
    C_train, C_dev, C_test, char_embedd_table = generate_character_data(word_sentences_train,
                                                                        word_sentences_dev,
                                                                        word_sentences_test,
                                                                        max_length) if use_character else (
        None, None, None, None)
    return X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, \
        None, label_alphabet, C_train, C_dev, C_test, char_embedd_table
def generate_dataset_fine_tune():
    """Build index-based data tensors when word embeddings ARE fine-tuned.

    X holds word indices (not dense vectors); the trainable embedding
    table built from the pre-trained dictionary is returned for the
    network's embedding layer.

    :return: X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev,
        X_test, Y_test, mask_test, embedd_table, label_alphabet,
        C_train, C_dev, C_test, char_embedd_table
    """
    embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict(
        embedding, embedding_path, word_alphabet, logger)
    # NOTE(review): the orthographic embedding and the X_*_orth / orth_C_*
    # tensors computed below are never returned or otherwise used by this
    # function — confirm whether they should join the return tuple or be
    # removed (dead work as written).
    orth_word_embedding, orth_embedd_dim = utils.load_ortho_word_embedding_dict(
        word_orth_alphabet, logger)
    logger.info("Dimension of embedding is %d, Caseless: %d" % (embedd_dim, caseless))
    # fill data tensor (X.shape = [#data, max_length], Y.shape = [#data, max_length])
    X_train, Y_train, mask_train = construct_tensor_fine_tune(
        word_index_sentences_train, label_index_sentences_train)
    X_train_orth = construct_orth_tensor_fine_tune(
        orth_index_sentences_train)
    X_dev, Y_dev, mask_dev = construct_tensor_fine_tune(
        word_index_sentences_dev, label_index_sentences_dev)
    X_dev_orth = construct_orth_tensor_fine_tune(orth_index_sentences_dev)
    X_test, Y_test, mask_test = construct_tensor_fine_tune(
        word_index_sentences_test, label_index_sentences_test)
    X_test_orth = construct_orth_tensor_fine_tune(
        orth_index_sentences_test)
    # Character-level tensors only when character features are enabled.
    C_train, C_dev, C_test, char_embedd_table = generate_character_data(
        word_sentences_train, word_sentences_dev, word_sentences_test,
        max_length) if use_character else (None, None, None, None)
    # NOTE(review): orthographic character data is also computed but unused.
    orth_C_train, orth_C_dev, orth_C_test, orth_char_embedd_table = generate_character_data(
        orth_sentences_train, orth_sentences_dev, orth_sentences_test,
        max_length) if use_character else (None, None, None, None)
    return X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, \
        build_embedd_table(word_alphabet, embedd_dict, embedd_dim, caseless), label_alphabet, \
        C_train, C_dev, C_test, char_embedd_table
def generate_dataset_fine_tune():
    """Build index-based data tensors when word embeddings ARE fine-tuned.

    X holds word indices (not dense vectors); the trainable embedding
    table built from the pre-trained dictionary is returned for the
    network's embedding layer.

    :return: X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev,
        X_test, Y_test, mask_test, embedd_table, label_alphabet,
        C_train, C_dev, C_test, char_embedd_table
    """
    embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict(
        embedding, embedding_path, word_alphabet, logger)
    logger.info("Dimension of embedding is %d, Caseless: %d" % (embedd_dim, caseless))
    # fill data tensor (X.shape = [#data, max_length], Y.shape = [#data, max_length])
    X_train, Y_train, mask_train = construct_tensor_fine_tune(
        word_index_sentences_train, label_index_sentences_train)
    X_dev, Y_dev, mask_dev = construct_tensor_fine_tune(
        word_index_sentences_dev, label_index_sentences_dev)
    X_test, Y_test, mask_test = construct_tensor_fine_tune(
        word_index_sentences_test, label_index_sentences_test)
    # Character-level tensors only when character features are enabled.
    if use_character:
        C_train, C_dev, C_test, char_embedd_table = generate_character_data(
            word_sentences_train, word_sentences_dev, word_sentences_test,
            max_length)
    else:
        C_train, C_dev, C_test, char_embedd_table = None, None, None, None
    # BUG FIX: build_embedd_table was previously called without the leading
    # word_alphabet argument; every other call site in this file passes
    # (word_alphabet, embedd_dict, embedd_dim, caseless).
    return X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, \
        build_embedd_table(word_alphabet, embedd_dict, embedd_dim, caseless), label_alphabet, \
        C_train, C_dev, C_test, char_embedd_table
def load_dataset_parsing(train_path, dev_path, test_path, word_column=1, pos_column=4,
                         head_column=6, type_column=7, embedding="word2Vec",
                         embedding_path=None):
    """
    load data from file
    :param train_path: path of training file
    :param dev_path: path of dev file
    :param test_path: path of test file
    :param word_column: the column index of word (start from 0)
    :param pos_column: the column index of pos (start from 0)
    :param head_column: the column index of head (start from 0)
    :param type_column: the column index of types (start from 0)
    :param embedding: embeddings for words, choose from ['word2vec', 'senna'].
    :param embedding_path: path of file storing word embeddings.
    :return: X_train, POS_train, Head_train, Type_train, mask_train,
             X_dev, POS_dev, Head_dev, Type_dev, mask_dev,
             X_test, POS_test, Head_test, Type_test, mask_test,
             embedd_table, word_alphabet, pos_alphabet, type_alphabet,
             C_train, C_dev, C_test, char_embedd_table
    """

    def construct_tensor(word_index_sentences, pos_index_sentences, head_sentences,
                         type_index_sentences):
        # One padded row per sentence; mask marks the valid prefix.
        num_data = len(word_index_sentences)
        X = np.empty([num_data, max_length], dtype=np.int32)
        POS = np.empty([num_data, max_length], dtype=np.int32)
        Head = np.empty([num_data, max_length], dtype=np.int32)
        Type = np.empty([num_data, max_length], dtype=np.int32)
        mask = np.zeros([num_data, max_length], dtype=theano.config.floatX)

        for i in range(num_data):
            word_ids = word_index_sentences[i]
            pos_ids = pos_index_sentences[i]
            heads = head_sentences[i]
            type_ids = type_index_sentences[i]
            # BUG FIX: clamp to the tensor width. max_length is capped at
            # MAX_LENGTH, so a sentence longer than MAX_LENGTH previously
            # indexed past the tensor and raised IndexError; it is now
            # truncated instead.
            length = min(len(word_ids), max_length)
            for j in range(length):
                X[i, j] = word_ids[j]
                # Alphabet indices start at 1; POS/Type are shifted to 0-based.
                POS[i, j] = pos_ids[j] - 1
                Head[i, j] = heads[j]
                Type[i, j] = type_ids[j] - 1
            # Zero out X after the end of the sequence
            X[i, length:] = 0
            # Copy the last label after the end of the sequence
            POS[i, length:] = POS[i, length - 1]
            Head[i, length:] = Head[i, length - 1]
            Type[i, length:] = Type[i, length - 1]
            # Make the mask for this sample 1 within the range of length
            mask[i, :length] = 1
        return X, POS, Head, Type, mask

    word_alphabet = Alphabet('word')
    pos_alphabet = Alphabet('pos')
    type_alphabet = Alphabet('type')

    # read training data
    logger.info("Reading data from training set...")
    word_sentences_train, pos_sentences_train, head_sentences_train, type_sentence_train, \
        word_index_sentences_train, pos_index_sentences_train, \
        type_index_sentences_train = read_conll_parsing(train_path, word_alphabet, pos_alphabet,
                                                        type_alphabet, word_column, pos_column,
                                                        head_column, type_column)

    # read dev data
    logger.info("Reading data from dev set...")
    word_sentences_dev, pos_sentences_dev, head_sentences_dev, type_sentence_dev, \
        word_index_sentences_dev, pos_index_sentences_dev, \
        type_index_sentences_dev = read_conll_parsing(dev_path, word_alphabet, pos_alphabet,
                                                      type_alphabet, word_column, pos_column,
                                                      head_column, type_column)

    # read test data
    logger.info("Reading data from test set...")
    word_sentences_test, pos_sentences_test, head_sentences_test, type_sentence_test, \
        word_index_sentences_test, pos_index_sentences_test, \
        type_index_sentences_test = read_conll_parsing(test_path, word_alphabet, pos_alphabet,
                                                       type_alphabet, word_column, pos_column,
                                                       head_column, type_column)

    # close alphabets so later lookups cannot grow them further
    word_alphabet.close()
    pos_alphabet.close()
    type_alphabet.close()

    logger.info("word alphabet size: %d" % (word_alphabet.size() - 1))
    logger.info("pos alphabet size: %d" % (pos_alphabet.size() - 1))
    logger.info("type alphabet size: %d" % (type_alphabet.size() - 1))

    # get maximum length, capped at MAX_LENGTH
    max_length_train = get_max_length(word_sentences_train)
    max_length_dev = get_max_length(word_sentences_dev)
    max_length_test = get_max_length(word_sentences_test)
    max_length = min(MAX_LENGTH, max(max_length_train, max_length_dev, max_length_test))
    logger.info("Maximum length of training set is %d" % max_length_train)
    logger.info("Maximum length of dev set is %d" % max_length_dev)
    logger.info("Maximum length of test set is %d" % max_length_test)
    logger.info("Maximum length used for training is %d" % max_length)

    embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict(
        embedding, embedding_path, word_alphabet, logger)
    logger.info("Dimension of embedding is %d, Caseless: %d" % (embedd_dim, caseless))

    # fill data tensor (X.shape = [#data, max_length], {POS, Head, Type}.shape = [#data, max_length])
    X_train, POS_train, Head_train, Type_train, mask_train = construct_tensor(
        word_index_sentences_train, pos_index_sentences_train, head_sentences_train,
        type_index_sentences_train)
    X_dev, POS_dev, Head_dev, Type_dev, mask_dev = construct_tensor(
        word_index_sentences_dev, pos_index_sentences_dev, head_sentences_dev,
        type_index_sentences_dev)
    X_test, POS_test, Head_test, Type_test, mask_test = construct_tensor(
        word_index_sentences_test, pos_index_sentences_test, head_sentences_test,
        type_index_sentences_test)

    embedd_table = build_embedd_table(word_alphabet, embedd_dict, embedd_dim, caseless)
    C_train, C_dev, C_test, char_embedd_table = generate_character_data(
        word_sentences_train, word_sentences_dev, word_sentences_test, max_length)

    return X_train, POS_train, Head_train, Type_train, mask_train, \
        X_dev, POS_dev, Head_dev, Type_dev, mask_dev, \
        X_test, POS_test, Head_test, Type_test, mask_test, \
        embedd_table, word_alphabet, pos_alphabet, type_alphabet, \
        C_train, C_dev, C_test, char_embedd_table
def load_dataset_parsing(train_path, dev_path, test_path, word_column=1, pos_column=4,
                         head_column=6, type_column=7, embedding="word2Vec",
                         embedding_path=None):
    """
    load data from file

    NOTE(review): this is a duplicate definition — an equivalent
    ``load_dataset_parsing`` appears earlier in this file, and this later
    definition shadows it. Confirm which copy is intended and remove the
    other.

    :param train_path: path of training file
    :param dev_path: path of dev file
    :param test_path: path of test file
    :param word_column: the column index of word (start from 0)
    :param pos_column: the column index of pos (start from 0)
    :param head_column: the column index of head (start from 0)
    :param type_column: the column index of types (start from 0)
    :param embedding: embeddings for words, choose from ['word2vec', 'senna'].
    :param embedding_path: path of file storing word embeddings.
    :return: X_train, POS_train, Head_train, Type_train, mask_train,
             X_dev, POS_dev, Head_dev, Type_dev, mask_dev,
             X_test, POS_test, Head_test, Type_test, mask_test,
             embedd_table, word_alphabet, pos_alphabet, type_alphabet,
             C_train, C_dev, C_test, char_embedd_table
    """

    def construct_tensor(word_index_sentences, pos_index_sentences, head_sentences,
                         type_index_sentences):
        # One padded row per sentence; mask marks the valid prefix.
        X = np.empty([len(word_index_sentences), max_length], dtype=np.int32)
        POS = np.empty([len(word_index_sentences), max_length], dtype=np.int32)
        Head = np.empty([len(word_index_sentences), max_length], dtype=np.int32)
        Type = np.empty([len(word_index_sentences), max_length], dtype=np.int32)
        mask = np.zeros([len(word_index_sentences), max_length], dtype=theano.config.floatX)
        for i in range(len(word_index_sentences)):
            word_ids = word_index_sentences[i]
            pos_ids = pos_index_sentences[i]
            heads = head_sentences[i]
            type_ids = type_index_sentences[i]
            # NOTE(review): length is not clamped to max_length, while
            # max_length is capped at MAX_LENGTH — a sentence longer than
            # MAX_LENGTH would index past the tensor width below. Confirm
            # inputs are guaranteed to fit, or truncate here.
            length = len(word_ids)
            for j in range(length):
                wid = word_ids[j]
                pid = pos_ids[j]
                head = heads[j]
                tid = type_ids[j]
                X[i, j] = wid
                # Alphabet indices start at 1; POS/Type shifted to 0-based.
                POS[i, j] = pid - 1
                Head[i, j] = head
                Type[i, j] = tid - 1
            # Zero out X after the end of the sequence
            X[i, length:] = 0
            # Copy the last label after the end of the sequence
            POS[i, length:] = POS[i, length - 1]
            Head[i, length:] = Head[i, length - 1]
            Type[i, length:] = Type[i, length - 1]
            # Make the mask for this sample 1 within the range of length
            mask[i, :length] = 1
        return X, POS, Head, Type, mask

    word_alphabet = Alphabet('word')
    pos_alphabet = Alphabet('pos')
    type_alphabet = Alphabet('type')

    # read training data
    logger.info("Reading data from training set...")
    word_sentences_train, pos_sentences_train, head_sentences_train, type_sentence_train, \
        word_index_sentences_train, pos_index_sentences_train, \
        type_index_sentences_train = read_conll_parsing(train_path, word_alphabet, pos_alphabet,
                                                        type_alphabet, word_column, pos_column,
                                                        head_column, type_column)

    # read dev data
    logger.info("Reading data from dev set...")
    word_sentences_dev, pos_sentences_dev, head_sentences_dev, type_sentence_dev, \
        word_index_sentences_dev, pos_index_sentences_dev, \
        type_index_sentences_dev = read_conll_parsing(dev_path, word_alphabet, pos_alphabet,
                                                      type_alphabet, word_column, pos_column,
                                                      head_column, type_column)

    # read test data
    logger.info("Reading data from test set...")
    word_sentences_test, pos_sentences_test, head_sentences_test, type_sentence_test, \
        word_index_sentences_test, pos_index_sentences_test, \
        type_index_sentences_test = read_conll_parsing(test_path, word_alphabet, pos_alphabet,
                                                       type_alphabet, word_column, pos_column,
                                                       head_column, type_column)

    # close alphabets so later lookups cannot grow them further
    word_alphabet.close()
    pos_alphabet.close()
    type_alphabet.close()

    logger.info("word alphabet size: %d" % (word_alphabet.size() - 1))
    logger.info("pos alphabet size: %d" % (pos_alphabet.size() - 1))
    logger.info("type alphabet size: %d" % (type_alphabet.size() - 1))

    # get maximum length, capped at MAX_LENGTH
    max_length_train = get_max_length(word_sentences_train)
    max_length_dev = get_max_length(word_sentences_dev)
    max_length_test = get_max_length(word_sentences_test)
    max_length = min(MAX_LENGTH, max(max_length_train, max_length_dev, max_length_test))
    logger.info("Maximum length of training set is %d" % max_length_train)
    logger.info("Maximum length of dev set is %d" % max_length_dev)
    logger.info("Maximum length of test set is %d" % max_length_test)
    logger.info("Maximum length used for training is %d" % max_length)

    embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict(embedding, embedding_path,
                                                                       word_alphabet, logger)
    logger.info("Dimension of embedding is %d, Caseless: %d" % (embedd_dim, caseless))

    # fill data tensor (X.shape = [#data, max_length], {POS, Head, Type}.shape = [#data, max_length])
    X_train, POS_train, Head_train, Type_train, mask_train = construct_tensor(word_index_sentences_train,
                                                                              pos_index_sentences_train,
                                                                              head_sentences_train,
                                                                              type_index_sentences_train)
    X_dev, POS_dev, Head_dev, Type_dev, mask_dev = construct_tensor(word_index_sentences_dev,
                                                                    pos_index_sentences_dev,
                                                                    head_sentences_dev,
                                                                    type_index_sentences_dev)
    X_test, POS_test, Head_test, Type_test, mask_test = construct_tensor(word_index_sentences_test,
                                                                         pos_index_sentences_test,
                                                                         head_sentences_test,
                                                                         type_index_sentences_test)
    embedd_table = build_embedd_table(word_alphabet, embedd_dict, embedd_dim, caseless)
    C_train, C_dev, C_test, char_embedd_table = generate_character_data(word_sentences_train,
                                                                        word_sentences_dev,
                                                                        word_sentences_test,
                                                                        max_length)
    return X_train, POS_train, Head_train, Type_train, mask_train, \
        X_dev, POS_dev, Head_dev, Type_dev, mask_dev, \
        X_test, POS_test, Head_test, Type_test, mask_test, \
        embedd_table, word_alphabet, pos_alphabet, type_alphabet, \
        C_train, C_dev, C_test, char_embedd_table