Example #1
    def generate_dataset_not_fine_tune():
        """
        generate data tensor when not fine tuning
        :return: X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, None, label_size
        """

        embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict(
            embedding, embedding_path, word_alphabet, logger)
        logger.info("Dimension of embedding is %d, Caseless: %s" %
                    (embedd_dim, caseless))

        # fill data tensor (X.shape = [#data, max_length, embedding_dim], Y.shape = [#data, max_length])
        unknown_embedd = np.random.uniform(-0.01, 0.01, [1, embedd_dim])
        X_train, Y_train, mask_train = construct_tensor_not_fine_tune(
            word_sentences_train, label_index_sentences_train, unknown_embedd,
            embedd_dict, embedd_dim, caseless)
        X_dev, Y_dev, mask_dev = construct_tensor_not_fine_tune(
            word_sentences_dev, label_index_sentences_dev, unknown_embedd,
            embedd_dict, embedd_dim, caseless)
        X_test, Y_test, mask_test = construct_tensor_not_fine_tune(
            word_sentences_test, label_index_sentences_test, unknown_embedd,
            embedd_dict, embedd_dim, caseless)
        C_train, C_dev, C_test, char_embedd_table = generate_character_data(
            word_sentences_train, word_sentences_dev, word_sentences_test,
            max_length) if use_character else (None, None, None, None)

        return X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, \
               None, label_alphabet, C_train, C_dev, C_test, char_embedd_table
Example #2
    def generate_dataset_not_fine_tune():
        """
        generate data tensor when not fine tuning
        :return: X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, None, label_size
        """

        embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict(embedding, embedding_path, word_alphabet,
                                                                           logger)
        logger.info("Dimension of embedding is %d, Caseless: %s" % (embedd_dim, caseless))

        # fill data tensor (X.shape = [#data, max_length, embedding_dim], Y.shape = [#data, max_length])
        unknown_embedd = np.random.uniform(-0.01, 0.01, [1, embedd_dim])
        X_train, Y_train, mask_train = construct_tensor_not_fine_tune(word_sentences_train,
                                                                      label_index_sentences_train, unknown_embedd,
                                                                      embedd_dict, embedd_dim, caseless)
        X_dev, Y_dev, mask_dev = construct_tensor_not_fine_tune(word_sentences_dev, label_index_sentences_dev,
                                                                unknown_embedd, embedd_dict, embedd_dim, caseless)
        X_test, Y_test, mask_test = construct_tensor_not_fine_tune(word_sentences_test, label_index_sentences_test,
                                                                   unknown_embedd, embedd_dict, embedd_dim, caseless)
        C_train, C_dev, C_test, char_embedd_table = generate_character_data(
            word_sentences_train, word_sentences_dev, word_sentences_test,
            max_length) if use_character else (None, None, None, None)

        return X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, \
               None, label_alphabet, C_train, C_dev, C_test, char_embedd_table
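
Examples #1 and #2 are the same function with different line wrapping; both rely on a construct_tensor_not_fine_tune helper whose body is not shown here. Below is a minimal sketch of what such a helper could look like, given the shape comment in the code (X.shape = [#data, max_length, embedding_dim], Y.shape = [#data, max_length]). The max_length parameter, the OOV fallback to unknown_embedd, and the copy-last-label padding are assumptions for illustration, not the original implementation.

import numpy as np

def construct_tensor_not_fine_tune(word_sentences, label_index_sentences,
                                   unknown_embedd, embedd_dict, embedd_dim,
                                   caseless, max_length=100):
    # Hypothetical sketch: pre-trained vectors are packed directly into X,
    # so downstream code needs no trainable word embedding table.
    num_data = len(word_sentences)
    X = np.zeros([num_data, max_length, embedd_dim], dtype=np.float32)
    Y = np.zeros([num_data, max_length], dtype=np.int32)
    mask = np.zeros([num_data, max_length], dtype=np.float32)

    for i, (words, label_ids) in enumerate(zip(word_sentences,
                                               label_index_sentences)):
        length = min(len(words), max_length)
        for j in range(length):
            word = words[j].lower() if caseless else words[j]
            # Fall back to the shared "unknown" vector for OOV words.
            X[i, j, :] = embedd_dict.get(word, unknown_embedd)
            Y[i, j] = label_ids[j]
        # Copy the last label past the sentence end; mask marks real tokens.
        Y[i, length:] = Y[i, length - 1]
        mask[i, :length] = 1.0
    return X, Y, mask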
Example #3
    def generate_dataset_fine_tune():
        """
        generate data tensor when fine tuning
        :return: X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, embedd_table, label_size
        """

        embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict(
            embedding, embedding_path, word_alphabet, logger)
        orth_word_embedding, orth_embedd_dim = utils.load_ortho_word_embedding_dict(
            word_orth_alphabet, logger)
        logger.info("Dimension of embedding is %d, Caseless: %d" %
                    (embedd_dim, caseless))
        # fill data tensor (X.shape = [#data, max_length], Y.shape = [#data, max_length])
        X_train, Y_train, mask_train = construct_tensor_fine_tune(
            word_index_sentences_train, label_index_sentences_train)
        X_train_orth = construct_orth_tensor_fine_tune(
            orth_index_sentences_train)
        X_dev, Y_dev, mask_dev = construct_tensor_fine_tune(
            word_index_sentences_dev, label_index_sentences_dev)
        X_dev_orth = construct_orth_tensor_fine_tune(orth_index_sentences_dev)
        X_test, Y_test, mask_test = construct_tensor_fine_tune(
            word_index_sentences_test, label_index_sentences_test)
        X_test_orth = construct_orth_tensor_fine_tune(
            orth_index_sentences_test)
        C_train, C_dev, C_test, char_embedd_table = generate_character_data(
            word_sentences_train, word_sentences_dev, word_sentences_test,
            max_length) if use_character else (None, None, None, None)
        orth_C_train, orth_C_dev, orth_C_test, orth_char_embedd_table = generate_character_data(
            orth_sentences_train, orth_sentences_dev, orth_sentences_test,
            max_length) if use_character else (None, None, None, None)
        return X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, \
               build_embedd_table(word_alphabet, embedd_dict, embedd_dim, caseless), label_alphabet, \
               C_train, C_dev, C_test, char_embedd_table
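
The construct_tensor_fine_tune helper used above is likewise not shown. A minimal sketch consistent with the shape comment (X.shape = [#data, max_length], Y.shape = [#data, max_length]) follows; the 1-based label shift and the padding policy are assumed by analogy with the parsing variant in Example #5, not taken from the original.

import numpy as np

def construct_tensor_fine_tune(word_index_sentences, label_index_sentences,
                               max_length=100):
    # Hypothetical sketch: X holds word indices (the embedding lookup happens
    # later against the trainable table), so the tensor is 2-D, not 3-D.
    num_data = len(word_index_sentences)
    X = np.zeros([num_data, max_length], dtype=np.int32)
    Y = np.zeros([num_data, max_length], dtype=np.int32)
    mask = np.zeros([num_data, max_length], dtype=np.float32)

    for i, (word_ids, label_ids) in enumerate(zip(word_index_sentences,
                                                  label_index_sentences)):
        length = min(len(word_ids), max_length)
        for j in range(length):
            X[i, j] = word_ids[j]
            Y[i, j] = label_ids[j] - 1  # assuming 1-based label indices, as in Example #5
        Y[i, length:] = Y[i, length - 1]  # copy the last label into the padding
        mask[i, :length] = 1.0
    return X, Y, mask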
Example #4
    def generate_dataset_fine_tune():
        """
        generate data tensor when fine tuning
        :return: X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, embedd_table, label_size
        """

        embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict(embedding, embedding_path, word_alphabet,
                                                                           logger)
        logger.info("Dimension of embedding is %d, Caseless: %d" % (embedd_dim, caseless))
        # fill data tensor (X.shape = [#data, max_length], Y.shape = [#data, max_length])
        X_train, Y_train, mask_train = construct_tensor_fine_tune(word_index_sentences_train,
                                                                  label_index_sentences_train)
        X_dev, Y_dev, mask_dev = construct_tensor_fine_tune(word_index_sentences_dev, label_index_sentences_dev)
        X_test, Y_test, mask_test = construct_tensor_fine_tune(word_index_sentences_test, label_index_sentences_test)
        C_train, C_dev, C_test, char_embedd_table = generate_character_data(
            word_sentences_train, word_sentences_dev, word_sentences_test,
            max_length) if use_character else (None, None, None, None)
        return X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, \
               build_embedd_table(word_alphabet, embedd_dict, embedd_dim, caseless), label_alphabet, \
               C_train, C_dev, C_test, char_embedd_table
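
build_embedd_table is the piece that makes fine-tuning possible: it materializes a dense [vocabulary_size, embedd_dim] table that the model can update during training. A plausible sketch is below, assuming word_alphabet exposes iteritems() (word-to-index pairs) and that out-of-dictionary words get small uniform-random rows; neither detail is confirmed by the examples above.

import numpy as np

def build_embedd_table(word_alphabet, embedd_dict, embedd_dim, caseless):
    # Hypothetical sketch: row k holds the pre-trained vector for the word
    # with alphabet index k. word_alphabet.iteritems() is assumed from context.
    scale = np.sqrt(3.0 / embedd_dim)
    table = np.zeros([word_alphabet.size(), embedd_dim], dtype=np.float32)
    # Row 0 stays all-zero for the padding index.
    for word, index in word_alphabet.iteritems():
        key = word.lower() if caseless else word
        if key in embedd_dict:
            table[index, :] = embedd_dict[key]
        else:
            # Words missing from the dictionary get small random rows.
            table[index, :] = np.random.uniform(-scale, scale, [embedd_dim])
    return table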
Example #5
def load_dataset_parsing(train_path,
                         dev_path,
                         test_path,
                         word_column=1,
                         pos_column=4,
                         head_column=6,
                         type_column=7,
                         embedding="word2Vec",
                         embedding_path=None):
    """

    load data from file
    :param train_path: path of training file
    :param dev_path: path of dev file
    :param test_path: path of test file
    :param word_column: the column index of word (start from 0)
    :param pos_column: the column index of pos (start from 0)
    :param head_column: the column index of head (start from 0)
    :param type_column: the column index of types (start from 0)
    :param embedding: embeddings for words, choose from ['word2vec', 'senna'].
    :param embedding_path: path of file storing word embeddings.
    :return: X_train, POS_train, Head_train, Type_train, mask_train,
             X_dev, POS_dev, Head_dev, Type_dev, mask_dev,
             X_test, POS_test, Head_test, Type_test, mask_test,
             embedd_table, word_alphabet, pos_alphabet, type_alphabet, C_train, C_dev, C_test, char_embedd_table
    """
    def construct_tensor(word_index_sentences, pos_index_sentences,
                         head_sentences, type_index_sentences):
        X = np.empty([len(word_index_sentences), max_length], dtype=np.int32)
        POS = np.empty([len(word_index_sentences), max_length], dtype=np.int32)
        Head = np.empty([len(word_index_sentences), max_length],
                        dtype=np.int32)
        Type = np.empty([len(word_index_sentences), max_length],
                        dtype=np.int32)
        mask = np.zeros([len(word_index_sentences), max_length],
                        dtype=theano.config.floatX)

        for i in range(len(word_index_sentences)):
            word_ids = word_index_sentences[i]
            pos_ids = pos_index_sentences[i]
            heads = head_sentences[i]
            type_ids = type_index_sentences[i]
            length = len(word_ids)
            for j in range(length):
                wid = word_ids[j]
                pid = pos_ids[j]
                head = heads[j]
                tid = type_ids[j]
                X[i, j] = wid
                POS[i, j] = pid - 1
                Head[i, j] = head
                Type[i, j] = tid - 1

            # Zero out X after the end of the sequence
            X[i, length:] = 0
            # Copy the last label after the end of the sequence
            POS[i, length:] = POS[i, length - 1]
            Head[i, length:] = Head[i, length - 1]
            Type[i, length:] = Type[i, length - 1]
            # Make the mask for this sample 1 within the range of length
            mask[i, :length] = 1
        return X, POS, Head, Type, mask

    word_alphabet = Alphabet('word')
    pos_alphabet = Alphabet('pos')
    type_alphabet = Alphabet('type')

    # read training data
    logger.info("Reading data from training set...")
    word_sentences_train, pos_sentences_train, head_sentences_train, type_sentence_train, \
    word_index_sentences_train, pos_index_sentences_train, \
    type_index_sentences_train = read_conll_parsing(train_path, word_alphabet, pos_alphabet, type_alphabet, word_column,
                                                    pos_column, head_column, type_column)

    # read dev data
    logger.info("Reading data from dev set...")
    word_sentences_dev, pos_sentences_dev, head_sentences_dev, type_sentence_dev, \
    word_index_sentences_dev, pos_index_sentences_dev, \
    type_index_sentences_dev = read_conll_parsing(dev_path, word_alphabet, pos_alphabet, type_alphabet, word_column,
                                                  pos_column, head_column, type_column)

    # read test data
    logger.info("Reading data from test set...")
    word_sentences_test, pos_sentences_test, head_sentences_test, type_sentence_test, \
    word_index_sentences_test, pos_index_sentences_test, \
    type_index_sentences_test = read_conll_parsing(test_path, word_alphabet, pos_alphabet, type_alphabet, word_column,
                                                   pos_column, head_column, type_column)

    # close alphabets
    word_alphabet.close()
    pos_alphabet.close()
    type_alphabet.close()

    logger.info("word alphabet size: %d" % (word_alphabet.size() - 1))
    logger.info("pos alphabet size: %d" % (pos_alphabet.size() - 1))
    logger.info("type alphabet size: %d" % (type_alphabet.size() - 1))

    # get maximum length
    max_length_train = get_max_length(word_sentences_train)
    max_length_dev = get_max_length(word_sentences_dev)
    max_length_test = get_max_length(word_sentences_test)
    max_length = min(MAX_LENGTH,
                     max(max_length_train, max_length_dev, max_length_test))
    logger.info("Maximum length of training set is %d" % max_length_train)
    logger.info("Maximum length of dev set is %d" % max_length_dev)
    logger.info("Maximum length of test set is %d" % max_length_test)
    logger.info("Maximum length used for training is %d" % max_length)

    embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict(
        embedding, embedding_path, word_alphabet, logger)
    logger.info("Dimension of embedding is %d, Caseless: %d" %
                (embedd_dim, caseless))
    # fill data tensor (X.shape = [#data, max_length], {POS, Head, Type}.shape = [#data, max_length])
    X_train, POS_train, Head_train, Type_train, mask_train = construct_tensor(
        word_index_sentences_train, pos_index_sentences_train,
        head_sentences_train, type_index_sentences_train)

    X_dev, POS_dev, Head_dev, Type_dev, mask_dev = construct_tensor(
        word_index_sentences_dev, pos_index_sentences_dev, head_sentences_dev,
        type_index_sentences_dev)

    X_test, POS_test, Head_test, Type_test, mask_test = construct_tensor(
        word_index_sentences_test, pos_index_sentences_test,
        head_sentences_test, type_index_sentences_test)

    embedd_table = build_embedd_table(word_alphabet, embedd_dict, embedd_dim,
                                      caseless)

    C_train, C_dev, C_test, char_embedd_table = generate_character_data(
        word_sentences_train, word_sentences_dev, word_sentences_test,
        max_length)

    return X_train, POS_train, Head_train, Type_train, mask_train, \
           X_dev, POS_dev, Head_dev, Type_dev, mask_dev, \
           X_test, POS_test, Head_test, Type_test, mask_test, \
           embedd_table, word_alphabet, pos_alphabet, type_alphabet, \
           C_train, C_dev, C_test, char_embedd_table
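
generate_character_data is also external to these examples. The sketch below shows one plausible form for it: character ids per token padded into C tensors of shape [#data, max_length, max_char_length], plus a randomly initialized character embedding table. The char_embedd_dim default, the shared character index across the three splits, and the padding scheme are all assumptions.

import numpy as np

def generate_character_data(sentences_train, sentences_dev, sentences_test,
                            max_length, char_embedd_dim=30):
    # Hypothetical sketch: index every character over all three splits,
    # then pad each word's character ids out to the longest word seen.
    char_to_id = {'<pad>': 0}

    def index_chars(sentences):
        return [[[char_to_id.setdefault(c, len(char_to_id)) for c in w]
                 for w in words] for words in sentences]

    ids_train = index_chars(sentences_train)
    ids_dev = index_chars(sentences_dev)
    ids_test = index_chars(sentences_test)
    max_char_length = max(len(w) for split in (ids_train, ids_dev, ids_test)
                          for sent in split for w in sent)

    def to_tensor(ids):
        C = np.zeros([len(ids), max_length, max_char_length], dtype=np.int32)
        for i, sent in enumerate(ids):
            for j, w in enumerate(sent[:max_length]):
                C[i, j, :len(w)] = w
        return C

    # Small random character embeddings, one row per character id.
    scale = np.sqrt(3.0 / char_embedd_dim)
    char_embedd_table = np.random.uniform(
        -scale, scale, [len(char_to_id), char_embedd_dim]).astype(np.float32)
    return (to_tensor(ids_train), to_tensor(ids_dev), to_tensor(ids_test),
            char_embedd_table)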
Example #6
def load_dataset_parsing(train_path, dev_path, test_path, word_column=1, pos_column=4, head_column=6, type_column=7,
                         embedding="word2Vec", embedding_path=None):
    """

    load data from file
    :param train_path: path of training file
    :param dev_path: path of dev file
    :param test_path: path of test file
    :param word_column: the column index of word (start from 0)
    :param pos_column: the column index of pos (start from 0)
    :param head_column: the column index of head (start from 0)
    :param type_column: the column index of types (start from 0)
    :param embedding: embeddings for words, choose from ['word2vec', 'senna'].
    :param embedding_path: path of file storing word embeddings.
    :return: X_train, POS_train, Head_train, Type_train, mask_train,
             X_dev, POS_dev, Head_dev, Type_dev, mask_dev,
             X_test, POS_test, Head_test, Type_test, mask_test,
             embedd_table, word_alphabet, pos_alphabet, type_alphabet, C_train, C_dev, C_test, char_embedd_table
    """

    def construct_tensor(word_index_sentences, pos_index_sentences, head_sentences, type_index_sentences):
        X = np.empty([len(word_index_sentences), max_length], dtype=np.int32)
        POS = np.empty([len(word_index_sentences), max_length], dtype=np.int32)
        Head = np.empty([len(word_index_sentences), max_length], dtype=np.int32)
        Type = np.empty([len(word_index_sentences), max_length], dtype=np.int32)
        mask = np.zeros([len(word_index_sentences), max_length], dtype=theano.config.floatX)

        for i in range(len(word_index_sentences)):
            word_ids = word_index_sentences[i]
            pos_ids = pos_index_sentences[i]
            heads = head_sentences[i]
            type_ids = type_index_sentences[i]
            length = len(word_ids)
            for j in range(length):
                wid = word_ids[j]
                pid = pos_ids[j]
                head = heads[j]
                tid = type_ids[j]
                X[i, j] = wid
                POS[i, j] = pid - 1
                Head[i, j] = head
                Type[i, j] = tid - 1

            # Zero out X after the end of the sequence
            X[i, length:] = 0
            # Copy the last label after the end of the sequence
            POS[i, length:] = POS[i, length - 1]
            Head[i, length:] = Head[i, length - 1]
            Type[i, length:] = Type[i, length - 1]
            # Make the mask for this sample 1 within the range of length
            mask[i, :length] = 1
        return X, POS, Head, Type, mask

    word_alphabet = Alphabet('word')
    pos_alphabet = Alphabet('pos')
    type_alphabet = Alphabet('type')

    # read training data
    logger.info("Reading data from training set...")
    word_sentences_train, pos_sentences_train, head_sentences_train, type_sentence_train, \
    word_index_sentences_train, pos_index_sentences_train, \
    type_index_sentences_train = read_conll_parsing(train_path, word_alphabet, pos_alphabet, type_alphabet, word_column,
                                                    pos_column, head_column, type_column)

    # read dev data
    logger.info("Reading data from dev set...")
    word_sentences_dev, pos_sentences_dev, head_sentences_dev, type_sentence_dev, \
    word_index_sentences_dev, pos_index_sentences_dev, \
    type_index_sentences_dev = read_conll_parsing(dev_path, word_alphabet, pos_alphabet, type_alphabet, word_column,
                                                  pos_column, head_column, type_column)

    # read test data
    logger.info("Reading data from test set...")
    word_sentences_test, pos_sentences_test, head_sentences_test, type_sentence_test, \
    word_index_sentences_test, pos_index_sentences_test, \
    type_index_sentences_test = read_conll_parsing(test_path, word_alphabet, pos_alphabet, type_alphabet, word_column,
                                                   pos_column, head_column, type_column)

    # close alphabets
    word_alphabet.close()
    pos_alphabet.close()
    type_alphabet.close()

    logger.info("word alphabet size: %d" % (word_alphabet.size() - 1))
    logger.info("pos alphabet size: %d" % (pos_alphabet.size() - 1))
    logger.info("type alphabet size: %d" % (type_alphabet.size() - 1))

    # get maximum length
    max_length_train = get_max_length(word_sentences_train)
    max_length_dev = get_max_length(word_sentences_dev)
    max_length_test = get_max_length(word_sentences_test)
    max_length = min(MAX_LENGTH, max(max_length_train, max_length_dev, max_length_test))
    logger.info("Maximum length of training set is %d" % max_length_train)
    logger.info("Maximum length of dev set is %d" % max_length_dev)
    logger.info("Maximum length of test set is %d" % max_length_test)
    logger.info("Maximum length used for training is %d" % max_length)

    embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict(embedding, embedding_path, word_alphabet,
                                                                       logger)
    logger.info("Dimension of embedding is %d, Caseless: %d" % (embedd_dim, caseless))
    # fill data tensor (X.shape = [#data, max_length], {POS, Head, Type}.shape = [#data, max_length])
    X_train, POS_train, Head_train, Type_train, mask_train = construct_tensor(word_index_sentences_train,
                                                                              pos_index_sentences_train,
                                                                              head_sentences_train,
                                                                              type_index_sentences_train)

    X_dev, POS_dev, Head_dev, Type_dev, mask_dev = construct_tensor(word_index_sentences_dev,
                                                                    pos_index_sentences_dev,
                                                                    head_sentences_dev,
                                                                    type_index_sentences_dev)

    X_test, POS_test, Head_test, Type_test, mask_test = construct_tensor(word_index_sentences_test,
                                                                         pos_index_sentences_test,
                                                                         head_sentences_test,
                                                                         type_index_sentences_test)

    embedd_table = build_embedd_table(word_alphabet, embedd_dict, embedd_dim, caseless)

    C_train, C_dev, C_test, char_embedd_table = generate_character_data(word_sentences_train, word_sentences_dev,
                                                                        word_sentences_test, max_length)

    return X_train, POS_train, Head_train, Type_train, mask_train, \
           X_dev, POS_dev, Head_dev, Type_dev, mask_dev, \
           X_test, POS_test, Head_test, Type_test, mask_test, \
           embedd_table, word_alphabet, pos_alphabet, type_alphabet, \
           C_train, C_dev, C_test, char_embedd_table
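
To make the padding policy inside construct_tensor concrete, here is a tiny self-contained demo of the same scheme for a single sentence: word indices are zero-padded, POS/head/type values are copied past the sentence end, and the mask flags the real tokens. The numbers are made up.

import numpy as np

max_length = 5
word_ids, pos_ids, heads, type_ids = [4, 7, 2], [3, 1, 3], [0, 1, 1], [2, 5, 4]

X = np.zeros(max_length, dtype=np.int32)
POS = np.zeros(max_length, dtype=np.int32)
Head = np.zeros(max_length, dtype=np.int32)
Type = np.zeros(max_length, dtype=np.int32)
mask = np.zeros(max_length, dtype=np.float32)

length = len(word_ids)
X[:length] = word_ids
POS[:length] = np.array(pos_ids) - 1   # POS/type ids are shifted to 0-based
Head[:length] = heads
Type[:length] = np.array(type_ids) - 1
X[length:] = 0                          # zero out words past the end
POS[length:] = POS[length - 1]          # copy the last label into the padding
Head[length:] = Head[length - 1]
Type[length:] = Type[length - 1]
mask[:length] = 1

print(X)     # [4 7 2 0 0]
print(POS)   # [2 0 2 2 2]
print(Head)  # [0 1 1 1 1]
print(mask)  # [1. 1. 1. 0. 0.]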