Exemplo n.º 1
0
def get_args():
    """Build the command-line parser for the model and parse the arguments.

    Data-dependent defaults are looked up every time this function runs:
    the token count and the word/character dictionaries come from
    `zhwiki_corpus`, the pinyin/initial/final/tone tables from
    `cc_phonology`.

    Note that the derived defaults (`--episode_size`, `--steps_per_epoch`)
    are computed from the built-in defaults of `--max_token_length` and
    `--batch_size`, not from values given on the command line.

    # Returns
        The `argparse.Namespace` returned by `parser.parse_args()`.
    """

    def _str2bool(value):
        """Convert a command-line string into a real boolean.

        `type=bool` would call `bool()` on the raw string, so every
        non-empty value (including "False") would become True.
        """
        if isinstance(value, bool):
            return value
        text = value.strip().lower()
        if text in ('true', 't', 'yes', 'y', '1'):
            return True
        # the empty string stays falsy, as it was with `type=bool`
        if text in ('false', 'f', 'no', 'n', '0', ''):
            return False
        raise argparse.ArgumentTypeError(
            'expected a boolean value, got %r' % (value, ))

    parser = argparse.ArgumentParser()

    # corpus parameter
    word_average_length = 3
    max_token_length = 5
    parser.add_argument('--max_token_length',
                        type=int,
                        default=max_token_length,
                        help='the maximum number of tokens in each sample')

    parser.add_argument('--embedding_size',
                        type=int,
                        default=50,
                        help='the size of embeddings')

    _, _, _, _, tokens_number, _ = zhwiki_corpus.get_statistics()
    parser.add_argument('--corpus_size',
                        type=int,
                        default=tokens_number,
                        help='the number of word in train corpus')

    word_dic, _ = zhwiki_corpus.get_word_id_dictionaries()
    vocabulary_size = len(word_dic)
    parser.add_argument('--vocabulary_size',
                        type=int,
                        default=vocabulary_size,
                        help='the size of vocabulary')

    character_dic, _ = zhwiki_corpus.get_character_id_dictionaries()
    character_size = len(character_dic)
    parser.add_argument('--character_size',
                        type=int,
                        default=character_size,
                        help='the size of character')

    pinyin_id = cc_phonology.get_pinyin_id_dic()
    pinyin_size = len(pinyin_id)
    parser.add_argument('--pinyin_size',
                        type=int,
                        default=pinyin_size,
                        help='the size of pinyin')

    id_initials, id_finals, id_tones = cc_phonology.get_id_other_dic()
    initial_size = len(id_initials)
    final_size = len(id_finals)
    tone_size = len(id_tones)
    parser.add_argument('--initial_size',
                        type=int,
                        default=initial_size,
                        help='the size of initial')
    parser.add_argument('--final_size',
                        type=int,
                        default=final_size,
                        help='the size of final')
    parser.add_argument('--tone_size',
                        type=int,
                        default=tone_size,
                        help='the size of tone')

    # model architecture hyper-parameter
    max_length = max_token_length * (word_average_length + 1) + 1
    parser.add_argument('--episode_size',
                        type=int,
                        default=max_length,
                        help='the length of each input (or output) sequence')
    parser.add_argument('--batch_norm',
                        type=_str2bool,
                        default=False,
                        help='batch normalization')
    parser.add_argument('--hidden_size',
                        type=int,
                        default=250,
                        help='the size of one hidden layer')
    parser.add_argument('--return_sequences',
                        type=_str2bool,
                        default=True,
                        help='whether return sequences in RNN')
    parser.add_argument('--stateful',
                        type=_str2bool,
                        default=True,
                        help='whether stateful in RNN')
    parser.add_argument('--l2',
                        type=float,
                        default=0.001,
                        help='L2 regularization')
    parser.add_argument('--dropout',
                        type=float,
                        default=0.60,
                        help='dropout rate (between 0 and 1)')

    # model training parameter
    batch_size = 256
    parser.add_argument('--batch_size',
                        type=int,
                        default=batch_size,
                        help='size of batch sample')
    parser.add_argument('--lr',
                        type=float,
                        default=0.0001,
                        help='learning rate (between 0 and 1)')
    epochs = 5
    parser.add_argument('--epochs',
                        type=int,
                        default=epochs,
                        help='number of epochs')
    # argparse applies `type` only to string defaults, so the division
    # result has to be truncated to an int here.
    steps_per_epoch = int(tokens_number / (batch_size * max_token_length) /
                          100)
    parser.add_argument('--steps_per_epoch',
                        type=int,
                        default=steps_per_epoch,
                        help='the number of batches for training')
    iterations = 20 * 200
    parser.add_argument('--iterations',
                        type=int,
                        default=iterations,
                        help='number of iterations')

    # model saving parameter
    model_file_train = MORPHONET_1_MODEL_FOLDER + "model_train.hdf5"
    parser.add_argument('--model_file_train',
                        type=str,
                        default=model_file_train,
                        help='load model from a file')
    model_file_test = MORPHONET_1_MODEL_FOLDER + "model_test.hdf5"
    parser.add_argument('--model_file_test',
                        type=str,
                        default=model_file_test,
                        help='load model from a file for testing')
    model_picture = FOLDER + "architecture.png"
    parser.add_argument('--model_picture',
                        type=str,
                        default=model_picture,
                        help='the file for saving the model architecture')
    parser.add_argument('--log_every',
                        type=int,
                        default=1,
                        help='print information every x iteration')
    parser.add_argument('--save_every',
                        type=int,
                        default=1,
                        help='save_sample_image state every x epoch')

    return parser.parse_args()
Exemplo n.º 2
0
def get_args():
    """Build the command-line parser for the skip-gram model and parse it.

    Data-dependent defaults are looked up every time this function runs:
    the token count and the word/character dictionaries come from
    `zhwiki_corpus`, the pinyin/initial/final/tone tables from
    `cc_phonology`.

    Note that the default of `--steps_per_epoch` is computed from the
    built-in defaults of `--batch_size` and `--context_window_size`, not
    from values given on the command line.

    # Returns
        The `argparse.Namespace` returned by `parser.parse_args()`.
    """
    parser = argparse.ArgumentParser()

    # corpus parameter
    context_window_size = 5
    parser.add_argument('--context_window_size',
                        type=int,
                        default=context_window_size,
                        help='the size of context window')

    parser.add_argument('--embedding_size',
                        type=int,
                        default=100,
                        help='the size of embeddings')

    _, _, _, _, tokens_number, _ = zhwiki_corpus.get_statistics()
    parser.add_argument('--corpus_size',
                        type=int,
                        default=tokens_number,
                        help='the number of word in train corpus')

    word_dic, _ = zhwiki_corpus.get_word_id_dictionaries()
    vocabulary_size = len(word_dic)
    parser.add_argument('--vocabulary_size',
                        type=int,
                        default=vocabulary_size,
                        help='the size of vocabulary')

    character_dic, _ = zhwiki_corpus.get_character_id_dictionaries()
    character_size = len(character_dic)
    parser.add_argument('--character_size',
                        type=int,
                        default=character_size,
                        help='the size of character')

    pinyin_id = cc_phonology.get_pinyin_id_dic()
    pinyin_size = len(pinyin_id)
    parser.add_argument('--pinyin_size',
                        type=int,
                        default=pinyin_size,
                        help='the size of pinyin')

    id_initials, id_finals, id_tones = cc_phonology.get_id_other_dic()
    initial_size = len(id_initials)
    final_size = len(id_finals)
    tone_size = len(id_tones)
    parser.add_argument('--initial_size',
                        type=int,
                        default=initial_size,
                        help='the size of initial')
    parser.add_argument('--final_size',
                        type=int,
                        default=final_size,
                        help='the size of final')
    parser.add_argument('--tone_size',
                        type=int,
                        default=tone_size,
                        help='the size of tone')

    # model training parameter
    batch_size = 2000
    parser.add_argument('--batch_size',
                        type=int,
                        default=batch_size,
                        help='size of batch sample')
    parser.add_argument('--lr',
                        type=float,
                        default=0.0001,
                        help='learning rate (between 0 and 1)')
    epochs = 50
    parser.add_argument('--epochs',
                        type=int,
                        default=epochs,
                        help='number of epochs')
    parser.add_argument("--gpus",
                        type=int,
                        default=2,
                        help="# of GPUs to use for training")

    # Twice the number of batches of (2 * window + 1)-token samples that
    # fit in the corpus. argparse applies `type` only to string defaults,
    # so the division result has to be truncated to an int here.
    steps_per_epoch = int(tokens_number /
                          (batch_size * (2 * context_window_size + 1)) * 2)
    parser.add_argument('--steps_per_epoch',
                        type=int,
                        default=steps_per_epoch,
                        help='the number of batches for training')
    iterations = 20 * 200
    parser.add_argument('--iterations',
                        type=int,
                        default=iterations,
                        help='number of iterations')

    # model saving parameter
    model_file_train = SKIP_GRAM_MODEL_FOLDER + "model_train_dimension_100.hdf5"
    parser.add_argument('--model_file_train',
                        type=str,
                        default=model_file_train,
                        help='load model from a file')
    model_file_test = SKIP_GRAM_MODEL_FOLDER + "model_test.hdf5"
    parser.add_argument('--model_file_test',
                        type=str,
                        default=model_file_test,
                        help='load model from a file for testing')
    model_picture = FOLDER + "architecture.png"
    parser.add_argument('--model_picture',
                        type=str,
                        default=model_picture,
                        help='the file for saving the model architecture')
    parser.add_argument('--log_every',
                        type=int,
                        default=1,
                        help='print information every x iteration')
    parser.add_argument('--save_every',
                        type=int,
                        default=1,
                        help='save_sample_image state every x epoch')

    return parser.parse_args()
Exemplo n.º 3
0
def skip_gram_generator(folder=ZHWIKI_FOLDER_CACHE,
                        batch_size=5,
                        context_window_size=5,
                        negative_samples=5.):
    """Endlessly generate skip-gram training batches from the cached corpus.

    # Arguments
        folder: the folder handed to `get_corpus_id_sequence` to load the
            corpus as a sequence of word ids.
        batch_size: the size of one batch samples; forwarded to
            `generate_input_sequences`.
        context_window_size: size of context window; used both to cut the
            input sequence and as `window_size` of `skipgrams`.
        negative_samples: forwarded unchanged to `skipgrams`.

    # Returns
        A generator that never terminates and yields `(x, y)` tuples:
        `x` is a list with one Numpy array per element position of the
        items in the `skipgrams` data (presumably targets and contexts --
        confirm against `skipgrams`), `y` is an int32 Numpy array of the
        labels.
    """
    # get the corpus_id_sequence of word id
    corpus_id_sequence = get_corpus_id_sequence(folder)
    corpus_length = len(corpus_id_sequence)

    # only the vocabulary size is needed from the dictionaries
    word_dic, _ = get_word_id_dictionaries()
    num_classes_words = len(word_dic)

    # number of corpus positions consumed by one batch
    stride = batch_size * (2 * context_window_size + 1)

    # initialize index
    start_idx = 0

    # loop for generating batch of samples
    while True:
        # get a batch of samples
        input_sequence = generate_input_sequences(corpus_id_sequence,
                                                  batch_size, start_idx,
                                                  context_window_size)

        data, labels = skipgrams(sequence=input_sequence,
                                 vocabulary_size=num_classes_words,
                                 window_size=context_window_size,
                                 negative_samples=negative_samples)

        x = [np.array(column) for column in zip(*data)]
        y = np.array(labels, dtype=np.int32)

        # Advance and wrap around the corpus. The modulo keeps the index
        # strictly inside the corpus, including when a batch ends exactly
        # on the last position or when the stride exceeds the corpus.
        start_idx = (start_idx + stride) % corpus_length if corpus_length else 0

        # output a batch of samples
        yield (x, y)
Exemplo n.º 4
0
from keras.callbacks import Callback
import os
import sys

# Append the directory three levels above this file to the import path
# (presumably the project root, so that the `morphonets` imports below
# resolve when this file is run directly -- confirm against the layout).
current_py_file = os.path.abspath(__file__)
sys.path.append(
    os.path.dirname(os.path.dirname(os.path.dirname(current_py_file))))

# from elwm.datasets.ptb import to_text
import morphonets.datasets.joint_evaluation as test
from morphonets.datasets import zhwiki_corpus
from morphonets.knowledge import cc_phonology

# Load the lookup tables once, when this module is imported.
# NOTE(review): the names suggest each table maps an id back to its
# pinyin / initial / final / tone / word -- confirm against cc_phonology
# and zhwiki_corpus.
id_pinyins = cc_phonology.get_id_pinyin_dic()
id_initials, id_finals, id_tones = cc_phonology.get_id_other_dic()
_, id_word_dic = zhwiki_corpus.get_word_id_dictionaries()
# Bundled in a fixed order: (words, pinyins, initials, finals, tones).
dictionaries = (id_word_dic, id_pinyins, id_initials, id_finals, id_tones)


class Dashboard(Callback):
    """Create a dashboard for monitoring loss, accuracy, etc..., during the
    process of learning embedding model.

    # Arguments
        folder: the folder for saving sample picture file and statistic txt
            file.
        statistic_file: the txt file for saving loss, accuracy on training,
            testing and validation.
        model: a ANN model.
        watch_sample: whether watch prediction of samples during the training.
Exemplo n.º 5
0
def generator(folder=ZHWIKI_FOLDER_CACHE, batch_size=5, max_token_length=5):
    """Endlessly generate multi-target training batches from the cached corpus.

    Each batch pairs the input sequences with five target sequences
    (character, pinyin, initial, final, tone) built by
    `generate_batch_target_sequences`.

    # Arguments
        folder: the folder handed to `get_corpus_id_sequence` to load the
            corpus as a sequence of word ids.
        batch_size: the size of one batch samples.
        max_token_length: length of corpus_id_sequence for each sample.

    # Returns
        A generator that never terminates and yields
        `(input_sequences, [target_character, target_pinyin,
        target_initial, target_final, target_tone])`.
    """
    # get the corpus_id_sequence of word id
    corpus_id_sequence = get_corpus_id_sequence(folder)
    corpus_length = len(corpus_id_sequence)
    special_ids = get_special_ids()

    # get the dictionaries of characters and words
    character_dic, id_character_dic = get_character_id_dictionaries()
    character_set = set(character_dic.keys())
    word_dic, id_word_dic = get_word_id_dictionaries()

    # get phonology knowledge
    character_pinyin_dic = cc_phonology.get_pinyins_of_character()
    fine_grained_pinyin_dic = \
        cc_phonology.get_knowledge_of_fine_grained_pinyin()
    pinyin_id = cc_phonology.get_pinyin_id_dic()
    pinyin_set = set(pinyin_id.keys())
    id_initials, id_finals, id_tones = cc_phonology.get_id_other_dic()

    # NOTE(review): the factor 4 looks like `word_average_length + 1` from
    # the argument parser's `episode_size` default -- confirm they must agree.
    max_length = max_token_length * 4 + 1

    # number of classes of each target, in the order the targets are yielded
    class_counts = (len(character_dic), len(pinyin_id), len(id_initials),
                    len(id_finals), len(id_tones))

    def _pad(sequences):
        """Pad or truncate every sequence to `max_length`."""
        # NOTE(review): stock Keras `pad_sequences` only accepts 'pre' or
        # 'post' here; 'app' is kept as found -- confirm which
        # `pad_sequences` this file imports.
        return pad_sequences(sequences,
                             maxlen=max_length,
                             padding='app',
                             truncating='app')

    # initialize index
    start_idx = 0

    # loop for generating batch of samples
    while True:
        # get a batch of special samples
        input_sequences = generate_special_batch_input_sequences(
            corpus_id_sequence, start_idx, batch_size, max_token_length,
            special_ids)

        target_sequences_character, \
            target_sequences_pinyin, \
            target_sequences_initial, \
            target_sequences_final, \
            target_sequences_tone = generate_batch_target_sequences(
                input_sequences,
                character_set,
                character_dic, id_character_dic,
                word_dic, id_word_dic,
                pinyin_id, pinyin_set,
                character_pinyin_dic, fine_grained_pinyin_dic)

        # Advance by one position and wrap around the corpus. The modulo
        # keeps the index strictly inside the corpus, so the batch after
        # the last position starts again at 0 instead of past the end.
        start_idx = (start_idx + 1) % corpus_length if corpus_length else 0

        input_sequences = to_tensor(_pad(input_sequences), max_length)

        raw_targets = (target_sequences_character, target_sequences_pinyin,
                       target_sequences_initial, target_sequences_final,
                       target_sequences_tone)
        target_sequences = [
            to_categorical(_pad(sequences), max_length, num_classes)
            for sequences, num_classes in zip(raw_targets, class_counts)
        ]

        # output a batch of samples
        yield (input_sequences, target_sequences)