def get_args():
    parser = argparse.ArgumentParser()

    # corpus parameters
    word_average_length = 3
    max_token_length = 5
    parser.add_argument('--max_token_length', type=int,
                        default=max_token_length,
                        help='the maximum number of tokens in one sample')
    parser.add_argument('--embedding_size', type=int, default=50,
                        help='the size of the embeddings')
    _, _, _, _, tokens_number, _ = zhwiki_corpus.get_statistics()
    parser.add_argument('--corpus_size', type=int, default=tokens_number,
                        help='the number of words in the training corpus')
    word_dic, _ = zhwiki_corpus.get_word_id_dictionaries()
    vocabulary_size = len(word_dic)
    parser.add_argument('--vocabulary_size', type=int,
                        default=vocabulary_size,
                        help='the size of the vocabulary')
    character_dic, _ = zhwiki_corpus.get_character_id_dictionaries()
    character_size = len(character_dic)
    parser.add_argument('--character_size', type=int, default=character_size,
                        help='the size of the character set')
    pinyin_id = cc_phonology.get_pinyin_id_dic()
    pinyin_size = len(pinyin_id)
    parser.add_argument('--pinyin_size', type=int, default=pinyin_size,
                        help='the size of the pinyin set')
    id_initials, id_finals, id_tones = cc_phonology.get_id_other_dic()
    initial_size, final_size, tone_size = (len(id_initials), len(id_finals),
                                           len(id_tones))
    parser.add_argument('--initial_size', type=int, default=initial_size,
                        help='the size of the initial set')
    parser.add_argument('--final_size', type=int, default=final_size,
                        help='the size of the final set')
    parser.add_argument('--tone_size', type=int, default=tone_size,
                        help='the size of the tone set')

    # model architecture hyper-parameters
    max_length = max_token_length * (word_average_length + 1) + 1
    parser.add_argument('--episode_size', type=int, default=max_length,
                        help='the length of each input (or output) sequence')
    parser.add_argument('--batch_norm', type=bool, default=False,
                        help='batch normalization')
    parser.add_argument('--hidden_size', type=int, default=250,
                        help='the size of one hidden layer')
    parser.add_argument('--return_sequences', type=bool, default=True,
                        help='whether to return sequences in the RNN')
    parser.add_argument('--stateful', type=bool, default=True,
                        help='whether the RNN is stateful')
    parser.add_argument('--l2', type=float, default=0.001,
                        help='L2 regularization')
    parser.add_argument('--dropout', type=float, default=0.60,
                        help='dropout rate (between 0 and 1)')

    # model training parameters
    batch_size = 256
    parser.add_argument('--batch_size', type=int, default=batch_size,
                        help='size of one batch of samples')
    parser.add_argument('--lr', type=float, default=0.0001,
                        help='learning rate (between 0 and 1)')
    epochs = 5
    parser.add_argument('--epochs', type=int, default=epochs,
                        help='number of epochs')
    # cast explicitly: `/` yields a float and argparse does not run `type`
    # on non-string defaults
    steps_per_epoch = tokens_number / (batch_size * max_token_length) / 100
    parser.add_argument('--steps_per_epoch', type=int,
                        default=int(steps_per_epoch),
                        help='the number of batches per epoch')
    iterations = 20 * 200
    parser.add_argument('--iterations', type=int, default=iterations,
                        help='number of iterations')

    # model saving parameters
    model_file_train = MORPHONET_1_MODEL_FOLDER + "model_train.hdf5"
    parser.add_argument('--model_file_train', type=str,
                        default=model_file_train,
                        help='the file for saving/loading the training model')
    model_file_test = MORPHONET_1_MODEL_FOLDER + "model_test.hdf5"
    parser.add_argument('--model_file_test', type=str,
                        default=model_file_test,
                        help='the file for loading the model for testing')
    model_picture = FOLDER + "architecture.png"
    parser.add_argument('--model_picture', type=str, default=model_picture,
                        help='the file for saving the model architecture')
    parser.add_argument('--log_every', type=int, default=1,
                        help='print information every x iterations')
    parser.add_argument('--save_every', type=int, default=1,
                        help='save state every x epochs')

    args = parser.parse_args()
    return args
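
# Hedged usage sketch (not part of the original module): the default episode
# length above works out to max_token_length * (word_average_length + 1) + 1
# = 5 * (3 + 1) + 1 = 21 positions per sequence (reading the "+ 1" terms as
# a word boundary and an end marker is an assumption, not stated in the
# code). The helper below only illustrates how the parsed namespace would
# typically be inspected; run it without command-line flags to see defaults.
def _show_defaults():
    args = get_args()  # parses sys.argv
    print('episode_size=%d batch_size=%d steps_per_epoch=%d'
          % (args.episode_size, args.batch_size, args.steps_per_epoch))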
def get_args():
    parser = argparse.ArgumentParser()

    # corpus parameters
    word_average_length = 3
    context_window_size = 5
    parser.add_argument('--context_window_size', type=int,
                        default=context_window_size,
                        help='the size of the context window')
    parser.add_argument('--embedding_size', type=int, default=100,
                        help='the size of the embeddings')
    _, _, _, _, tokens_number, _ = zhwiki_corpus.get_statistics()
    parser.add_argument('--corpus_size', type=int, default=tokens_number,
                        help='the number of words in the training corpus')
    word_dic, _ = zhwiki_corpus.get_word_id_dictionaries()
    vocabulary_size = len(word_dic)
    parser.add_argument('--vocabulary_size', type=int,
                        default=vocabulary_size,
                        help='the size of the vocabulary')
    character_dic, _ = zhwiki_corpus.get_character_id_dictionaries()
    character_size = len(character_dic)
    parser.add_argument('--character_size', type=int, default=character_size,
                        help='the size of the character set')
    pinyin_id = cc_phonology.get_pinyin_id_dic()
    pinyin_size = len(pinyin_id)
    parser.add_argument('--pinyin_size', type=int, default=pinyin_size,
                        help='the size of the pinyin set')
    id_initials, id_finals, id_tones = cc_phonology.get_id_other_dic()
    initial_size, final_size, tone_size = (len(id_initials), len(id_finals),
                                           len(id_tones))
    parser.add_argument('--initial_size', type=int, default=initial_size,
                        help='the size of the initial set')
    parser.add_argument('--final_size', type=int, default=final_size,
                        help='the size of the final set')
    parser.add_argument('--tone_size', type=int, default=tone_size,
                        help='the size of the tone set')

    # model training parameters
    # batch_size = 5000
    batch_size = 2000
    # batch_size = 5
    parser.add_argument('--batch_size', type=int, default=batch_size,
                        help='size of one batch of samples')
    parser.add_argument('--lr', type=float, default=0.0001,
                        help='learning rate (between 0 and 1)')
    epochs = 50
    # epochs = 5
    parser.add_argument('--epochs', type=int, default=epochs,
                        help='number of epochs')
    parser.add_argument('--gpus', type=int, default=2,
                        help='number of GPUs to use for training')
    steps_per_epoch = tokens_number / (batch_size *
                                       (2 * context_window_size + 1))
    # steps_per_epoch *= 5
    steps_per_epoch *= 2
    # steps_per_epoch = tokens_number / batch_size
    # steps_per_epoch = tokens_number / context_window_size / 100
    # steps_per_epoch = 10
    parser.add_argument('--steps_per_epoch', type=int,
                        default=int(steps_per_epoch),
                        help='the number of batches per epoch')
    iterations = 20 * 200
    parser.add_argument('--iterations', type=int, default=iterations,
                        help='number of iterations')

    # model saving parameters
    model_file_train = (SKIP_GRAM_MODEL_FOLDER +
                        "model_train_dimension_100.hdf5")
    parser.add_argument('--model_file_train', type=str,
                        default=model_file_train,
                        help='the file for saving/loading the training model')
    model_file_test = SKIP_GRAM_MODEL_FOLDER + "model_test.hdf5"
    parser.add_argument('--model_file_test', type=str,
                        default=model_file_test,
                        help='the file for loading the model for testing')
    model_picture = FOLDER + "architecture.png"
    parser.add_argument('--model_picture', type=str, default=model_picture,
                        help='the file for saving the model architecture')
    parser.add_argument('--log_every', type=int, default=1,
                        help='print information every x iterations')
    parser.add_argument('--save_every', type=int, default=1,
                        help='save state every x epochs')

    args = parser.parse_args()
    return args
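
# Worked example of the default above (corpus size is hypothetical): with
# batch_size = 2000 and context_window_size = 5, each generator step consumes
# batch_size * (2 * context_window_size + 1) = 2000 * 11 = 22,000 corpus
# positions, so for a corpus of, say, 100 million tokens the default is about
# int(100_000_000 / 22_000 * 2) = 9090 steps per epoch; the factor of 2
# simply passes over the corpus twice per epoch. The helper below just
# mirrors that arithmetic and is illustrative, not project code.
def _default_steps_per_epoch(tokens_number, batch_size, context_window_size,
                             passes=2):
    positions_per_step = batch_size * (2 * context_window_size + 1)
    return int(tokens_number / positions_per_step * passes)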
def skip_gram_generator(folder=ZHWIKI_FOLDER_CACHE, batch_size=5,
                        context_window_size=5, negative_samples=5.):
    """Generate batches of skip-gram samples for training, testing, or
    validating word embeddings on the Chinese Wikipedia (zhwiki) corpus.

    # Arguments
        folder: the folder which contains the cached zhwiki corpus.
        batch_size: the size of one batch of samples.
        context_window_size: the size of the context window.
        negative_samples: the ratio of negative to positive samples.

    # Returns
        tuple of Numpy arrays: `(x, y)`, where `x` is a list of two id arrays
        (target and context words) and `y` holds the 0/1 labels.
    """
    # get the sequence of word ids for the whole corpus
    corpus_id_sequence = get_corpus_id_sequence(folder)

    # added by Robert Steven
    special_ids = get_special_ids()

    # get the size of the vocabulary
    character_dic, id_character_dic = get_character_id_dictionaries()
    character_set = set(character_dic.keys())
    word_dic, id_word_dic = get_word_id_dictionaries()

    # get phonology knowledge
    character_pinyin_dic = cc_phonology.get_pinyins_of_character()
    fine_grained_pinyin_dic = cc_phonology.get_knowledge_of_fine_grained_pinyin()
    pinyin_id = cc_phonology.get_pinyin_id_dic()
    pinyin_set = set(pinyin_id.keys())
    id_initials, id_finals, id_tones = cc_phonology.get_id_other_dic()

    # get morphology knowledge

    # initialize the index into the corpus
    start_idx = 0
    num_classes_words = len(word_dic)
    # num_classes_characters = len(character_dic)
    # num_classes_pinyins = len(pinyin_id)
    # num_classes_initials = len(id_initials)
    # num_classes_id_finals = len(id_finals)
    # num_classes_tones = len(id_tones)

    # loop for generating batches of samples
    while True:
        # get a batch of samples
        input_sequence = generate_input_sequences(corpus_id_sequence,
                                                  batch_size, start_idx,
                                                  context_window_size)
        data, labels = skipgrams(sequence=input_sequence,
                                 vocabulary_size=num_classes_words,
                                 window_size=context_window_size,
                                 negative_samples=negative_samples)
        x = [np.array(col) for col in zip(*data)]
        y = np.array(labels, dtype=np.int32)

        # advance the index, wrapping around the end of the corpus
        # end_idx = start_idx + context_window_size
        # end_idx = start_idx + batch_size
        end_idx = start_idx + batch_size * (2 * context_window_size + 1)
        if end_idx > len(corpus_id_sequence):
            start_idx = end_idx - len(corpus_id_sequence)
        else:
            start_idx = end_idx

        # output a batch of samples
        yield (x, y)
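
# A minimal sketch of what one generator step produces, assuming `skipgrams`
# above is keras.preprocessing.sequence.skipgrams; the toy sequence and
# vocabulary size are made up for illustration.
import numpy as np
from keras.preprocessing.sequence import skipgrams

toy_sequence = [3, 8, 15, 4, 23, 7]           # word ids from a tiny corpus
couples, labels = skipgrams(sequence=toy_sequence, vocabulary_size=30,
                            window_size=2, negative_samples=1.0)
x = [np.array(col) for col in zip(*couples)]  # [target ids, context ids]
y = np.array(labels, dtype=np.int32)          # 1 = true context, 0 = negative
print(x[0].shape, x[1].shape, y.shape)        # three equal-length 1-D arrays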
from keras.callbacks import Callback

import os
import sys

current_py_file = os.path.abspath(__file__)
sys.path.append(
    os.path.dirname(os.path.dirname(os.path.dirname(current_py_file))))

# from elwm.datasets.ptb import to_text
import morphonets.datasets.joint_evaluation as test
from morphonets.datasets import zhwiki_corpus
from morphonets.knowledge import cc_phonology

# get dictionaries
id_pinyins = cc_phonology.get_id_pinyin_dic()
id_initials, id_finals, id_tones = cc_phonology.get_id_other_dic()
_, id_word_dic = zhwiki_corpus.get_word_id_dictionaries()
dictionaries = (id_word_dic, id_pinyins, id_initials, id_finals, id_tones)


class Dashboard(Callback):
    """Create a dashboard for monitoring loss, accuracy, etc. during the
    process of learning the embedding model.

    # Arguments
        folder: the folder for saving sample pictures and the statistics
            txt file.
        statistic_file: the txt file for saving loss and accuracy on the
            training, testing, and validation sets.
        model: an ANN model.
        watch_sample: whether to watch predictions on samples during
            training.
def generator(folder=ZHWIKI_FOLDER_CACHE, batch_size=5, max_token_length=5):
    """Generate batches of samples for training, testing, or validating
    language modeling on the Chinese Wikipedia (zhwiki) corpus.

    # Arguments
        folder: the folder which contains the cached zhwiki corpus.
        batch_size: the size of one batch of samples.
        max_token_length: the length of the corpus_id_sequence for each
            sample.

    # Returns
        tuple of Numpy arrays: `(input_sequences, target_sequences)`, where
        `target_sequences` is a list of five one-hot target arrays
        (character, pinyin, initial, final, tone).
    """
    # get the sequence of word ids for the whole corpus
    corpus_id_sequence = get_corpus_id_sequence(folder)

    # added by Robert Steven
    special_ids = get_special_ids()

    # get the size of the vocabulary
    character_dic, id_character_dic = get_character_id_dictionaries()
    character_set = set(character_dic.keys())
    word_dic, id_word_dic = get_word_id_dictionaries()

    # get phonology knowledge
    character_pinyin_dic = cc_phonology.get_pinyins_of_character()
    fine_grained_pinyin_dic = cc_phonology.get_knowledge_of_fine_grained_pinyin()
    pinyin_id = cc_phonology.get_pinyin_id_dic()
    pinyin_set = set(pinyin_id.keys())
    id_initials, id_finals, id_tones = cc_phonology.get_id_other_dic()

    # get morphology knowledge

    # initialize the index into the corpus
    start_idx = 0
    max_length = max_token_length * 4 + 1
    num_classes_words = len(word_dic)
    num_classes_characters = len(character_dic)
    num_classes_pinyins = len(pinyin_id)
    num_classes_initials = len(id_initials)
    num_classes_id_finals = len(id_finals)
    num_classes_tones = len(id_tones)

    # loop for generating batches of samples
    while True:
        # get a batch of samples
        # input_sequences = generate_batch_input_sequences(
        #     corpus_id_sequence, start_idx, batch_size, max_token_length)

        # get a batch of special samples
        input_sequences = generate_special_batch_input_sequences(
            corpus_id_sequence, start_idx, batch_size, max_token_length,
            special_ids)
        (target_sequences_character,
         target_sequences_pinyin,
         target_sequences_initial,
         target_sequences_final,
         target_sequences_tone) = generate_batch_target_sequences(
            input_sequences, character_set, character_dic, id_character_dic,
            word_dic, id_word_dic, pinyin_id, pinyin_set,
            character_pinyin_dic, fine_grained_pinyin_dic)

        # advance the index, wrapping around the end of the corpus
        end_idx = start_idx + 1
        if end_idx > len(corpus_id_sequence):
            start_idx = end_idx - len(corpus_id_sequence)
        else:
            start_idx = end_idx

        # note: the 'app' padding/truncating mode and the 3-argument
        # to_categorical suggest project-local helpers rather than the stock
        # Keras utilities
        input_sequences = pad_sequences(input_sequences, maxlen=max_length,
                                        padding='app', truncating='app')
        target_sequences_character = pad_sequences(
            target_sequences_character, maxlen=max_length,
            padding='app', truncating='app')
        target_sequences_pinyin = pad_sequences(
            target_sequences_pinyin, maxlen=max_length,
            padding='app', truncating='app')
        target_sequences_initial = pad_sequences(
            target_sequences_initial, maxlen=max_length,
            padding='app', truncating='app')
        target_sequences_final = pad_sequences(
            target_sequences_final, maxlen=max_length,
            padding='app', truncating='app')
        target_sequences_tone = pad_sequences(
            target_sequences_tone, maxlen=max_length,
            padding='app', truncating='app')

        input_sequences = to_tensor(input_sequences, max_length)
        target_sequences_character = to_categorical(
            target_sequences_character, max_length, num_classes_characters)
        target_sequences_pinyin = to_categorical(
            target_sequences_pinyin, max_length, num_classes_pinyins)
        target_sequences_initial = to_categorical(
            target_sequences_initial, max_length, num_classes_initials)
        target_sequences_final = to_categorical(
            target_sequences_final, max_length, num_classes_id_finals)
        target_sequences_tone = to_categorical(
            target_sequences_tone, max_length, num_classes_tones)

        # output a batch of samples
        yield (input_sequences, [
            target_sequences_character, target_sequences_pinyin,
            target_sequences_initial, target_sequences_final,
            target_sequences_tone
        ])
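
# A hedged sketch of how the generator above would typically be consumed:
# feeding it to Keras's fit_generator with the hyper-parameters produced by
# get_args(). `build_model` is a hypothetical factory returning a compiled
# five-output model that matches the target sequences yielded above; this is
# illustrative wiring, not the project's actual training script.
def train(args, build_model):
    model = build_model(args)
    model.fit_generator(
        generator(batch_size=args.batch_size,
                  max_token_length=args.max_token_length),
        steps_per_epoch=args.steps_per_epoch,
        epochs=args.epochs)
    model.save(args.model_file_train)
    return model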