from models import NCELangModel
import os
import re
import logging

import numpy as np

logging.basicConfig(level=logging.DEBUG)

# Collect the fake test shards whose names match three digits followed by .bz2.
trn_regex = re.compile(r'\d{3}\.bz2')
dir_ = 'data/fake/test'
train_files = [os.path.join(dir_, f) for f in os.listdir(dir_) if trn_regex.match(f)]

# Load one shard of token indices and prepare a single input batch for the model.
X = np.loadtxt(train_files[0], dtype='int32')
model = NCELangModel(vocab_size=15, nb_negative=2, embed_dims=128)
ins, _ = model.prepare_input(X, 0, None)
data = {model.input['idxes']: ins[0]}
model.compile()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = 'Yunchuan Chen'

from utils import get_unigram_probtable
from models import NCELangModel
from keras.optimizers import adam

# Hyper-parameters for training the NCE language model on the Wikipedia corpus.
NB_RUN_WORDS = 100000000
NB_VOCAB = 10000
NB_RUN_VAL = 100000
NB_EVALUATE = 5000000
SAVE_PATH = '../data/models/lang/nce0-neg50-e128-c128-lr0.01.pkl'
DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2'
BATCH_SIZE = 256
VAL_INTER = 1200

unigram_table = get_unigram_probtable(nb_words=NB_VOCAB)
opt = adam(lr=0.01)

model = NCELangModel(vocab_size=NB_VOCAB, nb_negative=50, embed_dims=128,
                     context_dims=128, negprob_table=unigram_table, optimizer=opt)
model.compile()
model.train(data_file=DATA_PATH, save_path=SAVE_PATH,
            batch_size=BATCH_SIZE, train_nb_words=NB_RUN_WORDS,
            val_nb_words=NB_EVALUATE, train_val_nb=NB_RUN_VAL,
            validation_interval=VAL_INTER)
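# Sketch only: get_unigram_probtable is project code whose internals are not
# shown here. The helper below illustrates, under that assumption, the usual way
# a unigram noise table for NCE negative sampling is built from word counts:
# counts are normalized to probabilities, optionally flattened by an exponent.
import numpy as np

def unigram_noise_table(word_counts, power=1.0):
    """Turn raw unigram counts into a normalized noise distribution."""
    counts = np.asarray(word_counts, dtype='float64') ** power
    return counts / counts.sum()

# Example: a 5-word vocabulary with skewed counts.
# unigram_noise_table([50, 20, 15, 10, 5]) -> array([0.5, 0.2, 0.15, 0.1, 0.05])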
logging.info('Train simple language model')
model = SimpleLangModel(vocab_size=15, embed_dims=128, context_dims=128,
                        optimizer=options.optimizer)
model.compile()
model.train_from_dir(data_path, validation_split=0.05,
                     batch_size=options.batch_size, verbose=options.verbose)

if options.train_nce:
    logging.info('Train NCE based language model')
    model = NCELangModel(vocab_size=15, nb_negative=2, embed_dims=128,
                         negprob_table=negprob_table, optimizer=options.optimizer)
    model.compile()
    logging.debug('compile success')
    model.train_from_dir(data_path, validation_split=0.05,
                         batch_size=options.batch_size, verbose=options.verbose)

if options.train_nce1:
    logging.info('Train NCE based language model (1)')
    model = NCELangModelV1(vocab_size=15, nb_negative=6, embed_dims=128,
                           negprob_table=negprob_table,
if options.decay:
    opt = AdamAnneal(lr=options.lr, lr_min=options.lr_min, gamma=options.gamma)
else:
    opt = adam(lr=options.lr)

if options.log_file == '':
    log_file = None
else:
    log_file = options.log_file

if options.save == '':
    save_path = None
else:
    save_path = options.save

model = NCELangModel(vocab_size=nb_vocab, nb_negative=options.negative,
                     embed_dims=options.embed_size, context_dims=options.context_size,
                     negprob_table=unigram_table, optimizer=opt)
model.compile()
model.train(data_file=DATA_PATH, save_path=save_path, batch_size=BATCH_SIZE,
            train_nb_words=nb_run_words, val_nb_words=nb_evaluate,
            train_val_nb=nb_run_val, validation_interval=options.interval,
            log_file=log_file)
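# Sketch only: AdamAnneal is the project's optimizer and its exact schedule is
# not shown here. The function below is one common interpretation of
# (lr, lr_min, gamma) -- exponential decay of the learning rate with a floor --
# and is an assumption used purely for illustration.
def annealed_lr(step, lr, lr_min, gamma):
    """Decay lr by a factor gamma each step, never dropping below lr_min."""
    return max(lr * (gamma ** step), lr_min)

# Example: annealed_lr(0, 0.01, 0.001, 0.9) == 0.01
#          annealed_lr(30, 0.01, 0.001, 0.9) == 0.001  (clipped at lr_min)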
nb_evaluate = options.nb_evaluation

# unigram_table = get_unigram_probtable(nb_words=nb_vocab)
unigram_table = get_unigram_probtable(nb_words=nb_vocab,
                                      save_path='../data/wiki-unigram-prob-size%d.pkl' % nb_vocab)

if options.decay:
    opt = AdamAnneal(lr=options.lr, lr_min=options.lr_min, gamma=options.gamma)
else:
    opt = adam(lr=options.lr)

if options.log_file == '':
    log_file = None
else:
    log_file = options.log_file

if options.save == '':
    save_path = None
else:
    save_path = options.save

model = NCELangModel(vocab_size=nb_vocab, nb_negative=options.negative,
                     embed_dims=options.embed_size, context_dims=options.context_size,
                     negprob_table=unigram_table, optimizer=opt)
model.compile()
model.train(data_file=DATA_PATH, save_path=save_path, batch_size=BATCH_SIZE,
            train_nb_words=nb_run_words, val_nb_words=nb_evaluate,
            train_val_nb=nb_run_val, validation_interval=options.interval,
            log_file=log_file)