nb_run_words = options.running_words
nb_run_val = options.val_run
nb_evaluate = options.nb_evaluation
embedding_file = options.embedding_file

# Load the pre-computed sparse coding matrix for the vocabulary.
with open(options.coding_file, 'rb') as f:
    sparse_coding = pickle.load(f)
# print sparse_coding.dtype

nb_vocab = options.nb_vocab
sparse_coding = sparse_coding[nb_vocab // 1000]
nb_vocab, nb_base = sparse_coding.shape
nb_base -= 1

# Unigram noise distribution used for NCE negative sampling.
unigram_table = get_unigram_probtable(
    nb_words=nb_vocab,
    save_path='../data/wiki-unigram-prob-size%d.pkl' % nb_vocab)

if embedding_file != '':
    # Initialize the embedding layer from pre-trained word2vec vectors.
    with open('../data/wiki-wordmap-trunc300k.wp', 'rb') as f:
        wp = pickle.load(f)
    freq = wp['idx2wc']
    logger.info('Using word2vec to initialize word embeddings %s' % embedding_file)
    ini_embeds = [compose_dense_repr(nb_base, nb_vocab, freq, embedding_file)]
else:
    ini_embeds = None

if options.decay:
    opt = AdamAnneal(lr=options.lr, lr_min=options.lr_min, gamma=options.gamma)
else:
    opt = adam(lr=options.lr)
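# --- Hedged sketch: a plausible implementation of get_unigram_probtable ---
# Assumption: get_unigram_probtable (defined in utils.py) builds a normalized
# unigram noise distribution over the nb_words most frequent words and caches
# it at save_path. The 0.75 smoothing exponent follows common practice for
# negative-sampling noise distributions; the real implementation may differ.
import cPickle as pickle
import os
import numpy as np

def unigram_probtable_sketch(word_counts, nb_words, save_path=None, power=0.75):
    """Return a noise distribution over the nb_words most frequent words."""
    if save_path is not None and os.path.exists(save_path):
        with open(save_path, 'rb') as f:
            return pickle.load(f)
    counts = np.asarray(word_counts[:nb_words], dtype='float64')
    table = counts ** power          # smooth the raw counts
    table /= table.sum()             # normalize into a probability table
    if save_path is not None:
        with open(save_path, 'wb') as f:
            pickle.dump(table, f, protocol=-1)
    return table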
help="amount of training data (number of words)") parser.add_option("-l", "--log-file", type="str", dest="log_file", default='', help="amount of training data (number of words)") parser.add_option("-r", "--report-interval", type="float", dest="interval", default=1200., help="decaying rate") parser.add_option("-s", "--save", type="str", dest="save", default='', help="amount of training data (number of words)") options, args = parser.parse_args() nb_run_words = options.running_words nb_vocab = options.vocab_size nb_run_val = options.val_run nb_evaluate = options.nb_evaluation unigram_table = get_unigram_probtable(nb_words=nb_vocab, save_path='../data/wiki-unigram-prob-size%d.pkl' % nb_vocab) if options.decay: opt = AdamAnneal(lr=options.lr, lr_min=options.lr_min, gamma=options.gamma) else: opt = adam(lr=options.lr) if options.log_file == '': log_file = None else: log_file = options.log_file if options.save == '': save_path = None else: save_path = options.save
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = 'Yunchuan Chen'

from utils import get_unigram_probtable
from models import NCELangModel
from keras.optimizers import adam

NB_RUN_WORDS = 100000000   # total number of training words to consume
NB_VOCAB = 10000           # vocabulary size
NB_RUN_VAL = 100000        # passed as train_val_nb below
NB_EVALUATE = 5000000      # passed as val_nb_words below
SAVE_PATH = '../data/models/lang/nce0-neg50-e128-c128-lr0.01.pkl'
DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2'
BATCH_SIZE = 256
VAL_INTER = 1200           # validation_interval passed to model.train

# Noise distribution for NCE negative sampling.
unigram_table = get_unigram_probtable(nb_words=NB_VOCAB)
opt = adam(lr=0.01)
model = NCELangModel(vocab_size=NB_VOCAB, nb_negative=50, embed_dims=128,
                     context_dims=128, negprob_table=unigram_table, optimizer=opt)
model.compile()
model.train(data_file=DATA_PATH, save_path=SAVE_PATH, batch_size=BATCH_SIZE,
            train_nb_words=NB_RUN_WORDS, val_nb_words=NB_EVALUATE,
            train_val_nb=NB_RUN_VAL, validation_interval=VAL_INTER)
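# --- Hedged sketch: the NCE objective NCELangModel presumably optimizes ---
# Assumption: the model trains with standard noise-contrastive estimation
# (Gutmann & Hyvarinen; Mnih & Teh, 2012), classifying the true next word
# against nb_negative noise words drawn from negprob_table. s_pos is the model
# score of the true word, s_neg the scores of the k noise words, and
# p_pos/p_neg their noise probabilities. The real implementation in models.py
# is symbolic; this numpy version is only illustrative.
import numpy as np

def nce_loss_sketch(s_pos, s_neg, p_pos, p_neg, k):
    """Negative log-probability of labeling data as data and noise as noise."""
    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))
    # P(data | w) = sigmoid(score(w) - log(k * Pn(w)))
    pos_term = np.log(sigmoid(s_pos - np.log(k * p_pos)))
    neg_term = np.sum(np.log(1.0 - sigmoid(s_neg - np.log(k * p_neg))))
    return -(pos_term + neg_term)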