Exemplo n.º 1
0
nb_run_words = options.running_words
nb_run_val = options.val_run
nb_evaluate = options.nb_evaluation
embedding_file = options.embedding_file

with file(options.coding_file, 'rb') as f:
    sparse_coding = pickle.load(f)
    # print sparse_coding.dtype

nb_vocab = options.nb_vocab
sparse_coding = sparse_coding[nb_vocab // 1000]
nb_vocab, nb_base = sparse_coding.shape
nb_base -= 1
unigram_table = get_unigram_probtable(
    nb_words=nb_vocab,
    save_path='../data/wiki-unigram-prob-size%d.pkl' % nb_vocab)

if embedding_file != '':
    with file('../data/wiki-wordmap-trunc300k.wp', 'rb') as f:
        wp = pickle.load(f)
    freq = wp['idx2wc']
    logger.info('Using word2vec to initialize word embeddings %s ' %
                embedding_file)
    ini_embeds = [compose_dense_repr(nb_base, nb_vocab, freq, embedding_file)]
else:
    ini_embeds = None

if options.decay:
    opt = AdamAnneal(lr=options.lr, lr_min=options.lr_min, gamma=options.gamma)
else:
Exemplo n.º 2
0
                  help="amount of training data (number of words)")
parser.add_option("-l", "--log-file", type="str", dest="log_file", default='',
                  help="amount of training data (number of words)")
parser.add_option("-r", "--report-interval", type="float", dest="interval", default=1200.,
                  help="decaying rate")
parser.add_option("-s", "--save", type="str", dest="save", default='',
                  help="amount of training data (number of words)")
options, args = parser.parse_args()

nb_run_words = options.running_words
nb_vocab = options.vocab_size
nb_run_val = options.val_run
nb_evaluate = options.nb_evaluation

unigram_table = get_unigram_probtable(nb_words=nb_vocab,
                                      save_path='../data/wiki-unigram-prob-size%d.pkl' %
                                                nb_vocab)
if options.decay:
    opt = AdamAnneal(lr=options.lr, lr_min=options.lr_min, gamma=options.gamma)
else:
    opt = adam(lr=options.lr)

if options.log_file == '':
    log_file = None
else:
    log_file = options.log_file

if options.save == '':
    save_path = None
else:
    save_path = options.save
Exemplo n.º 3
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = 'Yunchuan Chen'

from utils import get_unigram_probtable
from models import NCELangModel
from keras.optimizers import adam

NB_RUN_WORDS = 100000000
NB_VOCAB = 10000
NB_RUN_VAL = 100000
NB_EVALUATE = 5000000
SAVE_PATH = '../data/models/lang/nce0-neg50-e128-c128-lr0.01.pkl'

DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2'
BATCH_SIZE = 256
VAL_INTER = 1200

unigram_table = get_unigram_probtable(nb_words=NB_VOCAB)

opt = adam(lr=0.01)
model = NCELangModel(vocab_size=NB_VOCAB, nb_negative=50, embed_dims=128, context_dims=128,
                     negprob_table=unigram_table, optimizer=opt)
model.compile()
model.train(data_file=DATA_PATH,
            save_path=SAVE_PATH,
            batch_size=BATCH_SIZE, train_nb_words=NB_RUN_WORDS,
            val_nb_words=NB_EVALUATE, train_val_nb=NB_RUN_VAL, validation_interval=VAL_INTER)