import os
import shutil
import sys

# project-local modules
import corpus
import rnn
import seq2seq
import util
import vocab


def main(args):
    # create model destination
    if not os.path.exists(args.model):
        os.makedirs(args.model)
    vocab_dest = os.path.join(args.model, 'vocab')
    print >> sys.stderr, 'Copying vocabulary to {}'.format(vocab_dest)
    shutil.copy(args.vocab, vocab_dest)

    # determine vocabulary size
    print >> sys.stderr, 'Loading vocabulary from {}'.format(args.vocab)
    voc = vocab.Vocab.load(args.vocab)
    vocab_size = voc.size()
    if args.vocab_size is not None:
        vocab_size = min(vocab_size, args.vocab_size)
    print >> sys.stderr, 'Vocabulary size: {}'.format(vocab_size)

    # create sequence-to-sequence model: the encoder reads the source
    # sequence without emitting output; the decoder generates the target
    encoder = rnn.Rnn(emb_dim=args.emb, vocab_size=vocab_size,
                      layers=args.hidden, suppress_output=True,
                      lstm=args.lstm)
    decoder = rnn.Rnn(emb_dim=args.emb, vocab_size=vocab_size,
                      layers=args.hidden, suppress_output=False,
                      lstm=args.lstm)
    s2s = seq2seq.Seq2Seq(encoder, decoder)

    # load corpus
    print >> sys.stderr, 'Loading training data from {}'.format(args.data)
    c = corpus.load_corpus(args.data, max_len=args.max_len)

    # create batches
    print >> sys.stderr, 'Creating batches...'
    batches = corpus.create_batches(c, batch_size=args.batch,
                                    shuffle=not args.no_shuffle,
                                    max_vocab_size=vocab_size)

    # train (_get_status is a status-reporting callback assumed to be
    # defined elsewhere in this module)
    print >> sys.stderr, 'Training started.'
    optimizer = util.list2optimizer(args.optim)
    util.train(s2s, batches, optimizer, args.model, max_epoch=None,
               gpu=args.gpu, save_every=args.save_every,
               get_status=_get_status)
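# NOTE: main() above takes a parsed-arguments object; below is a minimal
# sketch of the argparse entry point it implies. Every flag name, type, and
# default here is an assumption inferred from the attributes main() reads,
# not the script's actual interface.
def _parse_args():
    import argparse
    p = argparse.ArgumentParser(description='Train a seq2seq model.')
    p.add_argument('--model', required=True, help='model output directory')
    p.add_argument('--vocab', required=True, help='vocabulary file')
    p.add_argument('--vocab-size', type=int, default=None,
                   help='cap on the vocabulary size')
    p.add_argument('--emb', type=int, default=256, help='embedding dimension')
    p.add_argument('--hidden', type=int, nargs='+', default=[256],
                   help='hidden layer size(s)')
    p.add_argument('--lstm', action='store_true', help='use LSTM units')
    p.add_argument('--data', required=True, help='training corpus')
    p.add_argument('--max-len', type=int, default=None,
                   help='drop examples longer than this')
    p.add_argument('--batch', type=int, default=64, help='batch size')
    p.add_argument('--no-shuffle', action='store_true',
                   help='keep the corpus order when batching')
    p.add_argument('--optim', nargs='+', default=['SGD'],
                   help='optimizer name followed by its parameters')
    p.add_argument('--gpu', type=int, default=-1, help='GPU id (-1 for CPU)')
    p.add_argument('--save-every', type=int, default=1,
                   help='checkpoint interval in epochs')
    return p.parse_args()


if __name__ == '__main__':
    main(_parse_args())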
import corpus


def test_load_corpus(corpus_path):
    cps = corpus.load_corpus(corpus_path)
    # batch_size=3 over the test corpus should yield exactly two batches
    batches = corpus.create_batches(cps, 3)
    assert len(batches) == 2
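# NOTE: the test above expects a corpus_path fixture (pytest style). A
# hypothetical conftest.py fixture that would satisfy the assertion (4 to 6
# examples, so batch_size=3 yields exactly 2 batches) is sketched below; the
# one-pair-per-line, tab-separated corpus format is an assumption.
import pytest


@pytest.fixture
def corpus_path(tmpdir):
    pairs = ['hello\tbonjour', 'cat\tchat', 'dog\tchien',
             'good morning\tbonjour', 'thank you\tmerci']
    path = tmpdir.join('corpus.txt')
    path.write('\n'.join(pairs) + '\n')
    return str(path)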