def start_denotation(opts):
    """Train an ``RNNRTE`` model via ``SGD.run_denotation`` and checkpoint it.

    Reads ``opts.dataset``, ``opts.data``, ``opts.wvecDim``, ``opts.outputDim``,
    ``opts.minibatch``, ``opts.step``, ``opts.optimizer``, ``opts.epochs`` and
    ``opts.outFile``. Sets ``opts.numWords`` to the size of the loaded word map.

    After every epoch ``opts.outFile`` is overwritten with, in order: the
    pickled ``opts``, the pickled ``sgd.costt`` and whatever ``rnn.toFile``
    writes to the same file object.
    """
    print("Loading data...")
    opts.numWords = len(tr.loadWordMap(opts.dataset))
    print("vocab size: %d" % opts.numWords)

    print("Loading word2vec vectors...")
    # Pre-built word matrix; only element 0 of the unpickled object is used.
    # NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
    with open("mr_%s.p" % opts.dataset, "rb") as matrix_file:
        W = pickle.load(matrix_file)[0]

    # NOTE(review): the literal 200 is presumably the embedding dimension
    # (start() carries an "embeddingDim=200 for now" remark) -- confirm.
    rnn = nnet_rte.RNNRTE(opts.wvecDim, opts.outputDim, 200, opts.numWords, opts.minibatch)
    rnn.initParams(W)

    sgd = optimizer.SGD(rnn, alpha=opts.step, minibatch=opts.minibatch,
                        optimizer=opts.optimizer)

    last_epoch = opts.epochs - 1
    for e in range(opts.epochs):
        epoch_start = time.time()
        print("Running epoch %d" % e)
        # The lines are re-read each epoch, exactly as before.
        sgd.run_denotation(tr.get_lines(opts.dataset, opts.data))
        print("Time per epoch : %f" % (time.time() - epoch_start))

        # Checkpoint after every epoch. Binary mode: the payload is pickled data,
        # which text mode can corrupt on platforms that translate newlines.
        with open(opts.outFile, "wb") as fid:
            pickle.dump(opts, fid)
            pickle.dump(sgd.costt, fid)
            # The final epoch passes an extra True flag to toFile
            # (tagged "debug" in the original code).
            if e == last_epoch:
                rnn.toFile(fid, True)
            else:
                rnn.toFile(fid)
def test_denotation(netFile, data, dataset):
    """Evaluate a saved ``RNNRTE`` model and print its cost and accuracy.

    Args:
        netFile: path of a model file holding, in order, pickled options, a
            second pickled object (``sgd.costt`` in the training routines)
            and the network state read by ``rnn.fromFile``.
        data: data split identifier forwarded to ``tr.get_lines``.
        dataset: dataset name; selects ``mr_<dataset>.p`` and is forwarded to
            the ``tr`` helpers.

    Raises:
        AssertionError: if ``netFile`` is None.
    """
    assert netFile is not None, "Must give model to test"
    CHUNK_SIZE = 10000

    # NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
    with open(netFile, "rb") as fid:
        opts = pickle.load(fid)
        _ = pickle.load(fid)  # second pickled object is not needed for testing

        with open("mr_%s.p" % dataset, "rb") as matrix_file:
            W = pickle.load(matrix_file)[0]

        rnn = nnet_rte.RNNRTE(opts.wvecDim, opts.outputDim, 200, opts.numWords,
                              opts.minibatch)
        rnn.initParams(W)
        rnn.fromFile(fid)

    # Evaluate on the split the caller asked for. The options unpickled from the
    # model file describe the training run, so using opts.dataset/opts.data here
    # silently ignored the ``data`` argument.
    lines = tr.get_lines(dataset, data)
    m = len(lines)

    print("Testing...")
    cost = correct = total = 0
    # Step through *all* lines; the final chunk may be shorter than CHUNK_SIZE.
    # (The previous bound ``m - CHUNK_SIZE + 1`` dropped the trailing partial chunk.)
    for i in range(0, m, CHUNK_SIZE):
        # Parse the trees for this chunk, then map word indices onto them.
        trees = list(tr.inputarray(lines[i:i + CHUNK_SIZE], dataset))
        tr.map_words_to_trees(trees, dataset)

        c, cor, tot = rnn.costAndGrad(trees, test=True)
        cost += c
        correct += cor
        total += tot
        print("tested: %d" % i)

    if total == 0:
        # Avoid ZeroDivisionError when there was nothing to evaluate.
        print("Cost %f, Correct %d/%d, no examples evaluated" % (cost, correct, total))
        return

    print("Cost %f, Correct %d/%d, Acc %f" % (cost, correct, total, correct / float(total)))
def start(opts):
    """Train an ``RNNRTE`` model on parsed trees and checkpoint it every epoch.

    When ``opts.use_denotation == 0`` a single model is trained with
    ``sgd.run(trees)``. Otherwise a second ``RNNRTE`` (output dimension 2) is
    initialised from ``mr_<opts.dg_dataset>.p``, handed to the optimizer as
    ``model_dg``, and training uses ``sgd.run_using_denotation(trees, lines)``.

    Sets ``opts.numWords`` to the size of the loaded word map. After every
    epoch ``opts.outFile`` is overwritten with the pickled ``opts``, the
    pickled ``sgd.costt`` and whatever ``rnn.toFile`` writes.
    """
    print("Loading data...")
    trees, _vocab = tr.loadTrees(opts.dataset, opts.data)  # e.g. sick, train_parsed
    opts.numWords = len(tr.loadWordMap(opts.dataset))

    print("Loading word2vec vectors...")
    # Pre-built word matrix; only element 0 of the unpickled object is used.
    # NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
    with open("mr_%s.p" % opts.dataset, "rb") as matrix_file:
        W = pickle.load(matrix_file)[0]

    # Randomly initialised alternative to W. It is not used below; pass it to
    # initParams instead of W for experiments with random word vectors.
    W2 = 0.01 * np.random.randn(opts.wvecDim, opts.numWords)

    plain_training = opts.use_denotation == 0

    # embeddingDim=200 for now
    rnn = nnet_rte.RNNRTE(opts.wvecDim, opts.outputDim, 200, opts.numWords, opts.minibatch)
    if plain_training:
        rnn.initParams(W)
        sgd = optimizer.SGD(rnn, alpha=opts.step, minibatch=opts.minibatch,
                            optimizer=opts.optimizer)
    else:
        # A dead string-literal block (an older variant that restored a model
        # from 'models/denotation_sample.bin') was removed from this branch.
        rnn.initParams(W, True)

        with open("mr_%s.p" % opts.dg_dataset, "rb") as dg_matrix_file:
            W_dg = pickle.load(dg_matrix_file)[0]
        rnn_dg = nnet_rte.RNNRTE(opts.wvecDim, 2, 200, opts.numWords, opts.minibatch)
        rnn_dg.initParams(W_dg, True)

        sgd = optimizer.SGD(rnn, alpha=opts.step, minibatch=opts.minibatch,
                            optimizer=opts.optimizer, model_dg=rnn_dg)

    last_epoch = opts.epochs - 1
    for e in range(opts.epochs):
        # Named epoch_start so the local does not shadow this function's own name.
        epoch_start = time.time()
        print("Running epoch %d" % e)
        if plain_training:
            sgd.run(trees)
        else:
            # The lines are re-read each epoch, exactly as before.
            lines = tr.get_lines(opts.dg_dataset, opts.data)
            sgd.run_using_denotation(trees, lines)
        print("Time per epoch : %f" % (time.time() - epoch_start))

        # Checkpoint after every epoch. Binary mode: the payload is pickled data,
        # which text mode can corrupt on platforms that translate newlines.
        with open(opts.outFile, "wb") as fid:
            pickle.dump(opts, fid)
            pickle.dump(sgd.costt, fid)
            # The final epoch passes an extra True flag to toFile
            # (tagged "debug" in the original code).
            if e == last_epoch:
                rnn.toFile(fid, True)
            else:
                rnn.toFile(fid)