def decode_lstm(load_from='lstm_model.npz'):
    # model_options and ptb_data are assumed to be stored in the archive as
    # pickled objects; .item() unwraps the 0-d object arrays numpy.load returns
    npz_archive = numpy.load(load_from)
    model_options = npz_archive['model_options'].item()
    ptb_data = npz_archive['ptb_data'].item()
    lstm_lm = LSTM_LM(model_options['dim_proj'], model_options['ydim'],
                      ptb_data.dictionary, SEED)
    print('Reloading params from %s' % load_from)
    load_params(load_from, lstm_lm.params)
    # Update the tparams with the new values
    zipp(lstm_lm.params, lstm_lm.tparams)
    print("model options", model_options)
    # Create the shared variables for the model
    lstm_lm.build_decode()

    test_sentences = ['with the', 'the cat', 'when the']
    test_sentences = [ptb_data.dictionary.read_sentence(s) for s in test_sentences]
    test_sentences, test_mask, _ = pad_and_mask(test_sentences)

    start_time = time.time()
    output = lstm_lm.f_decode(test_sentences, test_mask, model_options['maxlen'])
    end_time = time.time()
    print(ptb_data.dictionary.idx_to_words(output))
    print('Decoding took %.1fs' % (end_time - start_time))
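# `pad_and_mask` is defined elsewhere in this repo. A minimal sketch of the
# behavior the callers in this section assume: sequences are padded into a
# time-major (maxlen, n_samples) matrix with a matching {0, 1} mask, sequences
# longer than `maxlen` are truncated (for truncated backprop), and labels are
# passed through. The body below is an illustration under those assumptions,
# not the repo's implementation.
def pad_and_mask(seqs, labels=None, maxlen=None):
    lengths = [len(s) for s in seqs]
    if maxlen is not None:
        # Truncate anything longer than maxlen
        seqs = [s[:maxlen] for s in seqs]
        lengths = [len(s) for s in seqs]
    n_samples = len(seqs)
    max_len = max(lengths)
    x = numpy.zeros((max_len, n_samples)).astype('int64')
    mask = numpy.zeros((max_len, n_samples)).astype(theano.config.floatX)
    for i, s in enumerate(seqs):
        x[:lengths[i], i] = s
        mask[:lengths[i], i] = 1.
    return x, mask, labels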
def pred_cost(self, data, iterator, verbose=False):
    """Mean per-sample cost of a trained model on a dataset.

    data : the complete dataset; a list of lists, each nested list is a sample
    iterator : a list of (batch_index, sample_indices) pairs; each
        sample_indices list holds indices into data for one batch
    """
    # Total samples
    n_samples = len(data)
    running_cost = []
    samples_seen = []
    n_done = 0
    # valid_index is a list containing the indices of the samples in a batch
    for _, valid_index in iterator:
        x, mask, _ = pad_and_mask([data[t] for t in valid_index])
        # Accumulate the batch cost and the batch size
        samples_seen.append(len(valid_index))
        running_cost.append(self.f_cost(x, mask))
        n_done += len(valid_index)
        if verbose:
            print("%d/%d samples classified" % (n_done, n_samples))
    # Average the per-batch costs, weighted by batch size
    return sum(n * c for n, c in zip(samples_seen, running_cost)) / sum(samples_seen)
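# `pred_cost` above consumes `iterator` as (batch_index, sample_indices) pairs.
# A sketch of a `get_minibatches_idx` consistent with that contract; the real
# helper lives elsewhere in the repo, and the shuffle behavior here is an
# assumption:
def get_minibatches_idx(n, minibatch_size, shuffle=False):
    idx_list = numpy.arange(n, dtype="int64")
    if shuffle:
        numpy.random.shuffle(idx_list)
    minibatches = []
    start = 0
    for _ in range(n // minibatch_size):
        minibatches.append(idx_list[start:start + minibatch_size])
        start += minibatch_size
    if start != n:
        # Put the leftover samples in a final, smaller batch
        minibatches.append(idx_list[start:])
    return list(zip(range(len(minibatches)), minibatches))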
def pred_error(self, data, iterator, verbose=False):
    """Fraction of misclassified samples for a trained model."""
    valid_err = 0
    for _, valid_index in iterator:
        x, mask, y = pad_and_mask([data[0][t] for t in valid_index],
                                  numpy.array(data[1])[valid_index],
                                  maxlen=None)
        preds = self.f_pred(x, mask)
        targets = numpy.array(data[1])[valid_index]
        valid_err += (preds == targets).sum()
    # Convert the correct-prediction count into an error rate
    valid_err = 1. - numpy_floatX(valid_err) / len(data[0])
    return valid_err
def pred_probs(self, data, iterator, verbose=False):
    """Probabilities for new examples from a trained model."""
    n_samples = len(data[0])
    probs = numpy.zeros((n_samples, 2)).astype(theano.config.floatX)
    n_done = 0
    for _, valid_index in iterator:
        x, mask, y = pad_and_mask([data[0][t] for t in valid_index],
                                  numpy.array(data[1])[valid_index],
                                  maxlen=None)
        pred_probs = self.f_pred_prob(x, mask)
        probs[valid_index, :] = pred_probs
        n_done += len(valid_index)
        if verbose:
            print("%d/%d samples classified" % (n_done, n_samples))
    return probs
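# Both training entry points below move parameters between plain NumPy dicts
# and Theano shared variables via `zipp`/`unzip`, and reload checkpoints via
# `load_params`. A sketch consistent with how they are called here; the actual
# helpers are defined elsewhere in the repo:
from collections import OrderedDict

def zipp(params, tparams):
    # Push NumPy values into the Theano shared variables
    for kk, vv in params.items():
        tparams[kk].set_value(vv)

def unzip(zipped):
    # Pull current values out of the Theano shared variables
    new_params = OrderedDict()
    for kk, vv in zipped.items():
        new_params[kk] = vv.get_value()
    return new_params

def load_params(path, params):
    # Overwrite `params` in place with the arrays stored in an .npz archive
    pp = numpy.load(path)
    for kk in params.keys():
        if kk not in pp:
            raise Warning('%s is not in the archive' % kk)
        params[kk] = pp[kk]
    return params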
def train_lstm(
        dim_proj=650,             # word embedding / LSTM hidden dimension
        patience=10,              # validations to wait before early stopping
        max_epochs=5000,
        disp_freq=10,
        decay_c=0.,               # weight decay on the softmax weights U
        lrate=0.0001,
        n_words=10000,            # vocabulary size
        optimizer=adadelta,
        encoder='lstm',
        save_to='lstm_model.npz',
        load_from='lstm_model.96.npz',
        valid_freq=370,
        save_freq=1110,
        maxlen=35,                # truncated-backprop sequence length
        batch_size=20,
        valid_batch_size=64,
        dataset='../../data/simple-examples/data',
        noise_std=0.,
        use_dropout=True,
        reload_model=False,
):
    model_options = locals().copy()
    print("model options", model_options)

    print("... Loading data")
    ptb_data = ptb.PTB(dataset, n_words=n_words, emb_dim=model_options['dim_proj'])
    train, valid, test = ptb_data.load_data()
    print("... Done loading data")

    ydim = ptb_data.dictionary.n_words
    model_options['ydim'] = ydim

    print('Building model')
    # Create the initial parameters for the model
    lstm_lm = LSTM_LM(model_options['dim_proj'], ydim, ptb_data.dictionary, SEED)

    if reload_model:
        print('Reloading params from %s' % load_from)
        load_params(load_from, lstm_lm.params)
        # Update the tparams with the new values
        zipp(lstm_lm.params, lstm_lm.tparams)

    # Create the shared variables for the model
    (use_noise, x, mask, cost) = lstm_lm.build_model()

    if decay_c > 0.:
        cost += weight_decay(cost, lstm_lm.tparams['U'], decay_c)

    f_cost = theano.function([x, mask], cost, name='f_cost')
    grads = theano.grad(cost, wrt=list(lstm_lm.tparams.values()))
    f_grad = theano.function([x, mask], grads, name='f_grad')

    lr = T.scalar('lr')
    f_grad_shared, f_update = optimizer(lr, lstm_lm.tparams, grads, cost, x, mask)

    # Keep a few sentences to decode, to see how training is performing
    decode_use_noise, _, _, _ = lstm_lm.build_decode()
    decode_use_noise.set_value(1.)
    decode_sentences = ['with the', 'the cat', 'when the']
    decode_sentences = [ptb_data.dictionary.read_sentence(s) for s in decode_sentences]
    decode_sentences, decode_mask, _ = pad_and_mask(decode_sentences)

    print('Optimization')
    kf_valid = get_minibatches_idx(len(valid), valid_batch_size)
    kf_test = get_minibatches_idx(len(test), valid_batch_size)

    print('%d train examples' % len(train))
    print('%d valid examples' % len(valid))
    print('%d test examples' % len(test))

    history_errs = []
    best_p = None
    bad_counter = 0

    if valid_freq == -1:
        valid_freq = len(train) // batch_size
    if save_freq == -1:
        save_freq = len(train) // batch_size

    uidx = 0       # The number of updates done
    estop = False  # Early stop
    start_time = time.time()
    try:
        for eidx in range(max_epochs):
            n_samples = 0

            # Get shuffled index for the training set
            kf = get_minibatches_idx(len(train), batch_size, shuffle=True)

            for _, train_index in kf:
                uidx += 1
                use_noise.set_value(1.)

                # Select the random examples in this minibatch
                x = [train[t] for t in train_index]

                # Convert to shape (minibatch maxlen, n samples)
                # Truncated backprop
                x, mask, _ = pad_and_mask(x, maxlen=maxlen)
                n_samples += x.shape[1]

                cost = f_grad_shared(x, mask)
                f_update(lrate)

                if numpy.isnan(cost) or numpy.isinf(cost):
                    print('bad cost detected: ', cost)
                    return 1., 1., 1.

                if numpy.mod(uidx, disp_freq) == 0:
                    print('Epoch ', eidx, 'Update ', uidx, 'Cost ', cost)

                if save_to and numpy.mod(uidx, save_freq) == 0:
                    print('Saving...')
                    if best_p is not None:
                        lstm_lm.params = best_p
                    else:
                        lstm_lm.params = unzip(lstm_lm.tparams)
                    numpy.savez(save_to, history_errs=history_errs, **lstm_lm.params)
                    pickle.dump(model_options, open('%s.pkl' % save_to, 'wb'), -1)
                    print('Done')

                if numpy.mod(uidx, valid_freq) == 0:
                    use_noise.set_value(0.)
                    valid_cost = lstm_lm.pred_cost(valid, kf_valid)
                    test_cost = lstm_lm.pred_cost(test, kf_test)
                    history_errs.append([valid_cost, test_cost])

                    if (best_p is None or
                            valid_cost <= numpy.array(history_errs)[:, 0].min()):
                        best_p = unzip(lstm_lm.tparams)
                        bad_counter = 0

                    print('Valid ', valid_cost, 'Test ', test_cost)
                    print("Some sentences.. ")
                    print(ptb_data.dictionary.idx_to_words(
                        lstm_lm.f_decode(decode_sentences, decode_mask,
                                         model_options['maxlen'])))

                    if (len(history_errs) > patience and
                            valid_cost >= numpy.array(history_errs)[:-patience, 0].min()):
                        bad_counter += 1
                        if bad_counter > patience:
                            print('Early Stop')
                            estop = True
                            break

            print('Seen %d samples' % n_samples)
            if estop:
                break
    except KeyboardInterrupt:
        print('Training Interrupted')

    end_time = time.time()
    if best_p is not None:
        zipp(best_p, lstm_lm.tparams)
    else:
        best_p = unzip(lstm_lm.tparams)

    use_noise.set_value(0.)
    # Note that the training dataset is sorted by length.
    # This makes decoding faster, since padding creates smaller batch matrices.
    kf_train_sorted = get_minibatches_idx(len(train), batch_size)
    train_cost = lstm_lm.pred_cost(train, kf_train_sorted)
    valid_cost = lstm_lm.pred_cost(valid, kf_valid)
    test_cost = lstm_lm.pred_cost(test, kf_test)

    print('Train ', train_cost, 'Valid ', valid_cost, 'Test ', test_cost)
    if save_to:
        numpy.savez(save_to, train_cost=train_cost, valid_cost=valid_cost,
                    test_cost=test_cost, history_errs=history_errs, **best_p)
    print('The code ran for %d epochs, at %f sec/epoch' %
          ((eidx + 1), ((end_time - start_time) / (1. * (eidx + 1)))))
    print('Training took %.1fs' % (end_time - start_time))
    return train_cost, valid_cost, test_cost
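# A hypothetical smoke run of the PTB language-model trainer above; the epoch
# count and checkpoint path are illustrative assumptions, not repo defaults:
#
#     train_cost, valid_cost, test_cost = train_lstm(
#         max_epochs=5,               # short run, just to exercise the pipeline
#         save_to='lstm_model.npz',   # also used for periodic checkpoints
#     )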
def train_lstm(
        dim_proj=128,             # word embedding / LSTM hidden dimension
        patience=10,              # validations to wait before early stopping
        max_epochs=5000,
        disp_freq=10,
        decay_c=0.,               # weight decay on the softmax weights U
        lrate=0.0001,
        n_words=10000,            # vocabulary size
        optimizer=adadelta,
        encoder='lstm',
        save_to='lstm_model.npz',
        valid_freq=370,
        save_freq=1110,
        maxlen=100,               # longer reviews are filtered out
        batch_size=16,
        valid_batch_size=64,
        dataset='../../data/aclImdb',
        noise_std=0.,
        use_dropout=True,
        reload_model=None,
        test_size=-1,             # if > 0, evaluate on a random subset of the test set
):
    model_options = locals().copy()
    print("model options", model_options)

    imdb_data = imdb.IMDB(dataset, n_words=n_words, emb_dim=model_options['dim_proj'])
    train, valid, test = imdb_data.load_data(valid_portion=0.05, maxlen=maxlen)

    if test_size > 0:
        # Random shuffle of the test set
        idx = numpy.arange(len(test[0]))
        numpy.random.shuffle(idx)
        idx = idx[:test_size]
        test = ([test[0][n] for n in idx], [test[1][n] for n in idx])

    ydim = numpy.max(train[1]) + 1
    model_options['ydim'] = ydim

    print('Building model')
    # Create the initial parameters for the model
    lstm_lm = LSTM_LM(model_options['dim_proj'], ydim, imdb_data.dictionary, SEED)

    if reload_model:
        load_params('lstm_model.npz', lstm_lm.params)
        # Update the tparams with the new values
        zipp(lstm_lm.params, lstm_lm.tparams)

    # Create the shared variables for the model
    (use_noise, x, mask, y, cost) = lstm_lm.build_model()

    if decay_c > 0.:
        cost += weight_decay(cost, lstm_lm.tparams['U'], decay_c)

    f_cost = theano.function([x, mask, y], cost, name='f_cost')
    grads = theano.grad(cost, wrt=list(lstm_lm.tparams.values()))
    f_grad = theano.function([x, mask, y], grads, name='f_grad')

    lr = T.scalar('lr')
    f_grad_shared, f_update = optimizer(lr, lstm_lm.tparams, grads, cost, x, mask, y)

    print('Optimization')
    kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size)
    kf_test = get_minibatches_idx(len(test[0]), valid_batch_size)

    print('%d train examples' % len(train[0]))
    print('%d valid examples' % len(valid[0]))
    print('%d test examples' % len(test[0]))

    history_errs = []
    best_p = None
    bad_counter = 0

    if valid_freq == -1:
        valid_freq = len(train[0]) // batch_size
    if save_freq == -1:
        save_freq = len(train[0]) // batch_size

    uidx = 0       # The number of updates done
    estop = False  # Early stop
    start_time = time.time()
    try:
        for eidx in range(max_epochs):
            n_samples = 0

            # Get shuffled index for the training set
            kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True)

            for _, train_index in kf:
                uidx += 1
                use_noise.set_value(1.)

                # Select the random examples in this minibatch
                y = [train[1][t] for t in train_index]
                x = [train[0][t] for t in train_index]

                # Convert to shape (minibatch maxlen, n samples)
                x, mask, y = pad_and_mask(x, y)
                n_samples += x.shape[1]

                cost = f_grad_shared(x, mask, y)
                f_update(lrate)

                if numpy.isnan(cost) or numpy.isinf(cost):
                    print('bad cost detected: ', cost)
                    return 1., 1., 1.

                if numpy.mod(uidx, disp_freq) == 0:
                    print('Epoch ', eidx, 'Update ', uidx, 'Cost ', cost)

                if save_to and numpy.mod(uidx, save_freq) == 0:
                    print('Saving...')
                    if best_p is not None:
                        lstm_lm.params = best_p
                    else:
                        lstm_lm.params = unzip(lstm_lm.tparams)
                    numpy.savez(save_to, history_errs=history_errs, **lstm_lm.params)
                    pickle.dump(model_options, open('%s.pkl' % save_to, 'wb'), -1)
                    print('Done')

                if numpy.mod(uidx, valid_freq) == 0:
                    use_noise.set_value(0.)
                    train_err = lstm_lm.pred_error(train, kf)
                    valid_err = lstm_lm.pred_error(valid, kf_valid)
                    test_err = lstm_lm.pred_error(test, kf_test)
                    history_errs.append([valid_err, test_err])

                    if (best_p is None or
                            valid_err <= numpy.array(history_errs)[:, 0].min()):
                        best_p = unzip(lstm_lm.tparams)
                        bad_counter = 0

                    print('Train ', train_err, 'Valid ', valid_err, 'Test ', test_err)

                    if (len(history_errs) > patience and
                            valid_err >= numpy.array(history_errs)[:-patience, 0].min()):
                        bad_counter += 1
                        if bad_counter > patience:
                            print('Early Stop')
                            estop = True
                            break

            print('Seen %d samples' % n_samples)
            if estop:
                break
    except KeyboardInterrupt:
        print('Training Interrupted')

    end_time = time.time()
    if best_p is not None:
        zipp(best_p, lstm_lm.tparams)
    else:
        best_p = unzip(lstm_lm.tparams)

    use_noise.set_value(0.)
    kf_train_sorted = get_minibatches_idx(len(train[0]), batch_size)
    train_err = lstm_lm.pred_error(train, kf_train_sorted)
    valid_err = lstm_lm.pred_error(valid, kf_valid)
    test_err = lstm_lm.pred_error(test, kf_test)

    print('Train ', train_err, 'Valid ', valid_err, 'Test ', test_err)
    if save_to:
        numpy.savez(save_to, train_err=train_err, valid_err=valid_err,
                    test_err=test_err, history_errs=history_errs, **best_p)
    print('The code ran for %d epochs, at %f sec/epoch' %
          ((eidx + 1), ((end_time - start_time) / (1. * (eidx + 1)))))
    print('Training took %.1fs' % (end_time - start_time))
    return train_err, valid_err, test_err
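# A hypothetical entry point for the IMDB classifier above: train briefly, then
# report the error rates it returns. The epoch count and test subset size are
# illustrative assumptions, not the repo's defaults.
if __name__ == '__main__':
    train_err, valid_err, test_err = train_lstm(
        max_epochs=3,    # short run, just to exercise the pipeline
        test_size=500,   # evaluate on a random 500-review subset
    )
    print('Error rates -- train: %f, valid: %f, test: %f'
          % (train_err, valid_err, test_err))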