def to_file(self, fout):
    pickle.dump(self.iters, fout)
    pickle.dump(self.costs, fout)
    pickle.dump(self.expcosts, fout)
    if self.mom > 0:
        pickle.dump([as_np(self.vel[k]) for k in self.model.param_keys], fout)
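# A matching loader is implied by the dump order above. The following
# from_file is a hypothetical sketch (not from the source); it assumes the
# same pickle layout and that array() moves data back onto the device.
def from_file(self, fin):
    self.iters = pickle.load(fin)
    self.costs = pickle.load(fin)
    self.expcosts = pickle.load(fin)
    if self.mom > 0:
        vels = pickle.load(fin)
        for k, v in zip(self.model.param_keys, vels):
            self.vel[k] = array(v)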
def sample_continuation(s, model, order, alpha=1.0):
    if MODEL_TYPE == 'rnn':
        data = array(np.array([char_inds[w] for w in s[-1:]])).reshape(-1, 1)
    else:
        data = array(np.array([char_inds[w] for w in s[-order+1:]])).reshape(-1, 1)
    data = one_hot(data, model.hps.output_size)
    if MODEL_TYPE == 'rnn':
        _, probs = model.cost_and_grad(data, None, prev_h0=model.last_h)
        probs = np.squeeze(as_np(probs))
    else:
        data = data.reshape((-1, data.shape[2]))
        _, probs = model.cost_and_grad(data, None)
        probs = probs.ravel()
    # Higher alpha -> more and more like most likely sequence
    probs = probs ** alpha
    probs = probs / sum(probs)
    w = np.random.choice(range(model.hps.output_size), p=probs)
    char = chars[w]
    return char
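# Usage sketch (hypothetical helper, not from the source): longer samples
# come from repeating the single-character step above, feeding each drawn
# character back into the context.
def sample_string(seed, model, order, n_chars=100, alpha=1.0):
    s = list(seed)
    for _ in xrange(n_chars):
        s.append(sample_continuation(s, model, order, alpha=alpha))
    return ''.join(s)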
def cost_and_grad(self, data, labels, back=True, prev_h0=None):
    hps = self.hps
    T = data.shape[1]
    bsize = data.shape[2]

    # FIXME gnumpy reallocates if try and use same parameters?
    #us = self.us[:, 0:T, 0:bsize]
    #dus = self.dus[:, 0:T, 0:bsize]
    #hs = self.hs[:, 0:T, 0:bsize]
    #dhs = self.dhs[:, 0:T, 0:bsize]
    #probs = self.probs[:, 0:T, 0:bsize]
    #dprobs = self.dprobs[:, 0:T, 0:bsize]
    #costs = self.costs[0:T, 0:bsize]

    us = list()
    dus = list()
    hs = list()
    dhs = list()
    h0 = list()
    for k in xrange(hps.hidden_layers):
        us.append(list())
        dus.append(list())
        hs.append(list())
        dhs.append(list())
        h0.append(empty((hps.hidden_size, bsize)))
        for t in xrange(T):
            us[k].append(zeros((hps.hidden_size, bsize)))
            dus[k].append(zeros((hps.hidden_size, bsize)))
            hs[k].append(zeros((hps.hidden_size, bsize)))
            dhs[k].append(zeros((hps.hidden_size, bsize)))
    probs = list()
    for t in xrange(T):
        probs.append(zeros((hps.output_size, bsize)))
    costs = np.zeros((T, bsize))

    if prev_h0 is not None:
        h0 = prev_h0
    else:
        for k in xrange(hps.hidden_layers):
            h0[k] = tile(self.params['h0'][:, k].reshape(-1, 1), bsize)

    bih = self.params['bih']
    Wih = self.params['Wih']
    Whh = self.params['Whh']
    bhh = self.params['bhh']
    Who = self.params['Who']
    bho = self.params['bho']

    # Forward prop
    for t in xrange(T):
        for k in xrange(hps.hidden_layers):
            if t == 0:
                hprev = h0[k]
            else:
                hprev = hs[k][t-1]

            if k == 0:
                us[k][t] = mult(Wih, data[:, t, :]) + bih
            else:
                us[k][t] = mult(self.params['Wh%d' % k], hs[k-1][t])

            if k == hps.recurrent_layer - 1:
                us[k][t] += mult(Whh, hprev) + bhh
                # Clip maximum activation
                mask = us[k][t] < hps.max_act
                us[k][t] = us[k][t] * mask + hps.max_act * (1 - mask)
            elif k != 0:
                us[k][t] += self.params['bh%d' % k]

            hs[k][t] = self.nl(us[k][t])
        probs[t] = softmax(mult(Who, hs[-1][t]) + bho)

    self.last_h = list()
    for k in xrange(hps.hidden_layers):
        self.last_h.append(hs[k][-1])

    if labels is None:
        return None, probs

    probs_neg_log = list()
    dprobs = list()
    for t in xrange(T):
        probs_neg_log.append(as_np(-1 * log(probs[t])))
        dprobs.append(as_np(probs[t].copy()))
    for k in xrange(bsize):
        for t in xrange(len(labels[k])):
            costs[t, k] = probs_neg_log[t][labels[k][t], k]
            dprobs[t][labels[k][t], k] -= 1
    for t in xrange(T):
        dprobs[t] = array(dprobs[t])

    # NOTE Summing costs over time
    # NOTE/FIXME Dividing by T to get a better sense of whether the
    # objective is decreasing; remove for grad checking
    cost = costs.sum() / bsize / float(T)

    if not back:
        return cost, probs

    # Backprop
    for k in self.grads:
        self.grads[k][:] = 0

    for t in reversed(xrange(T)):
        self.grads['bho'] += dprobs[t][:, :].sum(axis=-1).reshape((-1, 1)) / bsize
        self.grads['Who'] += mult(dprobs[t], hs[-1][t].T) / bsize
        for k in reversed(xrange(hps.hidden_layers)):
            if k == hps.hidden_layers - 1:
                dhs[k][t] += mult(Who.T, dprobs[t])
            else:
                dhs[k][t] += mult(self.params['Wh%d' % (k+1)].T, dhs[k+1][t])
            dus[k][t] += get_nl_grad(self.hps.nl, us[k][t]) * dhs[k][t]
            if k > 0:
                self.grads['Wh%d' % k] += mult(dus[k][t], hs[k-1][t].T) / bsize
                self.grads['bh%d' % k] += dus[k][t].sum(axis=-1).reshape((-1, 1)) / bsize
            if k == hps.recurrent_layer - 1:
                if t == 0:
                    hprev = h0[k]
                    self.grads['h0'][:, k] = mult(Whh.T, dus[k][t]).sum(axis=-1) / bsize
                else:
                    hprev = hs[k][t-1]
                    dhs[k][t-1] = mult(Whh.T, dus[k][t])
                self.grads['Whh'] += mult(dus[k][t], hprev.T) / bsize
                self.grads['bhh'] += dus[k][t].sum(axis=-1).reshape((-1, 1)) / bsize
        self.grads['Wih'] += mult(dus[0][t], data[:, t, :].T) / bsize
        self.grads['bih'] += dus[0][t].sum(axis=-1).reshape((-1, 1)) / bsize

    return cost, self.grads
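# Finite-difference spot check for the backprop above. This is a
# hypothetical sketch, not from the source; it assumes self.params and
# self.grads share keys and that as_np/array copy arrays between host and
# device. Per the NOTE/FIXME in cost_and_grad, remove the 1/float(T)
# factor from the cost before running it.
def grad_check(model, data, labels, key, eps=1e-4, n_checks=5):
    _, grads = model.cost_and_grad(data, labels)
    dW = as_np(grads[key])
    W = as_np(model.params[key])
    for _ in xrange(n_checks):
        i = np.random.randint(W.shape[0])
        j = np.random.randint(W.shape[1])
        orig = W[i, j]
        W[i, j] = orig + eps
        model.params[key] = array(W)
        cp, _ = model.cost_and_grad(data, labels, back=False)
        W[i, j] = orig - eps
        model.params[key] = array(W)
        cm, _ = model.cost_and_grad(data, labels, back=False)
        W[i, j] = orig
        model.params[key] = array(W)
        numeric = (cp - cm) / (2 * eps)
        print '%s[%d, %d] analytic %g numeric %g' % (key, i, j, dW[i, j], numeric)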
def cost_and_grad(self, data, labels, back=True):
    hps = self.hps
    grads = self.grads
    # May not be full batch size if at end of dataset
    bsize = data.shape[-1]

    p = ParamStruct(**self.params)

    # Forward prop
    acts = list()
    acts.append(self.nl(mult(p.Wih, data) + p.bih))
    for k in xrange(hps.hidden_layers - 1):
        W = self.params['W%d' % (k+1)]
        b = self.params['b%d' % (k+1)]
        acts.append(self.nl(mult(W, acts[-1]) + b))
    y = mult(p.Who, acts[-1]) + p.bho
    probs = softmax(y)

    if labels is None:
        return None, probs

    # NOTE For more precision, convert to a numpy array early if necessary
    cost_array = np.empty(bsize, dtype=np.float64)
    # Speed things up by doing assignments off GPU
    neg_log_prob = -1 * np.log(as_np(probs))
    for k in xrange(bsize):
        cost_array[k] = neg_log_prob[labels[k], k]
    cost = cost_array.sum() / bsize

    if not back:
        return cost, probs

    # Backprop
    for k in self.grads:
        self.grads[k][:] = 0

    # Do assignments off GPU to speed things up
    dLdy = as_np(probs)  # NOTE This changes probs
    for k in xrange(bsize):
        dLdy[labels[k], k] -= 1
    dLdy = array(dLdy)

    grads['bho'] = dLdy.sum(axis=1).reshape((-1, 1))
    grads['Who'] = mult(dLdy, acts[-1].T)

    Ws = [p.Wih] + [self.params['W%d' % (k+1)]
                    for k in xrange(hps.hidden_layers - 1)] + [p.Who]
    deltas = [dLdy]
    for k in reversed(xrange(hps.hidden_layers - 1)):
        delta = get_nl_grad(self.hps.nl, acts[k+1]) * mult(Ws[k+2].T, deltas[-1])
        deltas.append(delta)
        grads['b%d' % (k+1)] = delta.sum(axis=1).reshape((-1, 1))
        grads['W%d' % (k+1)] = mult(delta, acts[k].T)
    delta = get_nl_grad(self.hps.nl, acts[0]) * mult(Ws[1].T, deltas[-1])
    grads['bih'] = delta.sum(axis=1).reshape((-1, 1))
    grads['Wih'] = mult(delta, data.T)

    # Normalize
    for k in self.grads:
        self.grads[k] /= bsize

    return cost, self.grads
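# Standalone numpy sanity check (not from the source) of the dLdy shortcut
# above: the gradient of softmax cross-entropy with respect to the
# pre-softmax outputs y is softmax(y) with 1 subtracted at each label row.
# Toy setup: 3 classes, batch of 2, classes along rows as in the code above.
import numpy as np

y = np.array([[2.0, 0.5],
              [1.0, 1.5],
              [0.1, 0.3]])
labels = [0, 2]

def ce_loss(z):
    p = np.exp(z) / np.exp(z).sum(axis=0)
    return sum(-np.log(p[lab, k]) for k, lab in enumerate(labels))

probs = np.exp(y) / np.exp(y).sum(axis=0)
analytic = probs.copy()
for k, lab in enumerate(labels):
    analytic[lab, k] -= 1

eps = 1e-6
numeric = np.zeros_like(y)
for idx in np.ndindex(*y.shape):
    yp, ym = y.copy(), y.copy()
    yp[idx] += eps
    ym[idx] -= eps
    numeric[idx] = (ce_loss(yp) - ce_loss(ym)) / (2 * eps)

print np.max(np.abs(analytic - numeric))  # small, ~1e-10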
def to_file(self, fout):
    logger.info('Saving state')
    pickle.dump([as_np(self.params[k]) for k in self.param_keys], fout)
    self.opt.to_file(fout)
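# Hypothetical mirror of to_file above, sketching the model.from_file the
# evaluation scripts below rely on (not from the source): read the
# parameter list back in param_keys order, push each array onto the
# device, then restore optimizer state.
def from_file(self, fin):
    logger.info('Loading state')
    loaded = pickle.load(fin)
    for k, v in zip(self.param_keys, loaded):
        self.params[k] = array(v)
    self.opt.from_file(fin)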
args = parser.parse_args()
cfg = load_config(args.cfg_file)
model_hps = NCLMHyperparams()
opt_hps = OptimizerHyperparams()
model_hps.set_from_dict(cfg)
opt_hps.set_from_dict(cfg)
cfg = CfgStruct(**cfg)

# Load dataset
#dataset = BrownCorpus(model_hps.context_size, model_hps.batch_size, subset='dev')
dataset = CharCorpus(model_hps.context_size, model_hps.batch_size, subset='dev')

# Construct network
model = NCLM(dataset, model_hps, opt_hps, train=False, opt='nag')

# Load parameters
with open(pjoin(os.path.dirname(args.cfg_file), 'params.pk'), 'rb') as fin:
    model.from_file(fin)

embeddings = as_np(model.params['C']).T
# NOTE Normalizing
embeddings = embeddings / np.sqrt(
    np.sum(np.square(embeddings), axis=1)).reshape((-1, 1))
tree = KDTree(embeddings, leaf_size=30, metric='euclidean')

#query = embeddings[model.dset.word_inds['king'], :]
query = embeddings[model.dset.char_inds['e'], :]  # PARAM
dists, inds = tree.query(query, k=10)
for dist, ind in zip(dists.ravel(), inds.ravel()):
    #print model.dset.words[ind], dist
    print model.dset.chars[ind], dist
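# Since the embeddings were L2-normalized above, Euclidean neighbors are
# also cosine neighbors: for unit vectors ||a - b||^2 = 2 - 2 a.b, so both
# metrics induce the same ranking. Quick standalone confirmation:
a = np.random.randn(5)
a /= np.linalg.norm(a)
b = np.random.randn(5)
b /= np.linalg.norm(b)
print np.linalg.norm(a - b) ** 2, 2 - 2 * np.dot(a, b)  # equal up to fp error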
else:
    params_file = pjoin(os.path.dirname(args.cfg_file), 'params.pk')
    logger.info('Loading params from %s' % params_file)
    with open(params_file, 'rb') as fin:
        model.from_file(fin)

likelihoods = None
labels = None
it = 0
while dataset.data_left():
    cost, probs = model.run(back=False)
    if MODEL_TYPE == 'rnn':
        llt = np.zeros((probs[0].shape[0], len(probs), probs[0].shape[1]))
        for t in xrange(len(probs)):
            llt[:, t, :] = as_np(probs[t])
        # Deal with sequences in batch being of different lengths
        ll = llt[:, 0:len(model.dset.batch_labels[0]), 0].reshape((llt.shape[0], -1))
        j = 1
        for sl in model.dset.batch_labels[1:]:
            ll = np.hstack((ll, llt[:, 0:len(sl), j].reshape(llt.shape[0], -1)))
            j += 1
        y = np.array([i for sl in model.dset.batch_labels for i in sl])
    else:
        ll = as_np(probs)
        y = as_np(model.dset.batch_labels)
    if likelihoods is None:
        likelihoods = ll
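# Hypothetical continuation (a sketch, not from the source): assuming the
# elided tail of the loop hstacks each batch into likelihoods (classes x N)
# and labels (length N), per-character perplexity follows directly.
inds = np.arange(likelihoods.shape[1])
nll = -np.log(likelihoods[labels.astype(int), inds])
print 'mean NLL %f, perplexity %f' % (nll.mean(), np.exp(nll.mean()))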