# Imports assumed by the snippets below (from the surrounding codebase):
import numpy as np
from scipy.special import logsumexp  # older SciPy exposed this as scipy.misc.logsumexp
# Also assumed from the surrounding package: config, instance, iterators,
# progress, vectorizers, listener, and the helpers sample(),
# strip_invalid_tokens(), and beam_search_step() (sketched below).


def predict_and_score(self, eval_instances, random=False, verbosity=0):
    predictions = []
    scores = []
    batches = iterators.iter_batches(eval_instances, self.options.listener_eval_batch_size)
    num_batches = (len(eval_instances) - 1) // self.options.listener_eval_batch_size + 1

    if self.options.verbosity + verbosity >= 2:
        print('Testing')
    progress.start_task('Eval batch', num_batches)
    for batch_num, batch in enumerate(batches):
        progress.progress(batch_num)
        batch = list(batch)

        xs, (y,) = self._data_to_arrays(batch, test=True)

        probs = self.model.predict(xs)
        if random:
            indices = sample(probs)
            predictions.extend(indices)
        else:
            predictions.extend(probs.argmax(axis=1))
        scores_arr = np.log(probs[np.arange(len(batch)), y])
        scores.extend(scores_arr.tolist())
    progress.end_task()
    if self.options.verbosity >= 9:
        # The format arguments belong inside the print call; applying % to
        # print's return value was a bug.
        print('%s %ss:' % (self.id, 'sample' if random else 'prediction'))
        for inst, prediction in zip(eval_instances, predictions):
            print('%s -> %s' % (repr(inst.input), repr(prediction)))

    return predictions, scores
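# The sample() helper used throughout these functions is defined elsewhere in
# the codebase. A minimal sketch of the behavior the call sites assume: draw
# one index from a 1-D probability vector, or one index per distribution along
# the last axis of a higher-dimensional array. Illustrative only; the actual
# helper may differ.
def sample(probs):
    probs = np.asarray(probs, dtype=float)
    if probs.ndim == 1:
        # Single distribution: return one sampled index.
        return np.random.choice(len(probs), p=probs / probs.sum())
    # One distribution along the last axis for every leading index:
    # inverse-CDF sampling, vectorized over all leading dimensions.
    cumulative = np.cumsum(probs, axis=-1)
    thresholds = np.random.rand(*(probs.shape[:-1] + (1,))) * cumulative[..., -1:]
    return np.argmax(cumulative >= thresholds, axis=-1)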
def predict(self, eval_instances, random=False, verbosity=0):
    result = []
    batches = iterators.iter_batches(eval_instances, self.options.speaker_eval_batch_size)
    num_batches = (len(eval_instances) - 1) // self.options.speaker_eval_batch_size + 1
    eos_index = self.seq_vec.vectorize(['</s>'])[0]

    if self.options.verbosity + verbosity >= 2:
        print('Predicting')
    if self.options.verbosity + verbosity >= 1:
        progress.start_task('Predict batch', num_batches)
    for batch_num, batch in enumerate(batches):
        if self.options.verbosity + verbosity >= 1:
            progress.progress(batch_num)
        batch = list(batch)

        (c, _p, mask), (_y,) = self._data_to_arrays(batch, test=True)
        assert mask.all()  # We shouldn't be masking anything in prediction

        beam_size = 1 if random else self.options.speaker_beam_size
        done = np.zeros((len(batch), beam_size), dtype=bool)  # np.bool was removed in newer NumPy
        beam = np.zeros((len(batch), beam_size, self.seq_vec.max_len), dtype=np.int32)
        beam[:, :, 0] = self.seq_vec.vectorize(['<s>'])[0]
        # All hypotheses start at log prob -inf except the first in each beam.
        # (np.full avoids the divide-by-zero warning from np.log(np.zeros(...)).)
        beam_scores = np.full((len(batch), beam_size), -np.inf)
        beam_scores[:, 0] = 0.0
        c = np.repeat(c, beam_size, axis=0)
        mask = np.repeat(mask, beam_size, axis=0)

        for length in range(1, self.seq_vec.max_len):
            if done.all():
                break
            p = beam.reshape((beam.shape[0] * beam.shape[1], beam.shape[2]))[:, :-1]
            probs = self.model.predict([c, p, mask])
            if random:
                indices = sample(probs[:, length - 1, :])
                beam[:, 0, length] = indices
                # [:, np.newaxis] keeps shapes aligned: done is (batch, 1) here,
                # indices is (batch,); without it the logical_or broadcasts to
                # (batch, batch).
                done = np.logical_or(done, (indices == eos_index)[:, np.newaxis])
            else:
                assert probs.shape[1] == p.shape[1], (probs.shape[1], p.shape[1])
                assert probs.shape[2] == len(self.seq_vec.tokens), \
                    (probs.shape[2], len(self.seq_vec.tokens))
                scores = np.log(probs)[:, length - 1, :].reshape(
                    (beam.shape[0], beam.shape[1], probs.shape[2]))
                beam_search_step(scores, length, beam, beam_scores, done, eos_index)
        outputs = self.seq_vec.unvectorize_all(beam[:, 0, :])
        result.extend([' '.join(strip_invalid_tokens(o)) for o in outputs])
    if self.options.verbosity + verbosity >= 1:
        progress.end_task()
    return result
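# beam_search_step() is defined elsewhere in the codebase. Below is a plausible
# NumPy sketch of one step of batched beam search, consistent with the shapes
# used above (scores: (batch, beam, vocab) next-token log probs; beam,
# beam_scores, and done are updated in place). Illustrative only; the repo's
# actual implementation may differ.
def beam_search_step(scores, length, beam, beam_scores, done, eos_index):
    batch_size, beam_size, vocab_size = scores.shape
    # Finished hypotheses must not grow or change score: force them to
    # "continue" with </s> at log prob 0 and every other token at -inf.
    scores = scores.copy()
    scores[done, :] = -np.inf
    scores[done, eos_index] = 0.0
    # Total score of every (hypothesis, next token) pair, flattened per example.
    cand = (beam_scores[:, :, np.newaxis] + scores).reshape(batch_size, -1)
    best = np.argsort(-cand, axis=1)[:, :beam_size]  # top beam_size continuations
    src = best // vocab_size   # which hypothesis each continuation extends
    tok = best % vocab_size    # which token it appends
    rows = np.arange(batch_size)[:, np.newaxis]
    beam[:] = beam[rows, src]  # reorder hypotheses to match the survivors
    beam[:, :, length] = tok
    beam_scores[:] = cand[rows, best]
    done[:] = np.logical_or(done[rows, src], tok == eos_index)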
def sample(self, num_samples=1):
    indices = np.array([[sample(self.counts.get_value() * 1.0 / self.total.get_value())
                         for _t in range(self.vec.max_len)]
                        for _s in range(num_samples)], dtype=np.int32)
    return [instance.Instance(' '.join(strip_invalid_tokens(s)))
            for s in self.vec.unvectorize_all(indices)]
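# strip_invalid_tokens() is also defined elsewhere. A minimal sketch of the
# behavior the call sites assume: drop the start-of-sequence marker and
# truncate at the first end-of-sequence token, so that sampled padding past
# </s> is discarded. Illustrative; the actual helper may filter more symbols.
def strip_invalid_tokens(tokens):
    result = []
    for token in tokens:
        if token == '</s>':
            break  # everything after the first </s> is padding
        if token != '<s>':
            result.append(token)
    return result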
def predict_and_score(self, eval_instances, random=False, verbosity=0):
    options = config.options()
    predictions = []
    scores = []

    all_utts = self.base.seq_vec.tokens
    sym_vec = vectorizers.SymbolVectorizer()
    sym_vec.add_all(all_utts)
    prior_scores = self.prior_scores(all_utts)

    base_is_listener = (type(self.base) in listener.LISTENERS.values())

    # Integer division (each instance expands to len(all_utts) grid entries);
    # the max(..., 1) guard matches the L(S(L)) version below.
    true_batch_size = max(options.listener_eval_batch_size // len(all_utts), 1)
    batches = iterators.iter_batches(eval_instances, true_batch_size)
    num_batches = (len(eval_instances) - 1) // true_batch_size + 1

    if options.verbosity + verbosity >= 2:
        print('Testing')
    progress.start_task('Eval batch', num_batches)
    for batch_num, batch in enumerate(batches):
        progress.progress(batch_num)
        batch = list(batch)
        context = len(batch[0].alt_inputs) if batch[0].alt_inputs is not None else 0
        if context:
            output_grid = [(instance.Instance(utt, color)
                            if base_is_listener
                            else instance.Instance(color, utt))
                           for inst in batch
                           for color in inst.alt_inputs
                           for utt in sym_vec.tokens]
            assert len(output_grid) == context * len(batch) * len(all_utts), \
                'Context must be the same number of colors for all examples'
            true_indices = np.array([inst.input for inst in batch])
        else:
            output_grid = [(instance.Instance(utt, inst.input)
                            if base_is_listener
                            else instance.Instance(inst.input, utt))
                           for inst in batch
                           for utt in sym_vec.tokens]
            true_indices = sym_vec.vectorize_all([inst.input for inst in batch])
            if len(true_indices.shape) == 2:
                # Sequence vectorizer; we're only using single tokens for now.
                true_indices = true_indices[:, 0]
        # Use a separate name here: rebinding `scores` would clobber the
        # accumulator initialized above and corrupt the returned scores.
        grid_scores = self.base.score(output_grid, verbosity=verbosity)
        if context:
            log_probs = np.array(grid_scores).reshape((len(batch), context, len(all_utts)))
            orig_log_probs = log_probs[np.arange(len(batch)), true_indices, :]
            # Renormalize over only the context colors, and extract the score of
            # the true color.
            log_probs -= logsumexp(log_probs, axis=1)[:, np.newaxis, :]
            log_probs = log_probs[np.arange(len(batch)), true_indices, :]
        else:
            log_probs = np.array(grid_scores).reshape((len(batch), len(all_utts)))
            orig_log_probs = log_probs
        assert log_probs.shape == (len(batch), len(all_utts))
        # Add in the prior scores, if used (S1 \propto L0 * P)
        if prior_scores is not None:
            log_probs = log_probs + 0.5 * prior_scores
        if options.exhaustive_base_weight:
            w = options.exhaustive_base_weight
            log_probs = w * orig_log_probs + (1.0 - w) * log_probs
        # Normalize across utterances. Note that the listener returns probability
        # densities over colors.
        log_probs -= logsumexp(log_probs, axis=1)[:, np.newaxis]
        if random:
            pred_indices = sample(np.exp(log_probs))
        else:
            pred_indices = np.argmax(log_probs, axis=1)
        predictions.extend(sym_vec.unvectorize_all(pred_indices))
        scores.extend(log_probs[np.arange(len(batch)), true_indices].tolist())
    progress.end_task()

    return predictions, scores
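# A tiny numeric check of the context renormalization used above, assuming
# logsumexp comes from scipy.special (an assumption; older code imported it
# from scipy.misc). One example, 2 context colors, 3 utterances: subtracting
# logsumexp over axis 1 turns each utterance column into a proper distribution
# over the context colors.
def _demo_context_renormalization():
    log_probs = np.log(np.array([[[0.2, 0.5, 0.1],
                                  [0.1, 0.3, 0.3]]]))  # (batch=1, context=2, utts=3)
    log_probs -= logsumexp(log_probs, axis=1)[:, np.newaxis, :]
    # Each utterance column now sums to 1 over the context colors.
    assert np.allclose(np.exp(log_probs).sum(axis=1), 1.0)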
def predict_and_score(self, eval_instances, random=False, verbosity=0):
    options = self.get_options()
    predictions = []
    scores = []

    if options.verbosity + verbosity >= 2:
        print('Building alternative utterance list')
    sym_vec = vectorizers.SymbolVectorizer()
    sym_vec.add_all([inst.input for inst in self.get_dataset(self.base)])

    assert eval_instances[0].alt_outputs, \
        'Context required for L(S(L)): %s' % eval_instances[0].__dict__
    context_len = len(eval_instances[0].alt_outputs)
    if options.exhaustive_num_samples > 0:
        num_alt_utts = options.exhaustive_num_samples * context_len + 1
        num_sample_sets = options.exhaustive_num_sample_sets
    else:
        num_alt_utts = len(sym_vec.tokens) + 1
        num_sample_sets = 1
    # Integer division: each instance expands to a full grid of entries.
    true_batch_size = max(options.listener_eval_batch_size //
                          (num_alt_utts * num_sample_sets * context_len), 1)
    batches = iterators.iter_batches(eval_instances, true_batch_size)
    num_batches = (len(eval_instances) - 1) // true_batch_size + 1

    if options.exhaustive_output_speaker_samples:
        self.truncate_utterances_files('s1_samples.%s.jsons', num_sample_sets)
    if options.exhaustive_output_speaker_predictions:
        self.truncate_utterances_files('s1_predictions.%s.jsons', num_sample_sets)
    if options.exhaustive_output_all_grids:
        self.truncate_utterances_files('grids.%s.jsons.gz', 1)

    if options.verbosity + verbosity >= 2:
        print('Testing')
    progress.start_task('Eval batch', num_batches)
    for batch_num, batch in enumerate(batches):
        progress.progress(batch_num)
        batch = list(batch)

        output_grid = self.build_grid(batch, sym_vec.tokens)
        assert len(output_grid) == len(batch) * num_sample_sets * context_len * num_alt_utts, \
            'Context must be the same number of colors for all examples %s' % \
            ((len(output_grid), len(batch), num_sample_sets, context_len, num_alt_utts),)
        true_indices = np.array([inst.output for inst in batch])

        grid_scores = self.base.score(output_grid, verbosity=verbosity)
        l0_log_probs = np.array(grid_scores).reshape(
            (len(batch), num_sample_sets, context_len, num_alt_utts))
        # Renormalize over only the context colors, and extract the score of
        # the true color according to the base model.
        l0_log_probs -= logsumexp(l0_log_probs, axis=2)[:, :, np.newaxis, :]
        assert l0_log_probs.shape == (len(batch), num_sample_sets,
                                      context_len, num_alt_utts), l0_log_probs.shape
        orig_log_probs = l0_log_probs[np.arange(len(batch)), 0, :, 0]
        assert orig_log_probs.shape == (len(batch), context_len), orig_log_probs.shape
        # Apply temperature parameter before speaker.
        utilities = options.exhaustive_inv_temperature * l0_log_probs
        # Normalize across utterances. Note that the listener returns probability
        # densities over colors.
        s1_log_probs = utilities - logsumexp(utilities, axis=3)[:, :, :, np.newaxis]
        assert s1_log_probs.shape == (len(batch), num_sample_sets,
                                      context_len, num_alt_utts), s1_log_probs.shape

        if options.exhaustive_output_speaker_samples or \
                options.exhaustive_output_speaker_predictions:
            speaker_dist = s1_log_probs[np.arange(len(batch)), :, true_indices, 1:]
            if options.exhaustive_output_speaker_samples:
                speaker_sample_indices = sample(np.exp(speaker_dist))
                self.write_speaker_utterances('s1_samples.%s.jsons', output_grid,
                                              speaker_sample_indices, l0_log_probs.shape)
            if options.exhaustive_output_speaker_predictions:
                speaker_pred_indices = np.argmax(speaker_dist, axis=2)
                self.write_speaker_utterances('s1_predictions.%s.jsons', output_grid,
                                              speaker_pred_indices, l0_log_probs.shape)

        # Normalize again across context colors.
        l2_log_probs = s1_log_probs - logsumexp(s1_log_probs, axis=2)[:, :, np.newaxis, :]
        assert l2_log_probs.shape == (len(batch), num_sample_sets,
                                      context_len, num_alt_utts), l2_log_probs.shape
        # Extract the score of each color for the input utterance according to
        # the L2 model.
        log_probs = l2_log_probs[np.arange(len(batch)), :, :, 0]
        assert log_probs.shape == (len(batch), num_sample_sets, context_len), log_probs.shape
        # Blend L0 and L2 (if enabled) to produce final score.
        if options.exhaustive_base_weight:
            w = options.exhaustive_base_weight
            # Clip log probs at -52.0 (prob ~= 3e-23): zero probabilities used to
            # give a harmless -inf, but with w < 0 they would produce NaNs.
            log_probs = (w * np.maximum(orig_log_probs[:, np.newaxis, :], -52.0) +
                         (1.0 - w) * np.maximum(log_probs, -52.0))
            # Normalize across context one more time to prevent cheating when
            # blending.
            log_probs -= logsumexp(log_probs, axis=2)[:, :, np.newaxis]
        # Average (in probability space) over sample sets.
        log_probs = logsumexp(log_probs, axis=1) - np.log(log_probs.shape[1])

        if options.exhaustive_output_all_grids:
            self.write_grids(output_grid, l0_log_probs, s1_log_probs,
                             l2_log_probs, log_probs)

        if random:
            pred_indices = sample(np.exp(log_probs))
        else:
            pred_indices = np.argmax(log_probs, axis=1)
        predictions.extend(pred_indices)
        # Extract the score of the true color according to the combined model.
        scores.extend(log_probs[np.arange(len(batch)), true_indices].tolist())
    progress.end_task()

    return predictions, scores
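# The "average in probability space over sample sets" step above is a stable
# computation of log(mean(exp(log_probs), axis=1)). A tiny check of that
# equivalence, again assuming scipy.special.logsumexp:
def _demo_sample_set_average():
    # (batch=4, sets=2, context=3) normalized distributions over context.
    log_probs = np.log(np.random.dirichlet(np.ones(3), size=(4, 2)))
    averaged = logsumexp(log_probs, axis=1) - np.log(log_probs.shape[1])
    assert np.allclose(np.exp(averaged), np.exp(log_probs).mean(axis=1))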