def _test(self, loader, **kwargs):
    """Test an epoch of data using either the input loader or using `tf.dataset`

    In non-`tf.dataset` mode, we cycle the loader data feed, and pull a batch and feed it to the feed dict
    When we use `tf.dataset`s under the hood, this function simply uses the loader to know how many steps to test.

    :param loader: A data feed
    :param kwargs: See below

    :Keyword Arguments:
        * *dataset* (`bool`) Set to `True` if using `tf.dataset`s, defaults to `True`
        * *reporting_fns* (`list`) A list of reporting hooks to use
        * *verbose* (`dict`) A dictionary containing `console` boolean and `file` name if on

    :return: Metrics
    """
    if self.ema:
        self.sess.run(self.ema_load)

    use_dataset = kwargs.get('dataset', True)

    cm = ConfusionMatrix(self.model.labels)
    steps = len(loader)
    total_loss = 0
    total_norm = 0
    verbose = kwargs.get("verbose", None)

    pg = create_progress_bar(steps)
    for i, batch_dict in enumerate(pg(loader)):
        y = batch_dict['y']
        if use_dataset:
            guess, lossv = self.sess.run([self.model.best, self.test_loss])
        else:
            feed_dict = self.model.make_input(batch_dict, False)
            guess, lossv = self.sess.run([self.model.best, self.test_loss], feed_dict=feed_dict)
        batchsz = len(guess)
        total_loss += lossv * batchsz
        total_norm += batchsz
        cm.add_batch(y, guess)

    metrics = cm.get_all_metrics()
    metrics['avg_loss'] = total_loss / float(total_norm)
    verbose_output(verbose, cm)

    return metrics
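
# Why the loss above is weighted by batch size before averaging: the last
# batch is often smaller, so a plain mean over per-batch losses would
# overweight it. A self-contained illustration:
losses = [0.5, 0.4, 1.0]   # per-batch mean losses
sizes = [32, 32, 8]        # the final batch is partial
weighted = sum(l * n for l, n in zip(losses, sizes)) / float(sum(sizes))  # ~0.511
unweighted = sum(losses) / len(losses)                                    # ~0.633
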
def _test(self, ts, **kwargs):
    self.model.eval()
    total_sum = 0
    total_correct = 0
    gold_spans = []
    pred_spans = []
    cm = ConfusionMatrix(self.idx2classlabel)
    metrics = {}
    steps = len(ts)
    conll_output = kwargs.get('conll_output', None)
    txts = kwargs.get('txts', None)
    handle = None
    if conll_output is not None and txts is not None:
        handle = open(conll_output, "w")

    pg = create_progress_bar(steps)
    for batch_dict in pg(ts):
        inputs = self.model.make_input(batch_dict)
        y = inputs.pop('y')
        lengths = inputs['lengths']
        ids = inputs['ids']
        class_labels = inputs["class_label"]
        with torch.no_grad():
            class_pred, pred = self.model(inputs)
        correct, count, golds, guesses = self.process_output(pred, y.data, lengths, ids, handle, txts)
        total_correct += correct
        total_sum += count
        gold_spans.extend(golds)
        pred_spans.extend(guesses)
        _add_to_cm(cm, class_labels, class_pred)

    total_acc = total_correct / float(total_sum)
    metrics['tagging_acc'] = total_acc
    metrics['tagging_f1'] = span_f1(gold_spans, pred_spans)
    metrics.update({f"classification_{k}": v for k, v in cm.get_all_metrics().items()})
    if self.verbose:
        # TODO: Add programmatic access to these metrics?
        conll_metrics = per_entity_f1(gold_spans, pred_spans)
        conll_metrics['acc'] = total_acc * 100
        conll_metrics['tokens'] = total_sum.item()
        logger.info(conlleval_output(conll_metrics))
    if handle is not None:
        handle.close()
    return metrics
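
# NOTE: `_add_to_cm` is used above but not defined in this section. This is
# a hedged sketch of what such a helper likely does, assuming `pred` holds
# per-class scores and `ConfusionMatrix.add_batch` takes parallel arrays of
# gold and guessed label indices (both assumptions, not confirmed here):
import torch

def _add_to_cm(cm, y, pred):
    # Collapse per-class scores to a predicted label index per example
    _, best = torch.max(pred, dim=1)
    cm.add_batch(y.cpu().numpy(), best.cpu().numpy())
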
def _test(self, loader, steps=0, **kwargs):
    """Test an epoch of data using either the input loader or using `tf.dataset`

    In non-`tf.dataset` mode, we cycle the loader data feed, and pull a batch and feed it to the feed dict
    When we use `tf.dataset`s under the hood, this function simply uses the loader to know how many steps to test.

    :param loader: A data feed
    :param steps: (`int`) The number of steps in the epoch, used to size the progress bar
    :param kwargs: See below

    :Keyword Arguments:
        * *dataset* (`bool`) Set to `True` if using `tf.dataset`s, defaults to `True`
        * *reporting_fns* (`list`) A list of reporting hooks to use
        * *verbose* (`dict`) A dictionary containing `console` boolean and `file` name if on

    :return: Metrics
    """
    cm = ConfusionMatrix(self.model.labels)
    total_loss = 0
    total_norm = 0
    verbose = kwargs.get("verbose", None)

    pg = create_progress_bar(steps)
    SET_TRAIN_FLAG(False)
    for features, y in pg(loader):
        logits = self.model(features)
        y_ = tf.argmax(logits, axis=1, output_type=tf.int32)
        lossv = tf.compat.v1.losses.sparse_softmax_cross_entropy(labels=y, logits=logits).numpy()
        batchsz = int(y.shape[0])
        assert len(y_) == batchsz
        total_loss += lossv * batchsz
        total_norm += batchsz
        cm.add_batch(y, y_)

    metrics = cm.get_all_metrics()
    metrics['avg_loss'] = total_loss / float(total_norm)
    verbose_output(verbose, cm)

    return metrics
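
# A tiny eager-mode check of the prediction step above: argmax over the
# class axis yields int32 label indices aligned with `y`:
import tensorflow as tf

logits = tf.constant([[0.1, 2.0], [3.0, -1.0]])
y_ = tf.argmax(logits, axis=1, output_type=tf.int32)  # -> [1, 0]
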
def _test(self, loader, **kwargs):
    self.model.eval()
    total_loss = 0
    total_norm = 0
    steps = len(loader)
    pg = create_progress_bar(steps)
    cm = ConfusionMatrix(self.labels)
    verbose = kwargs.get("verbose", None)
    output = kwargs.get('output')
    txts = kwargs.get('txts')
    handle = None
    line_number = 0
    if output is not None and txts is not None:
        handle = open(output, "w")

    with torch.no_grad():
        for batch_dict in pg(loader):
            example = self._make_input(batch_dict)
            ys = example.pop('y')
            pred = self.model(example)
            loss = self.crit(pred, ys)
            if handle is not None:
                for p, y in zip(pred, ys):
                    handle.write('{}\t{}\t{}\n'.format(" ".join(txts[line_number]), self.model.labels[p], self.model.labels[y]))
                    line_number += 1
            batchsz = self._get_batchsz(batch_dict)
            total_loss += loss.item() * batchsz
            total_norm += batchsz
            _add_to_cm(cm, ys, pred)

    metrics = cm.get_all_metrics()
    metrics['avg_loss'] = total_loss / float(total_norm)
    verbose_output(verbose, cm)
    if handle is not None:
        handle.close()
    return metrics
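
# The optional `output` file written above is a three-column TSV per example:
# "<space-joined text>\t<guess label>\t<gold label>". A hedged sketch for
# inspecting misclassifications offline (the file name "preds.tsv" is
# illustrative, not from this code):
with open("preds.tsv") as f:
    rows = [line.rstrip("\n").split("\t") for line in f]
errors = [(txt, guess, gold) for txt, guess, gold in rows if guess != gold]
print('{}/{} errors'.format(len(errors), len(rows)))
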
def _train(self, loader, **kwargs):
    self.model.train()
    reporting_fns = kwargs.get('reporting_fns', [])
    steps = len(loader)
    pg = create_progress_bar(steps)
    cm = ConfusionMatrix(self.labels)
    epoch_loss = 0
    epoch_div = 0
    for batch_dict in pg(loader):
        self.optimizer.zero_grad()
        example = self._make_input(batch_dict)
        y = example.pop('y')
        pred = self.model(example)
        loss = self.crit(pred, y)
        batchsz = self._get_batchsz(batch_dict)
        report_loss = loss.item() * batchsz
        epoch_loss += report_loss
        epoch_div += batchsz
        self.nstep_agg += report_loss
        self.nstep_div += batchsz
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip)
        _add_to_cm(cm, y, pred)
        self.optimizer.step()

        if (self.optimizer.global_step + 1) % self.nsteps == 0:
            metrics = self.calc_metrics(self.nstep_agg, self.nstep_div)
            metrics['lr'] = self.optimizer.current_lr
            self.report(
                self.optimizer.global_step + 1, metrics, self.nstep_start,
                'Train', 'STEP', reporting_fns, self.nsteps
            )
            self.reset_nstep()

    metrics = cm.get_all_metrics()
    metrics['lr'] = self.optimizer.current_lr
    metrics['avg_loss'] = epoch_loss / float(epoch_div)
    return metrics
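
# NOTE: `calc_metrics`, `reset_nstep`, `nstep_agg`, `nstep_div` and
# `nstep_start` are referenced above but defined elsewhere. A hedged sketch
# consistent with how they are used: a running loss average over the last
# `nsteps` updates, reset after each report (assumes `import time`):
def calc_metrics(self, agg, norm):
    return {'avg_loss': agg / float(norm)}

def reset_nstep(self):
    self.nstep_agg = 0
    self.nstep_div = 0
    self.nstep_start = time.time()
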
# NOTE: the body of `train_step` was truncated in the source; the lines
# above `return loss_value` are a hedged reconstruction of a standard
# eager-mode step (GradientTape + apply_gradients), using the same loss as
# the evaluation code in this section:
def train_step(optimizer, model, x, y):
    with tf.GradientTape() as tape:
        logits = model(x)
        loss_value = tf.compat.v1.losses.sparse_softmax_cross_entropy(labels=y, logits=logits)
    grads = tape.gradient(loss_value, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return loss_value


for epoch in range(num_epochs):
    loss_acc = 0.
    step = 0
    start = time.time()
    for x, y in train_set.get_input(training=True):
        loss_value = train_step(optimizer, model, x, y)
        loss_acc += loss_value
        step += 1
    print('training time {}'.format(time.time() - start))
    mean_loss = loss_acc / step
    print('Training Loss {}'.format(mean_loss))

    cm = ConfusionMatrix(['0', '1'])
    for x, y in valid_set.get_input():
        y_ = np.argmax(to_device(model(x)), axis=1)
        cm.add_batch(y, y_)
    print(cm)
    print(cm.get_all_metrics())

print('FINAL')
cm = ConfusionMatrix(['0', '1'])
for x, y in test_set.get_input():
    y_ = tf.argmax(to_device(model(x)), axis=1, output_type=tf.int32)
    cm.add_batch(y, y_)
print(cm)
print(cm.get_all_metrics())
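
# A toy, self-contained ConfusionMatrix check mirroring the evaluation loops
# above (exact metric keys depend on the ConfusionMatrix implementation):
cm = ConfusionMatrix(['0', '1'])
cm.add_batch([0, 1, 1, 0], [0, 1, 0, 0])  # one guess is wrong
print(cm)                    # 2x2 table of counts
print(cm.get_all_metrics())  # e.g. accuracy and per-class P/R/F1
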
class JointTaggerEvaluatorEagerTf:
    """Performs evaluation on joint tagger and classifier output"""

    def __init__(self, model, span_type, verbose):
        """Construct from an existing model

        :param model: A model
        :param span_type: (`str`) The span type
        :param verbose: (`bool`) Be verbose?
        """
        self.model = model
        self.idx2label = revlut(model.labels["tags"])
        self.idx2classlabel = revlut(model.labels["class_labels"])
        self.cm = None
        self.span_type = span_type
        if verbose:
            print('Setting span type {}'.format(self.span_type))
        self.verbose = verbose

    def process_batch(self, batch, truth, handle=None, txts=None, ids=None):
        class_guess_logits, guess = self.model(batch)
        sentence_lengths = batch['lengths']
        true_class_labels = batch['class_label']

        correct_labels = 0
        total_labels = 0

        # For fscore
        gold_chunks = []
        pred_chunks = []

        actual_class_labels = []
        predicted_class_labels = []

        # For each sentence
        for b in range(len(guess)):
            length = sentence_lengths[b]
            sentence = guess[b][:length].numpy()
            # truth[b] is padded, cutting at :length gives us back true length
            gold = truth[b][:length].numpy()
            actual_class_labels.append(true_class_labels[b].numpy())
            predicted_class_labels.append(tf.argmax(class_guess_logits[b], axis=0, output_type=tf.int32))

            valid_guess = sentence[gold != Offsets.PAD]
            valid_gold = gold[gold != Offsets.PAD]
            valid_sentence_length = np.sum(gold != Offsets.PAD)
            correct_labels += np.sum(np.equal(valid_guess, valid_gold))
            total_labels += valid_sentence_length

            gold_chunks.append(set(to_spans(valid_gold, self.idx2label, self.span_type, self.verbose)))
            pred_chunks.append(set(to_spans(valid_guess, self.idx2label, self.span_type, self.verbose)))

            if not (handle is None or txts is None):
                example_id = ids[b]
                example_txt = txts[example_id]
                write_sentence_conll(handle, valid_guess, valid_gold, example_txt, self.idx2label)

        self.cm.add_batch(actual_class_labels, predicted_class_labels)
        return correct_labels, total_labels, gold_chunks, pred_chunks

    def test(self, ts, steps=0, **kwargs):
        """Method that evaluates on some data.  There are 2 modes this can run in, `feed_dict` and `dataset`

        In `feed_dict` mode, the model cycles the test data batch-wise and feeds each batch in with a `feed_dict`.
        In `dataset` mode, the data is still passed in to this method, but it is not passed in a `feed_dict` and is
        mostly superfluous since the features are grafted right onto the graph.  However, we do use it for supplying
        the ground truth, ids and text, so it is essential that the caller does not shuffle the data

        :param ts: The test set
        :param conll_output: (`str`) An optional file output
        :param txts: A list of text data associated with the encoded batch
        :param dataset: (`bool`) Is this using `tf.dataset`s
        :return: The metrics
        """
        SET_TRAIN_FLAG(False)
        total_correct = total_sum = 0
        gold_spans = []
        pred_spans = []
        self.cm = ConfusionMatrix(self.idx2classlabel)

        handle = None
        if kwargs.get("conll_output") is not None and kwargs.get('txts') is not None:
            handle = open(kwargs.get("conll_output"), "w")

        try:
            pg = create_progress_bar(steps)
            metrics = {}
            for (features, y), batch in pg(zip_longest(ts, kwargs.get('batches', []), fillvalue={})):
                correct, count, golds, guesses = self.process_batch(
                    features, y, handle=handle, txts=kwargs.get("txts"), ids=batch.get("ids"))
                total_correct += correct
                total_sum += count
                gold_spans.extend(golds)
                pred_spans.extend(guesses)

            total_acc = total_correct / float(total_sum)
            # Only show the fscore if requested
            metrics['tagging_f1'] = span_f1(gold_spans, pred_spans)
            metrics['tagging_acc'] = total_acc
            metrics.update({f"classification_{k}": v for k, v in self.cm.get_all_metrics().items()})
            if self.verbose:
                conll_metrics = per_entity_f1(gold_spans, pred_spans)
                conll_metrics['acc'] = total_acc * 100
                conll_metrics['tokens'] = total_sum
                logger.info(conlleval_output(conll_metrics))
        finally:
            if handle is not None:
                handle.close()
        return metrics
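
# Hypothetical driver for the evaluator above; `model`, `test_set`,
# `num_steps` and the "iobes" span type are illustrative, not from this
# code:
evaluator = JointTaggerEvaluatorEagerTf(model, span_type="iobes", verbose=True)
metrics = evaluator.test(test_set, steps=num_steps)
print(metrics['tagging_acc'], metrics['tagging_f1'])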